├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── environment.yaml
├── kantts
│   ├── __init__.py
│   ├── bin
│   │   ├── __init__.py
│   │   ├── infer_hifigan.py
│   │   ├── infer_sambert.py
│   │   ├── text_to_wav.py
│   │   ├── train_hifigan.py
│   │   ├── train_sambert.py
│   │   └── train_sybert.py
│   ├── configs
│   │   ├── audio_config_16k.yaml
│   │   ├── audio_config_24k.yaml
│   │   ├── audio_config_48k.yaml
│   │   ├── audio_config_8k.yaml
│   │   ├── audio_config_se_16k.yaml
│   │   ├── hifigan_noncausal_nsf_global_v1_16k.yaml
│   │   ├── hifigan_noncausal_nsf_v1_16k.yaml
│   │   ├── hifigan_noncausal_v1_16k.yaml
│   │   ├── hifigan_v1_16k.yaml
│   │   ├── hifigan_v1_24k.yaml
│   │   ├── hifigan_v1_48k.yaml
│   │   ├── hifigan_v1_8k.yaml
│   │   ├── hifigan_v1_nsf_24k.yaml
│   │   ├── sambert_16k.yaml
│   │   ├── sambert_16k_MAS.yaml
│   │   ├── sambert_16k_MAS_byte.yaml
│   │   ├── sambert_24k.yaml
│   │   ├── sambert_48k.yaml
│   │   ├── sambert_fp_8k.yaml
│   │   ├── sambert_nsf_16k.yaml
│   │   ├── sambert_nsf_24k.yaml
│   │   ├── sambert_se_nsf_global_16k.yaml
│   │   ├── sambert_sichuan_16k.yaml
│   │   └── sybert.yaml
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── data_types.py
│   │   └── dataset.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── hifigan
│   │   │   ├── hifigan.py
│   │   │   └── layers.py
│   │   ├── pqmf.py
│   │   ├── sambert
│   │   │   ├── __init__.py
│   │   │   ├── adaptors.py
│   │   │   ├── alignment.py
│   │   │   ├── attention.py
│   │   │   ├── fsmn.py
│   │   │   ├── kantts_sambert.py
│   │   │   └── positions.py
│   │   └── utils.py
│   ├── preprocess
│   │   ├── __init__.py
│   │   ├── audio_processor
│   │   │   ├── __init__.py
│   │   │   ├── audio_processor.py
│   │   │   └── core
│   │   │       ├── __init__.py
│   │   │       ├── dsp.py
│   │   │       └── utils.py
│   │   ├── data_process.py
│   │   ├── fp_processor.py
│   │   ├── languages
│   │   │   ├── PinYin
│   │   │   │   ├── En2ChPhoneMap.txt
│   │   │   │   ├── PhoneSet.xml
│   │   │   │   ├── PosSet.xml
│   │   │   │   ├── py2phoneMap.txt
│   │   │   │   └── tonelist.txt
│   │   │   ├── Sichuan
│   │   │   │   ├── En2ChPhoneMap.txt
│   │   │   │   ├── PhoneSet.xml
│   │   │   │   ├── PosSet.xml
│   │   │   │   ├── py2phoneMap.txt
│   │   │   │   └── tonelist.txt
│   │   │   ├── WuuShanghai
│   │   │   │   ├── En2ChPhoneMap.txt
│   │   │   │   ├── PhoneSet.xml
│   │   │   │   ├── PosSet.xml
│   │   │   │   ├── py2phoneMap.txt
│   │   │   │   └── tonelist.txt
│   │   │   ├── ZhHK
│   │   │   │   ├── En2ChPhoneMap.txt
│   │   │   │   ├── PhoneSet.xml
│   │   │   │   ├── PosSet.xml
│   │   │   │   ├── py2phoneMap.txt
│   │   │   │   └── tonelist.txt
│   │   │   └── __init__.py
│   │   ├── script_convertor
│   │   │   ├── TextScriptConvertor.py
│   │   │   ├── __init__.py
│   │   │   └── core
│   │   │       ├── Phone.py
│   │   │       ├── PhoneSet.py
│   │   │       ├── Pos.py
│   │   │       ├── PosSet.py
│   │   │       ├── Script.py
│   │   │       ├── ScriptItem.py
│   │   │       ├── ScriptSentence.py
│   │   │       ├── ScriptWord.py
│   │   │       ├── Syllable.py
│   │   │       ├── SyllableFormatter.py
│   │   │       ├── XmlObj.py
│   │   │       ├── __init__.py
│   │   │       ├── core_types.py
│   │   │       └── utils.py
│   │   ├── se_processor
│   │   │   ├── D_TDNN.py
│   │   │   ├── __init__.py
│   │   │   ├── layers.py
│   │   │   └── se_processor.py
│   │   └── text_process.py
│   ├── train
│   │   ├── __init__.py
│   │   ├── loss.py
│   │   ├── scheduler.py
│   │   └── trainer.py
│   └── utils
│       ├── __init__.py
│       ├── audio_torch.py
│       ├── ling_unit
│       │   ├── __init__.py
│       │   ├── cleaners.py
│       │   ├── emotion_types.py
│       │   ├── lang_symbols.py
│       │   ├── ling_unit.py
│       │   └── numbers.py
│       ├── log.py
│       └── plot.py
├── notebooks
│   └── README.md
├── requirements.txt
├── setup.py
└── test
    └── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | #
162 | # MISC
163 | .DS_Store
164 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/ambv/black
3 | rev: 20.8b1
4 | hooks:
5 | - id: black
6 | additional_dependencies: ['click==8.0.4']
7 | - repo: https://gitlab.com/pycqa/flake8
8 | rev: 3.8.4
9 | hooks:
10 | - id: flake8
11 | args: ['--max-line-length=120', '--extend-ignore=E203']
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Alibaba Research
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # KAN-TTS
2 |
3 | With KAN-TTS you can train your own TTS model from zero to hero :).
4 |
5 | ## Models
6 | Currently we support SAM-BERT and HiFi-GAN; other models are coming soon.
7 |
8 | ## Supported Languages
9 | | Language | Model Links |
10 | | :---: | :---: |
11 | | Mandarin | https://modelscope.cn/models?name=zhcn&page=1&tasks=text-to-speech&type=audio |
12 | | English | https://modelscope.cn/models?name=enus&page=1&tasks=text-to-speech&type=audio |
13 | | British | https://modelscope.cn/models?name=engb&page=1&tasks=text-to-speech&type=audio |
14 | | Shanghainese | https://modelscope.cn/models?name=WuuShanghai&page=1&tasks=text-to-speech&type=audio |
15 | | Sichuanese | https://modelscope.cn/models?name=Sichuan&page=1&tasks=text-to-speech&type=audio |
16 | | Cantonese | https://modelscope.cn/models?name=Cantonese&page=1&tasks=text-to-speech&type=audio |
17 | | Italian | https://modelscope.cn/models?name=itit&page=1&tasks=text-to-speech&type=audio |
18 | | Spanish | https://modelscope.cn/models?name=eses&page=1&tasks=text-to-speech&type=audio |
19 | | Russian | https://modelscope.cn/models?name=ruru&page=1&tasks=text-to-speech&type=audio |
20 | | Korean | https://modelscope.cn/models?name=kokr&page=1&tasks=text-to-speech&type=audio |
21 | More languages are coming soon.
22 |
23 | ## Training Tutorial
24 | You can find the training tutorial in our wiki page [KAN-TTS Wiki](https://github.com/AlibabaResearch/KAN-TTS/wiki).
25 |
26 | ## ModelScope Demo
27 | Try our demo on ModelScope [KAN-TTS Demo](https://modelscope.cn/models?page=1&tasks=text-to-speech).
28 |
29 | ## Contribute to this repo
30 |
31 | ```shell
32 | pip install -r requirements.txt
33 | pre-commit install
34 | ```
35 |
36 | ## Contact us
37 | If you have any questions, please feel free to contact us.
38 |
39 | Scan the QR code to join our DingTalk group.
40 |
41 |
42 |
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: maas
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - audioread
8 | - cudatoolkit=10.1
9 | - ffmpeg
10 | - lame
11 | - librosa=0.9.2
12 | - libsndfile
13 | - matplotlib=3.5.1
14 | - matplotlib-base=3.5.1
15 | - numba
16 | - numpy
17 | - unidecode
18 | - inflect
19 | - numpy-base
20 | - pip
21 | - protobuf=3.20.1
22 | - pysocks=1.7.1
23 | - pysoundfile
24 | - python=3.7.13
25 | - python-dateutil=2.8.2
26 | - python_abi=3.7
27 | - pytorch=1.7.0
28 | - pywavelets=1.3.0
29 | - pyyaml=6.0
30 | - readline
31 | - scikit-learn=1.0.2
32 | - scipy=1.7.3
33 | - setuptools=61.2.0
34 | - six=1.16.0
35 | - sqlite=3.38.5
36 | - tensorboardx=2.2
37 | - threadpoolctl=3.1.0
38 | - tk=8.6.12
39 | - torchaudio=0.7.0
40 | - torchvision=0.8.0
41 | - tqdm
42 | - urllib3
43 | - wheel
44 | - yaml=0.2.5
45 | - pip:
46 | - appnope==0.1.3
47 | - backcall==0.2.0
48 | - cython==0.29.30
49 | - dataclasses==0.6
50 | - future==0.18.2
51 | - greenlet==1.1.2
52 | - ipdb
53 | - ipython
54 | - jedi==0.18.1
55 | - matplotlib-inline==0.1.3
56 | - msgpack==1.0.4
57 | - parso==0.8.3
58 | - pexpect==4.8.0
59 | - pickleshare==0.7.5
60 | - prompt-toolkit==3.0.30
61 | - ptyprocess==0.7.0
62 | - pygments==2.12.0
63 | - pysptk
64 | - git+https://github.com/fbcotter/pytorch_wavelets.git
65 | - sox
66 | - toml==0.10.2
67 | - traitlets==5.3.0
68 | - wcwidth==0.2.5
69 | - bitstring==3.1.6
70 | - --find-links https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
71 | - ttsfrd
72 |
--------------------------------------------------------------------------------
/kantts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/__init__.py
--------------------------------------------------------------------------------
/kantts/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/bin/__init__.py
--------------------------------------------------------------------------------
/kantts/bin/infer_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import torch
5 | import soundfile as sf
6 | import yaml
7 | import logging
8 | import numpy as np
9 | import time
10 | import glob
11 |
12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
14 |
15 | try:
16 | from kantts.utils.log import logging_to_file
17 | except ImportError:
18 | raise ImportError("Please install kantts.")
19 |
20 | logging.basicConfig(
21 | # filename=os.path.join(stage_dir, 'stdout.log'),
22 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
23 | datefmt="%Y-%m-%d:%H:%M:%S",
24 | level=logging.INFO,
25 | )
26 |
27 |
28 | def count_parameters(model):
29 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
30 |
31 |
32 | def load_model(ckpt, config=None):
33 | # load config if not provided
34 | if config is None:
35 | dirname = os.path.dirname(os.path.dirname(ckpt))
36 | config = os.path.join(dirname, "config.yaml")
37 | with open(config) as f:
38 | config = yaml.load(f, Loader=yaml.Loader)
39 |
40 | # lazy import to avoid a circular-import error
41 | from kantts.models.hifigan.hifigan import Generator
42 |
43 | model = Generator(**config["Model"]["Generator"]["params"])
44 | states = torch.load(ckpt, map_location="cpu")
45 | model.load_state_dict(states["model"]["generator"])
46 |
47 | # add pqmf if needed
48 | if config["Model"]["Generator"]["params"]["out_channels"] > 1:
49 | # lazy import to avoid a circular-import error
50 | from kantts.models.pqmf import PQMF
51 |
52 | model.pqmf = PQMF()
53 |
54 | return model
55 |
56 |
57 | def binarize(mel, threshold=0.6):
58 | # vuv binarize
59 | res_mel = mel.copy()
60 | index = np.where(mel[:, -1] < threshold)[0]
61 | res_mel[:, -1] = 1.0
62 | res_mel[:, -1][index] = 0.0
63 | return res_mel
64 |
65 |
66 | def hifigan_infer(input_mel, ckpt_path, output_dir, config=None):
67 | if not torch.cuda.is_available():
68 | device = torch.device("cpu")
69 | else:
70 | torch.backends.cudnn.benchmark = True
71 | device = torch.device("cuda", 0)
72 |
73 | if config is not None:
74 | with open(config, "r") as f:
75 | config = yaml.load(f, Loader=yaml.Loader)
76 | else:
77 | config_path = os.path.join(
78 | os.path.dirname(os.path.dirname(ckpt_path)), "config.yaml"
79 | )
80 | if not os.path.exists(config_path):
81 | raise ValueError("config file not found: {}".format(config_path))
82 | with open(config_path, "r") as f:
83 | config = yaml.load(f, Loader=yaml.Loader)
84 |
85 | for key, value in config.items():
86 | logging.info(f"{key} = {value}")
87 |
88 | # check directory existence
89 | if not os.path.exists(output_dir):
90 | os.makedirs(output_dir)
91 |
92 | logging_to_file(os.path.join(output_dir, "stdout.log"))
93 |
94 | if os.path.isfile(input_mel):
95 | mel_lst = [input_mel]
96 | elif os.path.isdir(input_mel):
97 | mel_lst = glob.glob(os.path.join(input_mel, "*.npy"))
98 | else:
99 | raise ValueError("input_mel should be a file or a directory")
100 |
101 | model = load_model(ckpt_path, config)
102 |
103 | logging.info(f"Loaded model parameters from {ckpt_path}.")
104 | model.remove_weight_norm()
105 | model = model.eval().to(device)
106 |
107 | with torch.no_grad():
108 | start = time.time()
109 | pcm_len = 0
110 | for mel in mel_lst:
111 | utt_id = os.path.splitext(os.path.basename(mel))[0]
112 | mel_data = np.load(mel)
113 | if model.nsf_enable:
114 | mel_data = binarize(mel_data)
115 | # generate
116 | mel_data = torch.tensor(mel_data, dtype=torch.float).to(device)
117 | # (T, C) -> (B, C, T)
118 | mel_data = mel_data.transpose(1, 0).unsqueeze(0)
119 | y = model(mel_data)
120 | if hasattr(model, "pqmf"):
121 | y = model.pqmf.synthesis(y)
122 | y = y.view(-1).cpu().numpy()
123 | pcm_len += len(y)
124 |
125 | # save as PCM 16 bit wav file
126 | sf.write(
127 | os.path.join(output_dir, f"{utt_id}_gen.wav"),
128 | y,
129 | config["audio_config"]["sampling_rate"],
130 | "PCM_16",
131 | )
132 | rtf = (time.time() - start) / (
133 | pcm_len / config["audio_config"]["sampling_rate"]
134 | )
135 |
136 | # report average RTF
137 | logging.info(
138 | f"Finished generation of {len(mel_lst)} utterances (RTF = {rtf:.03f})."
139 | )
140 |
141 |
142 | if __name__ == "__main__":
143 | parser = argparse.ArgumentParser(description="Infer hifigan model")
144 | parser.add_argument(
145 | "--ckpt", type=str, required=True, help="Path to model checkpoint"
146 | )
147 | parser.add_argument(
148 | "--input_mel",
149 | type=str,
150 | required=True,
151 | help="Path to input mel file or directory containing mel files",
152 | )
153 | parser.add_argument(
154 | "--output_dir", type=str, required=True, help="Path to output directory"
155 | )
156 | parser.add_argument("--config", type=str, default=None, help="Path to config file")
157 | args = parser.parse_args()
158 | hifigan_infer(
159 | args.input_mel,
160 | args.ckpt,
161 | args.output_dir,
162 | args.config,
163 | )
164 |
--------------------------------------------------------------------------------
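A minimal usage sketch for the vocoder entry point above. The paths are hypothetical placeholders; when no explicit config is passed, `hifigan_infer` looks for `config.yaml` two directories above the checkpoint.

```python
# Hypothetical paths -- adjust to your experiment layout.
from kantts.bin.infer_hifigan import hifigan_infer

hifigan_infer(
    input_mel="exp/voc/test_mels",                    # a .npy file or a directory of (T, C) .npy mels
    ckpt_path="exp/voc/ckpt/checkpoint_2500000.pth",  # config.yaml resolved from the grandparent dir
    output_dir="exp/voc/output",                      # writes one <utt_id>_gen.wav per mel
)
```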
/kantts/bin/text_to_wav.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import yaml
5 | import logging
6 | import zipfile
7 | from glob import glob
8 | import soundfile as sf
9 | import numpy as np
10 |
11 |
12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
14 |
15 | try:
16 | from kantts.bin.infer_sambert import am_infer
17 | from kantts.bin.infer_hifigan import hifigan_infer
18 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols
19 | except ImportError:
20 | raise ImportError("Please install kantts.")
21 |
22 | logging.basicConfig(
23 | # filename=os.path.join(stage_dir, 'stdout.log'),
24 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
25 | datefmt="%Y-%m-%d:%H:%M:%S",
26 | level=logging.INFO,
27 | )
28 |
29 |
30 | def concat_process(chunked_dir, output_dir):
31 | wav_files = sorted(glob(os.path.join(chunked_dir, "*.wav")))
32 | print(wav_files)
33 | sentence_sil = 0.28 # seconds
34 | end_sil = 0.05 # seconds
35 |
36 | cnt = 0
37 | wav_concat = None
38 | main_id, sub_id = 0, 0
39 |
40 | while cnt < len(wav_files):
41 | wav_file = os.path.join(
42 | chunked_dir, "{}_{}_mel_gen.wav".format(main_id, sub_id)
43 | )
44 | if os.path.exists(wav_file):
45 | wav, sr = sf.read(wav_file)
46 | sentence_sil_samples = int(sentence_sil * sr)
47 | end_sil_samples = int(end_sil * sr)
48 | if sub_id == 0:
49 | wav_concat = wav
50 | else:
51 | wav_concat = np.concatenate(
52 | (wav_concat, np.zeros(sentence_sil_samples), wav), axis=0
53 | )
54 |
55 | sub_id += 1
56 | cnt += 1
57 | else:
58 | if wav_concat is not None:
59 | wav_concat = np.concatenate(
60 | (wav_concat, np.zeros(end_sil_samples)), axis=0
61 | )
62 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr)
63 |
64 | main_id += 1
65 | sub_id = 0
66 | wav_concat = None
67 |
68 | if cnt == len(wav_files):
69 | wav_concat = np.concatenate((wav_concat, np.zeros(end_sil_samples)), axis=0)
70 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr)
71 |
72 |
73 | def text_to_wav(
74 | text_file,
75 | output_dir,
76 | resources_zip_file,
77 | am_ckpt,
78 | voc_ckpt,
79 | speaker=None,
80 | se_file=None,
81 | lang="PinYin",
82 | ):
83 | os.makedirs(output_dir, exist_ok=True)
84 | os.makedirs(os.path.join(output_dir, "res_wavs"), exist_ok=True)
85 |
86 | resource_root_dir = os.path.dirname(resources_zip_file)
87 | resource_dir = os.path.join(resource_root_dir, "resource")
88 |
89 | if not os.path.exists(resource_dir):
90 | logging.info("Extracting resources...")
91 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
92 | zip_ref.extractall(resource_root_dir)
93 |
94 | with open(text_file, "r") as text_data:
95 | texts = text_data.readlines()
96 |
97 | logging.info("Converting text to symbols...")
98 | am_config = os.path.join(os.path.dirname(os.path.dirname(am_ckpt)), "config.yaml")
99 | with open(am_config, "r") as f:
100 | config = yaml.load(f, Loader=yaml.Loader)
101 | if speaker is None:
102 | speaker = config["linguistic_unit"]["speaker_list"].split(",")[0]
103 | symbols_lst = text_to_symbols(texts, resource_dir, speaker, lang)
104 | symbols_file = os.path.join(output_dir, "symbols.lst")
105 | with open(symbols_file, "w") as symbol_data:
106 | for symbol in symbols_lst:
107 | symbol_data.write(symbol)
108 |
109 | logging.info("AM is infering...")
110 | am_infer(symbols_file, am_ckpt, output_dir, se_file)
111 |
112 | logging.info("Vocoder is infering...")
113 | hifigan_infer(os.path.join(output_dir, "feat"), voc_ckpt, output_dir)
114 |
115 | concat_process(output_dir, os.path.join(output_dir, "res_wavs"))
116 |
117 | logging.info("Text to wav finished!")
118 |
119 |
120 | if __name__ == "__main__":
121 | parser = argparse.ArgumentParser(description="Text to wav")
122 | parser.add_argument("--txt", type=str, required=True, help="Path to text file")
123 | parser.add_argument(
124 | "--output_dir", type=str, required=True, help="Path to output directory"
125 | )
126 | parser.add_argument(
127 | "--res_zip", type=str, required=True, help="Path to resource zip file"
128 | )
129 | parser.add_argument(
130 | "--am_ckpt", type=str, required=True, help="Path to am ckpt file"
131 | )
132 | parser.add_argument(
133 | "--voc_ckpt", type=str, required=True, help="Path to voc ckpt file"
134 | )
135 | parser.add_argument(
136 | "--speaker",
137 | type=str,
138 | required=False,
139 | default=None,
140 | help="The speaker name, default is the first speaker",
141 | )
142 | parser.add_argument(
143 | "--se_file",
144 | type=str,
145 | required=False,
146 | default=None,
147 | help="The speaker embedding file , default is None",
148 | )
149 | parser.add_argument(
150 | "--lang",
151 | type=str,
152 | default="PinYin",
153 | help="""The language of the text, default is PinYin, other options are:
154 | English,
155 | British,
156 | ZhHK,
157 | WuuShanghai,
158 | Sichuan,
159 | Indonesian,
160 | Malay,
161 | Filipino,
162 | Vietnamese,
163 | Korean,
164 | Russian
165 | """,
166 | )
167 | args = parser.parse_args()
168 | text_to_wav(
169 | args.txt,
170 | args.output_dir,
171 | args.res_zip,
172 | args.am_ckpt,
173 | args.voc_ckpt,
174 | args.speaker,
175 | args.se_file,
176 | args.lang,
177 | )
178 |
--------------------------------------------------------------------------------
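For reference, a sketch of the full text-to-speech pipeline driven by the function above. Every path here is a placeholder; the resource zip is the language front-end bundle, which is extracted next to itself on first use.

```python
from kantts.bin.text_to_wav import text_to_wav

# Placeholder paths; see the argparse help above for the CLI equivalents.
text_to_wav(
    text_file="demo.txt",                        # plain text, one sentence per line
    output_dir="exp/tts_output",                 # final wavs land in exp/tts_output/res_wavs
    resources_zip_file="resource/resource.zip",  # front-end resources, unzipped on first run
    am_ckpt="exp/am/ckpt/checkpoint_980000.pth",
    voc_ckpt="exp/voc/ckpt/checkpoint_2500000.pth",
    speaker=None,                                # None -> first speaker in the AM config
    lang="PinYin",
)
```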
/kantts/configs/audio_config_16k.yaml:
--------------------------------------------------------------------------------
1 | # Audio processing configs
2 |
3 | audio_config:
4 | # Preprocess
5 | wav_normalize: True
6 | trim_silence: True
7 | trim_silence_threshold_db: 60
8 | preemphasize: False
9 |
10 | # Feature extraction
11 | sampling_rate: 16000
12 | hop_length: 200
13 | win_length: 1000
14 | n_fft: 2048
15 | n_mels: 80
16 | fmin: 0.0
17 | fmax: 8000.0
18 | phone_level_feature: True
19 |
20 | # Normalization
21 | norm_type: "mean_std" # "mean_std" or "global"
22 | max_norm: 1.0
23 | symmetric: False
24 | min_level_db: -100.0
25 | ref_level_db: 20
26 |
27 | num_workers: 16
28 |
--------------------------------------------------------------------------------
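As a sketch of what these feature-extraction numbers mean in time units (values read straight from the config above; the path assumes you run from the repository root):

```python
import yaml

# Frame timing implied by audio_config_16k.yaml.
with open("kantts/configs/audio_config_16k.yaml") as f:
    cfg = yaml.safe_load(f)["audio_config"]

hop_s = cfg["hop_length"] / cfg["sampling_rate"]  # 200 / 16000 = 12.5 ms hop
win_s = cfg["win_length"] / cfg["sampling_rate"]  # 1000 / 16000 = 62.5 ms window
print(f"{1 / hop_s:.0f} mel frames per second")   # -> 80
```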
/kantts/configs/audio_config_24k.yaml:
--------------------------------------------------------------------------------
1 | # Audio processing configs
2 |
3 | audio_config:
4 | # Preprocess
5 | wav_normalize: True
6 | trim_silence: True
7 | trim_silence_threshold_db: 60
8 | preemphasize: False
9 |
10 | # Feature extraction
11 | sampling_rate: 24000
12 | hop_length: 240
13 | win_length: 1024
14 | n_fft: 1024
15 | n_mels: 80
16 | fmin: 50.0
17 | fmax: 8000.0
18 | phone_level_feature: True
19 |
20 | # Normalization
21 | norm_type: "mean_std" # "mean_std" or "global"
22 | max_norm: 1.0
23 | symmetric: False
24 | min_level_db: -100.0
25 | ref_level_db: 20
26 |
27 | num_workers: 16
28 |
--------------------------------------------------------------------------------
/kantts/configs/audio_config_48k.yaml:
--------------------------------------------------------------------------------
1 | # Audio processing configs
2 |
3 | audio_config:
4 | # Preprocess
5 | wav_normalize: True
6 | trim_silence: True
7 | trim_silence_threshold_db: 60
8 | preemphasize: False
9 |
10 | # Feature extraction
11 | sampling_rate: 48000
12 | hop_length: 600
13 | win_length: 2400
14 | n_fft: 4096
15 | n_mels: 128
16 | fmin: 0.0
17 | fmax: 12000.0
18 | phone_level_feature: True
19 |
20 | # Normalization
21 | norm_type: "mean_std" # "mean_std" or "global"
22 | max_norm: 1.0
23 | symmetric: False
24 | min_level_db: -100.0
25 | ref_level_db: 20
26 |
27 | num_workers: 16
28 |
--------------------------------------------------------------------------------
/kantts/configs/audio_config_8k.yaml:
--------------------------------------------------------------------------------
1 | # Audio processing configs
2 |
3 | audio_config:
4 | # Preprocess
5 | wav_normalize: True
6 | trim_silence: True
7 | trim_silence_threshold_db: 60
8 | preemphasize: False
9 |
10 | # Feature extraction
11 | sampling_rate: 8000
12 | hop_length: 100
13 | win_length: 600
14 | n_fft: 2048
15 | n_mels: 80
16 | fmin: 0.0
17 | fmax: 4000.0
18 | phone_level_feature: True
19 |
20 | # Normalization
21 | norm_type: "mean_std" # "mean_std" or "global"
22 | max_norm: 1.0
23 | symmetric: False
24 | min_level_db: -100.0
25 | ref_level_db: 20
26 |
27 | num_workers: 16
28 |
29 |
--------------------------------------------------------------------------------
/kantts/configs/audio_config_se_16k.yaml:
--------------------------------------------------------------------------------
1 | # Audio processing configs
2 |
3 | audio_config:
4 | # Preprocess
5 | wav_normalize: True
6 | trim_silence: True
7 | trim_silence_threshold_db: 60
8 | preemphasize: False
9 |
10 | # Feature extraction
11 | sampling_rate: 16000
12 | hop_length: 200
13 | win_length: 1000
14 | n_fft: 2048
15 | n_mels: 80
16 | fmin: 0.0
17 | fmax: 8000.0
18 | phone_level_feature: True
19 | se_feature: True
20 |
21 | # Normalization
22 | norm_type: "mean_std" # "mean_std" or "global"
23 | max_norm: 1.0
24 | symmetric: False
25 | min_level_db: -100.0
26 | ref_level_db: 20
27 |
28 | num_workers: 16
29 |
--------------------------------------------------------------------------------
/kantts/configs/hifigan_noncausal_v1_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 80
9 | out_channels: 1
10 | channels: 256
11 | kernel_size: 7
12 | upsample_scales: [10, 5, 2, 2]
13 | upsample_kernal_sizes: [20, 11, 4, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5, 7]
17 | - [1, 3, 5, 7]
18 | - [1, 3, 5, 7]
19 | bias: true
20 | causal: false
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | optimizer:
26 | type: Adam
27 | params:
28 | lr: 2.0e-4
29 | betas: [0.5, 0.9]
30 | weight_decay: 0.0
31 | scheduler:
32 | type: MultiStepLR
33 | params:
34 | gamma: 0.5
35 | milestones:
36 | - 200000
37 | - 400000
38 | - 600000
39 | - 800000
40 |
41 | ###########################################################
42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
43 | ###########################################################
44 | MultiScaleDiscriminator:
45 | params:
46 | scales: 3
47 | downsample_pooling: "DWT"
48 | downsample_pooling_params:
49 | kernel_size: 4
50 | stride: 2
51 | padding: 2
52 | discriminator_params:
53 | in_channels: 1
54 | out_channels: 1
55 | kernel_sizes: [15, 41, 5, 3]
56 | channels: 128
57 | max_downsample_channels: 1024
58 | max_groups: 16
59 | bias: true
60 | downsample_scales: [4, 4, 4, 4, 1]
61 | nonlinear_activation: "LeakyReLU"
62 | nonlinear_activation_params:
63 | negative_slope: 0.1
64 | follow_official_norm: true
65 | optimizer:
66 | type: Adam
67 | params:
68 | lr: 2.0e-4
69 | betas: [0.5, 0.9]
70 | weight_decay: 0.0
71 | scheduler:
72 | type: MultiStepLR
73 | params:
74 | gamma: 0.5
75 | milestones:
76 | - 200000
77 | - 400000
78 | - 600000
79 | - 800000
80 |
81 | MultiPeriodDiscriminator:
82 | params:
83 | periods: [2, 3, 5, 7, 11]
84 | discriminator_params:
85 | in_channels: 1
86 | out_channels: 1
87 | kernel_sizes: [5, 3]
88 | channels: 32
89 | downsample_scales: [3, 3, 3, 3, 1]
90 | max_downsample_channels: 1024
91 | bias: true
92 | nonlinear_activation: "LeakyReLU"
93 | nonlinear_activation_params:
94 | negative_slope: 0.1
95 | use_spectral_norm: false
96 | optimizer:
97 | type: Adam
98 | params:
99 | lr: 2.0e-4
100 | betas: [0.5, 0.9]
101 | weight_decay: 0.0
102 | scheduler:
103 | type: MultiStepLR
104 | params:
105 | gamma: 0.5
106 | milestones:
107 | - 200000
108 | - 400000
109 | - 600000
110 | - 800000
111 |
112 | ####################################################
113 | # LOSS SETTING #
114 | ####################################################
115 | Loss:
116 | generator_adv_loss:
117 | enable: True
118 | params:
119 | average_by_discriminators: False
120 | weights: 1.0
121 |
122 | discriminator_adv_loss:
123 | enable: True
124 | params:
125 | average_by_discriminators: False
126 | weights: 1.0
127 |
128 | stft_loss:
129 | enable: False # Whether to use multi-resolution STFT loss.
130 |
131 | mel_loss:
132 | enable: True
133 | params:
134 | fs: 16000
135 | fft_size: 2048
136 | hop_size: 200
137 | win_length: 1000
138 | window: "hann"
139 | num_mels: 80
140 | fmin: 0
141 | fmax: 8000
142 | log_base: null
143 | weights: 45.0
144 |
145 | subband_stft_loss:
146 | enable: False
147 | params:
148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
151 | window: "hann_window" # Window function for STFT-based loss
152 |
153 | feat_match_loss:
154 | enable: True
155 | params:
156 | average_by_discriminators: false
157 | average_by_layers: false
158 | weights: 2.0
159 |
160 |
161 | ###########################################################
162 | # DATA LOADER SETTING #
163 | ###########################################################
164 | batch_size: 16
165 | batch_max_steps: 9600 # Length of each audio in a batch. Make sure it is divisible by hop_size.
166 | pin_memory: True
167 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
168 | remove_short_samples: False
169 | allow_cache: True
170 |
171 | generator_grad_norm: -1
172 |
173 | discriminator_grad_norm: -1
174 |
175 | ###########################################################
176 | # INTERVAL SETTING #
177 | ###########################################################
178 | generator_train_start_steps: 1 # Number of steps to start to train generator.
179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
180 | train_max_steps: 2500000 # Number of training steps.
181 | save_interval_steps: 20000 # Interval steps to save checkpoint.
182 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
183 | log_interval_steps: 1000 # Interval steps to record the training log.
184 |
185 | ###########################################################
186 | # OTHER SETTING #
187 | ###########################################################
188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
189 |
--------------------------------------------------------------------------------
/kantts/configs/hifigan_v1_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 80
9 | out_channels: 1
10 | channels: 256
11 | kernel_size: 7
12 | upsample_scales: [10, 5, 2, 2]
13 | upsample_kernal_sizes: [20, 10, 4, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5, 7]
17 | - [1, 3, 5, 7]
18 | - [1, 3, 5, 7]
19 | bias: true
20 | causal: true
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | optimizer:
26 | type: Adam
27 | params:
28 | lr: 2.0e-4
29 | betas: [0.5, 0.9]
30 | weight_decay: 0.0
31 | scheduler:
32 | type: MultiStepLR
33 | params:
34 | gamma: 0.5
35 | milestones:
36 | - 200000
37 | - 400000
38 | - 600000
39 | - 800000
40 |
41 | ###########################################################
42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
43 | ###########################################################
44 | MultiScaleDiscriminator:
45 | params:
46 | scales: 3
47 | downsample_pooling: "DWT"
48 | downsample_pooling_params:
49 | kernel_size: 4
50 | stride: 2
51 | padding: 2
52 | discriminator_params:
53 | in_channels: 1
54 | out_channels: 1
55 | kernel_sizes: [15, 41, 5, 3]
56 | channels: 128
57 | max_downsample_channels: 1024
58 | max_groups: 16
59 | bias: true
60 | downsample_scales: [4, 4, 4, 4, 1]
61 | nonlinear_activation: "LeakyReLU"
62 | nonlinear_activation_params:
63 | negative_slope: 0.1
64 | follow_official_norm: true
65 | optimizer:
66 | type: Adam
67 | params:
68 | lr: 2.0e-4
69 | betas: [0.5, 0.9]
70 | weight_decay: 0.0
71 | scheduler:
72 | type: MultiStepLR
73 | params:
74 | gamma: 0.5
75 | milestones:
76 | - 200000
77 | - 400000
78 | - 600000
79 | - 800000
80 |
81 | MultiPeriodDiscriminator:
82 | params:
83 | periods: [2, 3, 5, 7, 11]
84 | discriminator_params:
85 | in_channels: 1
86 | out_channels: 1
87 | kernel_sizes: [5, 3]
88 | channels: 32
89 | downsample_scales: [3, 3, 3, 3, 1]
90 | max_downsample_channels: 1024
91 | bias: true
92 | nonlinear_activation: "LeakyReLU"
93 | nonlinear_activation_params:
94 | negative_slope: 0.1
95 | use_spectral_norm: false
96 | optimizer:
97 | type: Adam
98 | params:
99 | lr: 2.0e-4
100 | betas: [0.5, 0.9]
101 | weight_decay: 0.0
102 | scheduler:
103 | type: MultiStepLR
104 | params:
105 | gamma: 0.5
106 | milestones:
107 | - 200000
108 | - 400000
109 | - 600000
110 | - 800000
111 |
112 | ####################################################
113 | # LOSS SETTING #
114 | ####################################################
115 | Loss:
116 | generator_adv_loss:
117 | enable: True
118 | params:
119 | average_by_discriminators: False
120 | weights: 1.0
121 |
122 | discriminator_adv_loss:
123 | enable: True
124 | params:
125 | average_by_discriminators: False
126 | weights: 1.0
127 |
128 | stft_loss:
129 | enable: False # Whether to use multi-resolution STFT loss.
130 |
131 | mel_loss:
132 | enable: True
133 | params:
134 | fs: 16000
135 | fft_size: 2048
136 | hop_size: 200
137 | win_length: 1000
138 | window: "hann"
139 | num_mels: 80
140 | fmin: 0
141 | fmax: 8000
142 | log_base: null
143 | weights: 45.0
144 |
145 | subband_stft_loss:
146 | enable: False
147 | params:
148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
151 | window: "hann_window" # Window function for STFT-based loss
152 |
153 | feat_match_loss:
154 | enable: True
155 | params:
156 | average_by_discriminators: false
157 | average_by_layers: false
158 | weights: 2.0
159 |
160 |
161 | ###########################################################
162 | # DATA LOADER SETTING #
163 | ###########################################################
164 | batch_size: 16
165 | batch_max_steps: 9600 # Length of each audio in a batch. Make sure it is divisible by hop_size.
166 | pin_memory: True
167 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
168 | remove_short_samples: False
169 | allow_cache: True
170 |
171 | generator_grad_norm: -1
172 |
173 | discriminator_grad_norm: -1
174 |
175 | ###########################################################
176 | # INTERVAL SETTING #
177 | ###########################################################
178 | generator_train_start_steps: 1 # Number of steps to start to train generator.
179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
180 | train_max_steps: 2500000 # Number of training steps.
181 | save_interval_steps: 20000 # Interval steps to save checkpoint.
182 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
183 | log_interval_steps: 1000 # Interval steps to record the training log.
184 |
185 | ###########################################################
186 | # OTHER SETTING #
187 | ###########################################################
188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
189 |
--------------------------------------------------------------------------------
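One invariant worth checking when editing these generator settings: the product of `upsample_scales` must equal the mel hop length (200 samples at 16 kHz here), otherwise the generated waveform length will not line up with the input mel. A small sketch, assuming the config paths below are run from the repository root:

```python
import yaml

with open("kantts/configs/hifigan_v1_16k.yaml") as f:
    voc = yaml.safe_load(f)
with open("kantts/configs/audio_config_16k.yaml") as f:
    audio = yaml.safe_load(f)

# Total upsampling factor of the generator stack.
total = 1
for s in voc["Model"]["Generator"]["params"]["upsample_scales"]:
    total *= s  # 10 * 5 * 2 * 2 = 200

assert total == audio["audio_config"]["hop_length"]  # 200 samples per mel frame
```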
/kantts/configs/hifigan_v1_24k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 80
9 | out_channels: 1
10 | channels: 512
11 | kernel_size: 7
12 | upsample_scales: [8, 5, 3, 2]
13 | upsample_kernal_sizes: [16, 10, 6, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5]
17 | - [1, 3, 5]
18 | - [1, 3, 5]
19 | bias: true
20 | causal: true
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | optimizer:
26 | type: Adam
27 | params:
28 | lr: 2.0e-4
29 | betas: [0.5, 0.9]
30 | weight_decay: 0.0
31 | scheduler:
32 | type: MultiStepLR
33 | params:
34 | gamma: 0.5
35 | milestones:
36 | - 200000
37 | - 400000
38 | - 600000
39 | - 800000
40 |
41 | ###########################################################
42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
43 | ###########################################################
44 | MultiScaleDiscriminator:
45 | params:
46 | scales: 3
47 | downsample_pooling: "DWT"
48 | downsample_pooling_params:
49 | kernel_size: 4
50 | stride: 2
51 | padding: 2
52 | discriminator_params:
53 | in_channels: 1
54 | out_channels: 1
55 | kernel_sizes: [15, 41, 5, 3]
56 | channels: 128
57 | max_downsample_channels: 1024
58 | max_groups: 16
59 | bias: true
60 | downsample_scales: [4, 4, 4, 4, 1]
61 | nonlinear_activation: "LeakyReLU"
62 | nonlinear_activation_params:
63 | negative_slope: 0.1
64 | follow_official_norm: true
65 | optimizer:
66 | type: Adam
67 | params:
68 | lr: 2.0e-4
69 | betas: [0.5, 0.9]
70 | weight_decay: 0.0
71 | scheduler:
72 | type: MultiStepLR
73 | params:
74 | gamma: 0.5
75 | milestones:
76 | - 200000
77 | - 400000
78 | - 600000
79 | - 800000
80 |
81 | MultiPeriodDiscriminator:
82 | params:
83 | periods: [2, 3, 5, 7, 11]
84 | discriminator_params:
85 | in_channels: 1
86 | out_channels: 1
87 | kernel_sizes: [5, 3]
88 | channels: 32
89 | downsample_scales: [3, 3, 3, 3, 1]
90 | max_downsample_channels: 1024
91 | bias: true
92 | nonlinear_activation: "LeakyReLU"
93 | nonlinear_activation_params:
94 | negative_slope: 0.1
95 | use_spectral_norm: false
96 | optimizer:
97 | type: Adam
98 | params:
99 | lr: 2.0e-4
100 | betas: [0.5, 0.9]
101 | weight_decay: 0.0
102 | scheduler:
103 | type: MultiStepLR
104 | params:
105 | gamma: 0.5
106 | milestones:
107 | - 200000
108 | - 400000
109 | - 600000
110 | - 800000
111 |
112 | ####################################################
113 | # LOSS SETTING #
114 | ####################################################
115 | Loss:
116 | generator_adv_loss:
117 | enable: True
118 | params:
119 | average_by_discriminators: False
120 | weights: 1.0
121 |
122 | discriminator_adv_loss:
123 | enable: True
124 | params:
125 | average_by_discriminators: False
126 | weights: 1.0
127 |
128 | stft_loss:
129 | enable: False # Whether to use multi-resolution STFT loss.
130 |
131 | mel_loss:
132 | enable: True
133 | params:
134 | fs: 24000
135 | fft_size: 1024
136 | hop_size: 240
137 | win_length: 1024
138 | window: "hann"
139 | num_mels: 80
140 | fmin: 0
141 | fmax: 8000
142 | log_base: null
143 | weights: 45.0
144 |
145 | subband_stft_loss:
146 | enable: False
147 | params:
148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
151 | window: "hann_window" # Window function for STFT-based loss
152 |
153 | feat_match_loss:
154 | enable: True
155 | params:
156 | average_by_discriminators: false
157 | average_by_layers: false
158 | weights: 2.0
159 |
160 |
161 | ###########################################################
162 | # DATA LOADER SETTING #
163 | ###########################################################
164 | batch_size: 16
165 | batch_max_steps: 9600 # Length of each audio in a batch. Make sure it is divisible by hop_size.
166 | pin_memory: True
167 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
168 | remove_short_samples: False
169 | allow_cache: True
170 |
171 | generator_grad_norm: -1
172 |
173 | discriminator_grad_norm: -1
174 |
175 | ###########################################################
176 | # INTERVAL SETTING #
177 | ###########################################################
178 | generator_train_start_steps: 1 # Number of steps to start to train generator.
179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
180 | train_max_steps: 2500000 # Number of training steps.
181 | save_interval_steps: 20000 # Interval steps to save checkpoint.
182 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
183 | log_interval_steps: 1000 # Interval steps to record the training log.
184 |
185 | ###########################################################
186 | # OTHER SETTING #
187 | ###########################################################
188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
189 |
--------------------------------------------------------------------------------
/kantts/configs/hifigan_v1_48k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 128
9 | out_channels: 1
10 | channels: 512
11 | kernel_size: 7
12 | upsample_scales: [10, 5, 3, 2, 2]
13 | upsample_kernal_sizes: [20, 10, 6, 4, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5, 7]
17 | - [1, 3, 5, 7]
18 | - [1, 3, 5, 7]
19 | bias: true
20 | causal: true
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | optimizer:
26 | type: Adam
27 | params:
28 | lr: 2.0e-4
29 | betas: [0.5, 0.9]
30 | weight_decay: 0.0
31 | scheduler:
32 | type: MultiStepLR
33 | params:
34 | gamma: 0.5
35 | milestones:
36 | - 200000
37 | - 400000
38 | - 600000
39 | - 800000
40 |
41 | ###########################################################
42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
43 | ###########################################################
44 | MultiScaleDiscriminator:
45 | params:
46 | scales: 3
47 | downsample_pooling: "DWT"
48 | downsample_pooling_params:
49 | kernel_size: 4
50 | stride: 2
51 | padding: 2
52 | discriminator_params:
53 | in_channels: 1
54 | out_channels: 1
55 | kernel_sizes: [15, 41, 5, 3]
56 | channels: 128
57 | max_downsample_channels: 1024
58 | max_groups: 16
59 | bias: true
60 | downsample_scales: [4, 4, 4, 4, 1]
61 | nonlinear_activation: "LeakyReLU"
62 | nonlinear_activation_params:
63 | negative_slope: 0.1
64 | follow_official_norm: true
65 | optimizer:
66 | type: Adam
67 | params:
68 | lr: 2.0e-4
69 | betas: [0.5, 0.9]
70 | weight_decay: 0.0
71 | scheduler:
72 | type: MultiStepLR
73 | params:
74 | gamma: 0.5
75 | milestones:
76 | - 200000
77 | - 400000
78 | - 600000
79 | - 800000
80 |
81 | MultiPeriodDiscriminator:
82 | params:
83 | periods: [2, 3, 5, 7, 11]
84 | discriminator_params:
85 | in_channels: 1
86 | out_channels: 1
87 | kernel_sizes: [5, 3]
88 | channels: 32
89 | downsample_scales: [3, 3, 3, 3, 1]
90 | max_downsample_channels: 1024
91 | bias: true
92 | nonlinear_activation: "LeakyReLU"
93 | nonlinear_activation_params:
94 | negative_slope: 0.1
95 | use_spectral_norm: false
96 | optimizer:
97 | type: Adam
98 | params:
99 | lr: 2.0e-4
100 | betas: [0.5, 0.9]
101 | weight_decay: 0.0
102 | scheduler:
103 | type: MultiStepLR
104 | params:
105 | gamma: 0.5
106 | milestones:
107 | - 200000
108 | - 400000
109 | - 600000
110 | - 800000
111 |
112 | ####################################################
113 | # LOSS SETTING #
114 | ####################################################
115 | Loss:
116 | generator_adv_loss:
117 | enable: True
118 | params:
119 | average_by_discriminators: False
120 | weights: 1.0
121 |
122 | discriminator_adv_loss:
123 | enable: True
124 | params:
125 | average_by_discriminators: False
126 | weights: 1.0
127 |
128 | stft_loss:
129 | enable: False # Whether to use multi-resolution STFT loss.
130 |
131 | mel_loss:
132 | enable: True
133 | params:
134 | fs: 48000
135 | fft_size: 4096
136 | hop_size: 600
137 | win_length: 2400
138 | window: "hann"
139 | num_mels: 128
140 | fmin: 0
141 | fmax: 12000
142 | log_base: null
143 | weights: 45.0
144 |
145 | subband_stft_loss:
146 | enable: False
147 | params:
148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
151 | window: "hann_window" # Window function for STFT-based loss
152 |
153 | feat_match_loss:
154 | enable: True
155 | params:
156 | average_by_discriminators: false
157 | average_by_layers: false
158 | weights: 2.0
159 |
160 |
161 | ###########################################################
162 | # DATA LOADER SETTING #
163 | ###########################################################
164 | batch_size: 16
165 | batch_max_steps: 19200 # Length of each audio in a batch. Make sure it is divisible by hop_size.
166 | pin_memory: True
167 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
168 | remove_short_samples: False
169 | allow_cache: True
170 |
171 | generator_grad_norm: -1
172 |
173 | discriminator_grad_norm: -1
174 |
175 | ###########################################################
176 | # INTERVAL SETTING #
177 | ###########################################################
178 | generator_train_start_steps: 1 # Number of steps to start to train generator.
179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
180 | train_max_steps: 2500000 # Number of training steps.
181 | save_interval_steps: 20000 # Interval steps to save checkpoint.
182 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
183 | log_interval_steps: 1000 # Interval steps to record the training log.
184 |
185 | ###########################################################
186 | # OTHER SETTING #
187 | ###########################################################
188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
189 |
--------------------------------------------------------------------------------
/kantts/configs/hifigan_v1_8k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 80
9 | out_channels: 1
10 | channels: 256
11 | kernel_size: 7
12 | upsample_scales: [5, 5, 2, 2]
13 | upsample_kernal_sizes: [10, 10, 4, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5, 7]
17 | - [1, 3, 5, 7]
18 | - [1, 3, 5, 7]
19 | bias: true
20 | causal: true
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | optimizer:
26 | type: Adam
27 | params:
28 | lr: 2.0e-4
29 | betas: [0.5, 0.9]
30 | weight_decay: 0.0
31 | scheduler:
32 | type: MultiStepLR
33 | params:
34 | gamma: 0.5
35 | milestones:
36 | - 200000
37 | - 400000
38 | - 600000
39 | - 800000
40 |
41 | ###########################################################
42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
43 | ###########################################################
44 | MultiScaleDiscriminator:
45 | params:
46 | scales: 3
47 | downsample_pooling: "DWT"
48 | downsample_pooling_params:
49 | kernel_size: 4
50 | stride: 2
51 | padding: 2
52 | discriminator_params:
53 | in_channels: 1
54 | out_channels: 1
55 | kernel_sizes: [15, 41, 5, 3]
56 | channels: 128
57 | max_downsample_channels: 1024
58 | max_groups: 16
59 | bias: true
60 | downsample_scales: [4, 4, 4, 4, 1]
61 | nonlinear_activation: "LeakyReLU"
62 | nonlinear_activation_params:
63 | negative_slope: 0.1
64 | follow_official_norm: true
65 | optimizer:
66 | type: Adam
67 | params:
68 | lr: 2.0e-4
69 | betas: [0.5, 0.9]
70 | weight_decay: 0.0
71 | scheduler:
72 | type: MultiStepLR
73 | params:
74 | gamma: 0.5
75 | milestones:
76 | - 200000
77 | - 400000
78 | - 600000
79 | - 800000
80 |
81 | MultiPeriodDiscriminator:
82 | params:
83 | periods: [2, 3, 5, 7, 11]
84 | discriminator_params:
85 | in_channels: 1
86 | out_channels: 1
87 | kernel_sizes: [5, 3]
88 | channels: 32
89 | downsample_scales: [3, 3, 3, 3, 1]
90 | max_downsample_channels: 1024
91 | bias: true
92 | nonlinear_activation: "LeakyReLU"
93 | nonlinear_activation_params:
94 | negative_slope: 0.1
95 | use_spectral_norm: false
96 | optimizer:
97 | type: Adam
98 | params:
99 | lr: 2.0e-4
100 | betas: [0.5, 0.9]
101 | weight_decay: 0.0
102 | scheduler:
103 | type: MultiStepLR
104 | params:
105 | gamma: 0.5
106 | milestones:
107 | - 200000
108 | - 400000
109 | - 600000
110 | - 800000
111 |
112 | ####################################################
113 | # LOSS SETTING #
114 | ####################################################
115 | Loss:
116 | generator_adv_loss:
117 | enable: True
118 | params:
119 | average_by_discriminators: False
120 | weights: 1.0
121 |
122 | discriminator_adv_loss:
123 | enable: True
124 | params:
125 | average_by_discriminators: False
126 | weights: 1.0
127 |
128 | stft_loss:
129 | enable: False # Whether to use multi-resolution STFT loss.
130 |
131 | mel_loss:
132 | enable: True
133 | params:
134 | fs: 8000
135 | fft_size: 2048
136 | hop_size: 100
137 | win_length: 600
138 | window: "hann"
139 | num_mels: 80
140 | fmin: 0
141 | fmax: 4000 # Nyquist frequency at fs=8000; matches audio_config_8k.yaml
142 | log_base: null
143 | weights: 45.0
144 |
145 | subband_stft_loss:
146 | enable: False
147 | params:
148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
151 | window: "hann_window" # Window function for STFT-based loss
152 |
153 | feat_match_loss:
154 | enable: True
155 | params:
156 | average_by_discriminators: false
157 | average_by_layers: false
158 | weights: 2.0
159 |
160 |
161 | ###########################################################
162 | # DATA LOADER SETTING #
163 | ###########################################################
164 | batch_size: 16
165 | batch_max_steps: 6000 # Length of each audio in a batch. Make sure it is divisible by hop_size.
166 | pin_memory: True
167 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
168 | remove_short_samples: False
169 | allow_cache: True
170 |
171 | generator_grad_norm: -1
172 |
173 | discriminator_grad_norm: -1
174 |
175 | ###########################################################
176 | # INTERVAL SETTING #
177 | ###########################################################
178 | generator_train_start_steps: 1 # Number of steps to start to train generator.
179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
180 | train_max_steps: 2500000 # Number of training steps.
181 | save_interval_steps: 20000 # Interval steps to save checkpoint.
182 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
183 | log_interval_steps: 1000 # Interval steps to record the training log.
184 |
185 | ###########################################################
186 | # OTHER SETTING #
187 | ###########################################################
188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
189 |
--------------------------------------------------------------------------------
/kantts/configs/hifigan_v1_nsf_24k.yaml:
--------------------------------------------------------------------------------
1 | model_type: hifigan
2 | Model:
3 | ###########################################################
4 | # GENERATOR NETWORK ARCHITECTURE SETTING #
5 | ###########################################################
6 | Generator:
7 | params:
8 | in_channels: 80
9 | out_channels: 1
10 | channels: 512
11 | kernel_size: 7
12 | upsample_scales: [8, 5, 3, 2]
13 | upsample_kernal_sizes: [16, 10, 6, 4]
14 | resblock_kernel_sizes: [3, 7, 11]
15 | resblock_dilations:
16 | - [1, 3, 5]
17 | - [1, 3, 5]
18 | - [1, 3, 5]
19 | bias: true
20 | causal: true
21 | nonlinear_activation: "LeakyReLU"
22 | nonlinear_activation_params:
23 | negative_slope: 0.1
24 | use_weight_norm: true
25 | nsf_params:
26 | nb_harmonics: 7
27 | sampling_rate: 24000
28 | optimizer:
29 | type: Adam
30 | params:
31 | lr: 2.0e-4
32 | betas: [0.5, 0.9]
33 | weight_decay: 0.0
34 | scheduler:
35 | type: MultiStepLR
36 | params:
37 | gamma: 0.5
38 | milestones:
39 | - 200000
40 | - 400000
41 | - 600000
42 | - 800000
43 |
44 | ###########################################################
45 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
46 | ###########################################################
47 | MultiScaleDiscriminator:
48 | params:
49 | scales: 3
50 | downsample_pooling: "DWT"
51 | downsample_pooling_params:
52 | kernel_size: 4
53 | stride: 2
54 | padding: 2
55 | discriminator_params:
56 | in_channels: 1
57 | out_channels: 1
58 | kernel_sizes: [15, 41, 5, 3]
59 | channels: 128
60 | max_downsample_channels: 1024
61 | max_groups: 16
62 | bias: true
63 | downsample_scales: [4, 4, 4, 4, 1]
64 | nonlinear_activation: "LeakyReLU"
65 | nonlinear_activation_params:
66 | negative_slope: 0.1
67 | follow_official_norm: true
68 | optimizer:
69 | type: Adam
70 | params:
71 | lr: 2.0e-4
72 | betas: [0.5, 0.9]
73 | weight_decay: 0.0
74 | scheduler:
75 | type: MultiStepLR
76 | params:
77 | gamma: 0.5
78 | milestones:
79 | - 200000
80 | - 400000
81 | - 600000
82 | - 800000
83 |
84 | MultiPeriodDiscriminator:
85 | params:
86 | periods: [2, 3, 5, 7, 11]
87 | discriminator_params:
88 | in_channels: 1
89 | out_channels: 1
90 | kernel_sizes: [5, 3]
91 | channels: 32
92 | downsample_scales: [3, 3, 3, 3, 1]
93 | max_downsample_channels: 1024
94 | bias: true
95 | nonlinear_activation: "LeakyReLU"
96 | nonlinear_activation_params:
97 | negative_slope: 0.1
98 | use_spectral_norm: false
99 | optimizer:
100 | type: Adam
101 | params:
102 | lr: 2.0e-4
103 | betas: [0.5, 0.9]
104 | weight_decay: 0.0
105 | scheduler:
106 | type: MultiStepLR
107 | params:
108 | gamma: 0.5
109 | milestones:
110 | - 200000
111 | - 400000
112 | - 600000
113 | - 800000
114 |
115 | ####################################################
116 | # LOSS SETTING #
117 | ####################################################
118 | Loss:
119 | generator_adv_loss:
120 | enable: True
121 | params:
122 | average_by_discriminators: False
123 | weights: 1.0
124 |
125 | discriminator_adv_loss:
126 | enable: True
127 | params:
128 | average_by_discriminators: False
129 | weights: 1.0
130 |
131 | stft_loss:
132 | enable: False # Whether to use multi-resolution STFT loss.
133 |
134 | mel_loss:
135 | enable: True
136 | params:
137 | fs: 24000
138 | fft_size: 1024
139 | hop_size: 240
140 | win_length: 1024
141 | window: "hann"
142 | num_mels: 80
143 | fmin: 0
144 | fmax: 8000
145 | log_base: null
146 | weights: 45.0
147 |
148 | subband_stft_loss:
149 | enable: False
150 | params:
151 | fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
152 | hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
153 | win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
154 | window: "hann_window" # Window function for the STFT-based loss.
155 |
156 | feat_match_loss:
157 | enable: True
158 | params:
159 | average_by_discriminators: false
160 | average_by_layers: false
161 | weights: 2.0
162 |
163 |
164 | ###########################################################
165 | # DATA LOADER SETTING #
166 | ###########################################################
167 | batch_size: 16
168 | batch_max_steps: 9600 # Length of each audio clip in a batch. Must be divisible by hop_size.
169 | pin_memory: False
170 | num_workers: 2 # FIXME: setting this > 0 may hang on macOS
171 | remove_short_samples: False
172 | allow_cache: True
173 |
174 | generator_grad_norm: -1
175 |
176 | discriminator_grad_norm: -1
177 |
178 | ###########################################################
179 | # INTERVAL SETTING #
180 | ###########################################################
181 | generator_train_start_steps: 1 # Step at which generator training starts.
182 | discriminator_train_start_steps: 0 # Step at which discriminator training starts.
183 | train_max_steps: 2500000 # Number of training steps.
184 | save_interval_steps: 20000 # Interval steps to save checkpoint.
185 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
186 | log_interval_steps: 1000 # Interval steps to record the training log.
187 |
188 | ###########################################################
189 | # OTHER SETTING #
190 | ###########################################################
191 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
192 |
--------------------------------------------------------------------------------
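
A note on the Generator's upsample_scales in the config above: their product is the number of waveform samples produced per mel frame, so it must equal the vocoder hop size (240 at 24 kHz, matching the mel_loss hop_size), and batch_max_steps must be divisible by it, as the DATA LOADER comment says. A minimal sketch of that consistency check:

import math

upsample_scales = [8, 5, 3, 2]          # Generator params above
hop_size = math.prod(upsample_scales)   # 8 * 5 * 3 * 2 = 240 samples per frame
batch_max_steps = 9600                  # DATA LOADER setting above
assert batch_max_steps % hop_size == 0  # 9600 / 240 = 40 mel frames per training clip
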
/kantts/configs/sambert_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 |
54 | optimizer:
55 | type: Adam
56 | params:
57 | lr: 0.001
58 | betas: [0.9, 0.98]
59 | eps: 1.0e-9
60 | weight_decay: 0.0
61 | scheduler:
62 | type: NoamLR
63 | params:
64 | warmup_steps: 4000
65 |
66 | linguistic_unit:
67 | cleaners: english_cleaners
68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
69 | speaker_list: F7
70 | ####################################################
71 | # LOSS SETTING #
72 | ####################################################
73 | Loss:
74 | MelReconLoss:
75 | enable: True
76 | params:
77 | loss_type: mae
78 |
79 | ProsodyReconLoss:
80 | enable: True
81 | params:
82 | loss_type: mae
83 |
84 | ###########################################################
85 | # DATA LOADER SETTING #
86 | ###########################################################
87 | batch_size: 32
88 | pin_memory: False
89 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
90 | remove_short_samples: False
91 | allow_cache: True
92 | grad_norm: 1.0
93 |
94 | ###########################################################
95 | # INTERVAL SETTING #
96 | ###########################################################
97 | train_max_steps: 1000000 # Number of training steps.
98 | save_interval_steps: 20000 # Interval steps to save checkpoint.
99 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
100 | log_interval_steps: 1000 # Interval steps to record the training log.
101 |
102 | ###########################################################
103 | # OTHER SETTING #
104 | ###########################################################
105 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
106 |
--------------------------------------------------------------------------------
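
The sambert configs pair Adam with a NoamLR scheduler (warmup_steps: 4000), implemented in kantts/train/scheduler.py. As a hedged sketch, assuming it follows the standard Noam formula from "Attention Is All You Need" (the repo's version may normalize differently):

def noam_lr(base_lr, step, warmup_steps=4000):
    # Linear warmup for warmup_steps, then inverse-square-root decay;
    # normalized so the learning rate peaks at base_lr when step == warmup_steps.
    step = max(step, 1)
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
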
/kantts/configs/sambert_16k_MAS.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 |
53 | MAS: True
54 |
55 |
56 | optimizer:
57 | type: Adam
58 | params:
59 | lr: 0.001
60 | betas: [0.9, 0.98]
61 | eps: 1.0e-9
62 | weight_decay: 0.0
63 | scheduler:
64 | type: NoamLR
65 | params:
66 | warmup_steps: 4000
67 |
68 | linguistic_unit:
69 | cleaners: english_cleaners
70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
71 | speaker_list: F7
72 | ####################################################
73 | # LOSS SETTING #
74 | ####################################################
75 | Loss:
76 | MelReconLoss:
77 | enable: True
78 | params:
79 | loss_type: mae
80 |
81 | ProsodyReconLoss:
82 | enable: True
83 | params:
84 | loss_type: mae
85 |
86 | AttentionCTCLoss:
87 | enable: True
88 |
89 | AttentionBinarizationLoss:
90 | enable: True
91 | params:
92 | start_epoch: 0
93 | warmup_epoch: 100
94 |
95 |
96 | ###########################################################
97 | # DATA LOADER SETTING #
98 | ###########################################################
99 | batch_size: 32
100 | pin_memory: False
101 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
102 | remove_short_samples: False
103 | allow_cache: True
104 |
105 | grad_norm: 1.0
106 |
107 | ###########################################################
108 | # INTERVAL SETTING #
109 | ###########################################################
110 | train_max_steps: 1000000 # Number of training steps.
111 | save_interval_steps: 20000 # Interval steps to save checkpoint.
112 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
113 | log_interval_steps: 1000 # Interval steps to record the training log.
114 |
115 | ###########################################################
116 | # OTHER SETTING #
117 | ###########################################################
118 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
119 |
--------------------------------------------------------------------------------
/kantts/configs/sambert_16k_MAS_byte.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 |
53 | MAS: True
54 | using_byte: True
55 |
56 |
57 | optimizer:
58 | type: Adam
59 | params:
60 | lr: 0.001
61 | betas: [0.9, 0.98]
62 | eps: 1.0e-9
63 | weight_decay: 0.0
64 | scheduler:
65 | type: NoamLR
66 | params:
67 | warmup_steps: 4000
68 |
69 | linguistic_unit:
70 | cleaners: english_cleaners
71 | lfeat_type_list: byte_index,emo_category,speaker_category
72 | speaker_list: F7
73 | ####################################################
74 | # LOSS SETTING #
75 | ####################################################
76 | Loss:
77 | MelReconLoss:
78 | enable: True
79 | params:
80 | loss_type: mae
81 |
82 | ProsodyReconLoss:
83 | enable: True
84 | params:
85 | loss_type: mae
86 |
87 | AttentionCTCLoss:
88 | enable: True
89 |
90 | AttentionBinarizationLoss:
91 | enable: True
92 | params:
93 | start_epoch: 0
94 | warmup_epoch: 100
95 |
96 |
97 | ###########################################################
98 | # DATA LOADER SETTING #
99 | ###########################################################
100 | batch_size: 8
101 | pin_memory: False
102 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
103 | remove_short_samples: False
104 | allow_cache: True
105 |
106 | grad_norm: 1.0
107 |
108 | ###########################################################
109 | # INTERVAL SETTING #
110 | ###########################################################
111 | train_max_steps: 1000000 # Number of training steps.
112 | save_interval_steps: 20000 # Interval steps to save checkpoint.
113 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
114 | log_interval_steps: 1000 # Interval steps to record the training log.
115 |
116 | ###########################################################
117 | # OTHER SETTING #
118 | ###########################################################
119 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
120 |
--------------------------------------------------------------------------------
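
This byte variant sets using_byte: True and replaces the phonetic features with byte_index in lfeat_type_list, i.e. the front end consumes raw UTF-8 bytes instead of phoneme/tone symbols. A minimal sketch of byte tokenization under that assumption (the repo's actual mapping lives in kantts/utils/ling_unit):

text = "hello 你好"
byte_ids = list(text.encode("utf-8"))   # one id in [0, 255] per byte;
                                        # multi-byte characters expand to several ids
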
/kantts/configs/sambert_24k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 |
54 | optimizer:
55 | type: Adam
56 | params:
57 | lr: 0.001
58 | betas: [0.9, 0.98]
59 | eps: 1.0e-9
60 | weight_decay: 0.0
61 | scheduler:
62 | type: NoamLR
63 | params:
64 | warmup_steps: 4000
65 |
66 | linguistic_unit:
67 | cleaners: english_cleaners
68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
69 | speaker_list: F7
70 | ####################################################
71 | # LOSS SETTING #
72 | ####################################################
73 | Loss:
74 | MelReconLoss:
75 | enable: True
76 | params:
77 | loss_type: mae
78 |
79 | ProsodyReconLoss:
80 | enable: True
81 | params:
82 | loss_type: mae
83 |
84 |
85 | ###########################################################
86 | # DATA LOADER SETTING #
87 | ###########################################################
88 | batch_size: 32
89 | pin_memory: False
90 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
91 | remove_short_samples: False
92 | allow_cache: True
93 |
94 | grad_norm: 1.0
95 |
96 | ###########################################################
97 | # INTERVAL SETTING #
98 | ###########################################################
99 | train_max_steps: 1000000 # Number of training steps.
100 | save_interval_steps: 20000 # Interval steps to save checkpoint.
101 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
102 | log_interval_steps: 1000 # Interval steps to record the training log.
103 |
104 | ###########################################################
105 | # OTHER SETTING #
106 | ###########################################################
107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
108 |
--------------------------------------------------------------------------------
/kantts/configs/sambert_48k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 900
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 128
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 |
54 | optimizer:
55 | type: Adam
56 | params:
57 | lr: 0.001
58 | betas: [0.9, 0.98]
59 | eps: 1.0e-9
60 | weight_decay: 0.0
61 | scheduler:
62 | type: NoamLR
63 | params:
64 | warmup_steps: 4000
65 |
66 | linguistic_unit:
67 | cleaners: english_cleaners
68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
69 | speaker_list: F7
70 | ####################################################
71 | # LOSS SETTING #
72 | ####################################################
73 | Loss:
74 | MelReconLoss:
75 | enable: True
76 | params:
77 | loss_type: mae
78 |
79 | ProsodyReconLoss:
80 | enable: True
81 | params:
82 | loss_type: mae
83 |
84 |
85 | ###########################################################
86 | # DATA LOADER SETTING #
87 | ###########################################################
88 | batch_size: 32
89 | pin_memory: False
90 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
91 | remove_short_samples: False
92 | allow_cache: True
93 |
94 | grad_norm: 1.0
95 |
96 | ###########################################################
97 | # INTERVAL SETTING #
98 | ###########################################################
99 | train_max_steps: 1000000 # Number of training steps.
100 | save_interval_steps: 20000 # Interval steps to save checkpoint.
101 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
102 | log_interval_steps: 1000 # Interval steps to record the training log.
103 |
104 | ###########################################################
105 | # OTHER SETTING #
106 | ###########################################################
107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
108 |
--------------------------------------------------------------------------------
/kantts/configs/sambert_fp_8k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 | FP: True
54 |
55 | optimizer:
56 | type: Adam
57 | params:
58 | lr: 0.001
59 | betas: [0.9, 0.98]
60 | eps: 1.0e-9
61 | weight_decay: 0.0
62 | scheduler:
63 | type: NoamLR
64 | params:
65 | warmup_steps: 4000
66 |
67 | linguistic_unit:
68 | cleaners: english_cleaners
69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
70 | speaker_list: F7,F74,M7,FBYN,FRXL,xiaoyu
71 | ####################################################
72 | # LOSS SETTING #
73 | ####################################################
74 | Loss:
75 | MelReconLoss:
76 | enable: True
77 | params:
78 | loss_type: mae
79 |
80 | ProsodyReconLoss:
81 | enable: True
82 | params:
83 | loss_type: mae
84 |
85 | FpCELoss:
86 | enable: True
87 | params:
88 | loss_type: ce
89 | weight: [1,4,4,8]
90 |
91 | ###########################################################
92 | # DATA LOADER SETTING #
93 | ###########################################################
94 | batch_size: 16
95 | pin_memory: False
96 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
97 | remove_short_samples: False
98 | allow_cache: True
99 |
100 | grad_norm: 1.0
101 |
102 | ###########################################################
103 | # INTERVAL SETTING #
104 | ###########################################################
105 | train_max_steps: 1000000 # Number of training steps.
106 | save_interval_steps: 20000 # Interval steps to save checkpoint.
107 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
108 | log_interval_steps: 1000 # Interval steps to record the training log.
109 |
110 | ###########################################################
111 | # OTHER SETTING #
112 | ###########################################################
113 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
114 |
--------------------------------------------------------------------------------
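
The FpCELoss block above configures a class-weighted cross entropy for filled-pause prediction; the weights [1, 4, 4, 8] up-weight the rarer classes. A hedged PyTorch sketch of such a loss (the class-to-index mapping is an assumption, not taken from the repo):

import torch
import torch.nn as nn

# Hypothetical class order; only the weighting pattern mirrors the config above.
ce = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0, 4.0, 8.0]))
logits = torch.randn(16, 4)              # (batch, num_fp_classes)
targets = torch.randint(0, 4, (16,))
loss = ce(logits, targets)
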
/kantts/configs/sambert_nsf_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 82
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 | NSF: True
54 |
55 |
56 | optimizer:
57 | type: Adam
58 | params:
59 | lr: 0.001
60 | betas: [0.9, 0.98]
61 | eps: 1.0e-9
62 | weight_decay: 0.0
63 | scheduler:
64 | type: NoamLR
65 | params:
66 | warmup_steps: 4000
67 |
68 | linguistic_unit:
69 | cleaners: english_cleaners
70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
71 | speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
72 | ####################################################
73 | # LOSS SETTING #
74 | ####################################################
75 | Loss:
76 | MelReconLoss:
77 | enable: True
78 | params:
79 | loss_type: mae
80 |
81 | ProsodyReconLoss:
82 | enable: True
83 | params:
84 | loss_type: mae
85 |
86 | ###########################################################
87 | # DATA LOADER SETTING #
88 | ###########################################################
89 | batch_size: 32
90 | pin_memory: False
91 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
92 | remove_short_samples: False
93 | allow_cache: True
94 | grad_norm: 1.0
95 |
96 | ###########################################################
97 | # INTERVAL SETTING #
98 | ###########################################################
99 | train_max_steps: 10000000 # Number of training steps.
100 | save_interval_steps: 20000 # Interval steps to save checkpoint.
101 | eval_interval_steps: 2300500 # Interval steps to evaluate the network.
102 | log_interval_steps: 1000 # Interval steps to record the training log.
103 |
104 | ###########################################################
105 | # OTHER SETTING #
106 | ###########################################################
107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
108 |
--------------------------------------------------------------------------------
/kantts/configs/sambert_nsf_24k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 82
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 | NSF: True
54 |
55 | optimizer:
56 | type: Adam
57 | params:
58 | lr: 0.001
59 | betas: [0.9, 0.98]
60 | eps: 1.0e-9
61 | weight_decay: 0.0
62 | scheduler:
63 | type: NoamLR
64 | params:
65 | warmup_steps: 4000
66 |
67 | linguistic_unit:
68 | cleaners: english_cleaners
69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
70 | speaker_list: F7
71 | ####################################################
72 | # LOSS SETTING #
73 | ####################################################
74 | Loss:
75 | MelReconLoss:
76 | enable: True
77 | params:
78 | loss_type: mae
79 |
80 | ProsodyReconLoss:
81 | enable: True
82 | params:
83 | loss_type: mae
84 |
85 |
86 | ###########################################################
87 | # DATA LOADER SETTING #
88 | ###########################################################
89 | batch_size: 32
90 | pin_memory: False
91 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
92 | remove_short_samples: False
93 | allow_cache: True
94 |
95 | grad_norm: 1.0
96 |
97 | ###########################################################
98 | # INTERVAL SETTING #
99 | ###########################################################
100 | train_max_steps: 1000000 # Number of training steps.
101 | save_interval_steps: 20000 # Interval steps to save checkpoint.
102 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
103 | log_interval_steps: 1000 # Interval steps to record the training log.
104 |
105 | ###########################################################
106 | # OTHER SETTING #
107 | ###########################################################
108 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
109 |
--------------------------------------------------------------------------------
/kantts/configs/sambert_se_nsf_global_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 192
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 82
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 | NSF: True
54 | nsf_norm_type: global
55 | nsf_f0_global_minimum: 30.0
56 | nsf_f0_global_maximum: 730.0
57 | SE: True
58 |
59 |
60 | optimizer:
61 | type: Adam
62 | params:
63 | lr: 0.001
64 | betas: [0.9, 0.98]
65 | eps: 1.0e-9
66 | weight_decay: 0.0
67 | scheduler:
68 | type: NoamLR
69 | params:
70 | warmup_steps: 4000
71 |
72 | linguistic_unit:
73 | cleaners: english_cleaners
74 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
75 | speaker_list: F7
76 | ####################################################
77 | # LOSS SETTING #
78 | ####################################################
79 | Loss:
80 | MelReconLoss:
81 | enable: True
82 | params:
83 | loss_type: mae
84 |
85 | ProsodyReconLoss:
86 | enable: True
87 | params:
88 | loss_type: mae
89 |
90 | ###########################################################
91 | # DATA LOADER SETTING #
92 | ###########################################################
93 | batch_size: 32
94 | pin_memory: False
95 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
96 | remove_short_samples: False
97 | allow_cache: False
98 | grad_norm: 1.0
99 |
100 | ###########################################################
101 | # INTERVAL SETTING #
102 | ###########################################################
103 | train_max_steps: 1760101 # Number of training steps.
104 | save_interval_steps: 100 # Interval steps to save checkpoint.
105 | eval_interval_steps: 1000000000000 # Interval steps to evaluate the network.
106 | log_interval_steps: 10 # Interval steps to record the training log.
107 |
108 | ###########################################################
109 | # OTHER SETTING #
110 | ###########################################################
111 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
112 |
--------------------------------------------------------------------------------
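
With nsf_norm_type: global, the NSF pitch input is presumably normalized against the fixed range nsf_f0_global_minimum..nsf_f0_global_maximum rather than per-utterance statistics. A sketch of min-max normalization under that assumption (the exact transform the repo applies is not shown here):

def normalize_f0(f0_hz, f0_min=30.0, f0_max=730.0):
    # Clamp F0 in Hz to the global range from the config, then map into [0, 1].
    return (min(max(f0_hz, f0_min), f0_max) - f0_min) / (f0_max - f0_min)
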
/kantts/configs/sambert_sichuan_16k.yaml:
--------------------------------------------------------------------------------
1 | model_type: sambert
2 | Model:
3 | #########################################################
4 | # SAMBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsSAMBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | speaker_units: 32
21 | emotion_units: 32
22 |
23 | predictor_filter_size: 41
24 | predictor_fsmn_num_layers: 3
25 | predictor_num_memory_units: 128
26 | predictor_ffn_inner_dim: 256
27 | predictor_dropout: 0.1
28 | predictor_shift: 0
29 | predictor_lstm_units: 128
30 | dur_pred_prenet_units: [128, 128]
31 | dur_pred_lstm_units: 128
32 |
33 | decoder_prenet_units: [256, 256]
34 | decoder_num_layers: 12
35 | decoder_num_heads: 8
36 | decoder_num_units: 128
37 | decoder_ffn_inner_dim: 1024
38 | decoder_dropout: 0.1
39 | decoder_attention_dropout: 0.1
40 | decoder_relu_dropout: 0.1
41 |
42 | outputs_per_step: 3
43 | num_mels: 80
44 |
45 | postnet_filter_size: 41
46 | postnet_fsmn_num_layers: 4
47 | postnet_num_memory_units: 256
48 | postnet_ffn_inner_dim: 512
49 | postnet_dropout: 0.1
50 | postnet_shift: 17
51 | postnet_lstm_units: 128
52 | MAS: False
53 |
54 | optimizer:
55 | type: Adam
56 | params:
57 | lr: 0.001
58 | betas: [0.9, 0.98]
59 | eps: 1.0e-9
60 | weight_decay: 0.0
61 | scheduler:
62 | type: NoamLR
63 | params:
64 | warmup_steps: 4000
65 |
66 | linguistic_unit:
67 | cleaners: english_cleaners
68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
69 | speaker_list: xiaoyue
70 | language: Sichuan
71 | ####################################################
72 | # LOSS SETTING #
73 | ####################################################
74 | Loss:
75 | MelReconLoss:
76 | enable: True
77 | params:
78 | loss_type: mae
79 |
80 | ProsodyReconLoss:
81 | enable: True
82 | params:
83 | loss_type: mae
84 |
85 | ###########################################################
86 | # DATA LOADER SETTING #
87 | ###########################################################
88 | batch_size: 32
89 | pin_memory: False
90 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
91 | remove_short_samples: False
92 | allow_cache: True
93 | grad_norm: 1.0
94 |
95 | ###########################################################
96 | # INTERVAL SETTING #
97 | ###########################################################
98 | train_max_steps: 1000000 # Number of training steps.
99 | save_interval_steps: 20000 # Interval steps to save checkpoint.
100 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
101 | log_interval_steps: 1000 # Interval steps to record the training log.
102 |
103 | ###########################################################
104 | # OTHER SETTING #
105 | ###########################################################
106 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
107 |
--------------------------------------------------------------------------------
/kantts/configs/sybert.yaml:
--------------------------------------------------------------------------------
1 | model_type: sybert
2 | Model:
3 | #########################################################
4 | # TextsyBERT NETWORK ARCHITECTURE SETTING #
5 | #########################################################
6 | KanTtsTextsyBERT:
7 | params:
8 | max_len: 800
9 |
10 | embedding_dim: 512
11 | encoder_num_layers: 8
12 | encoder_num_heads: 8
13 | encoder_num_units: 128
14 | encoder_ffn_inner_dim: 1024
15 | encoder_dropout: 0.1
16 | encoder_attention_dropout: 0.1
17 | encoder_relu_dropout: 0.1
18 | encoder_projection_units: 32
19 |
20 | mask_ratio: 0.3
21 |
22 | optimizer:
23 | type: Adam
24 | params:
25 | lr: 0.0001
26 | betas: [0.9, 0.98]
27 | eps: 1.0e-9
28 | weight_decay: 0.0
29 | scheduler:
30 | type: NoamLR
31 | params:
32 | warmup_steps: 10000
33 |
34 | linguistic_unit:
35 | cleaners: english_cleaners
36 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
37 | speaker_list: F7
38 | ####################################################
39 | # LOSS SETTING #
40 | ####################################################
41 | Loss:
42 | SeqCELoss:
43 | enable: True
44 | params:
45 | loss_type: ce
46 |
47 | ###########################################################
48 | # DATA LOADER SETTING #
49 | ###########################################################
50 | batch_size: 32
51 | pin_memory: False
52 | num_workers: 4 # FIXME: setting this > 0 may hang on macOS
53 | remove_short_samples: False
54 | allow_cache: True
55 |
56 | grad_norm: 1.0
57 |
58 | ###########################################################
59 | # INTERVAL SETTING #
60 | ###########################################################
61 | train_max_steps: 1000000 # Number of training steps.
62 | save_interval_steps: 20000 # Interval steps to save checkpoint.
63 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
64 | log_interval_steps: 1000 # Interval steps to record the training log.
65 |
66 | ###########################################################
67 | # OTHER SETTING #
68 | ###########################################################
69 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
70 |
--------------------------------------------------------------------------------
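
sybert is a BERT-style masked-prediction model: mask_ratio: 0.3 means roughly 30% of input positions are masked, and SeqCELoss is a cross entropy over the predictions at those positions. A hedged sketch of random masking (the mask id and selection strategy are assumptions):

import torch

tokens = torch.randint(1, 100, (1, 20))       # hypothetical symbol ids
mask = torch.rand(tokens.shape) < 0.3         # ~30% of positions
masked_tokens = tokens.masked_fill(mask, 0)   # 0 as a hypothetical [MASK] id
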
/kantts/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/datasets/__init__.py
--------------------------------------------------------------------------------
/kantts/datasets/data_types.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.io import wavfile
3 |
4 |
5 | # TODO: add your own data types here as needed.
6 | DATA_TYPE_DICT = {
7 | "txt": {
8 | "load_func": np.loadtxt,
9 | "desc": "plain txt file or readable by np.loadtxt",
10 | },
11 | "wav": {
12 | "load_func": lambda x: wavfile.read(x)[1],
13 | "desc": "wav file or readable by soundfile.read",
14 | },
15 | "npy": {
16 | "load_func": np.load,
17 | "desc": "any .npy format file",
18 | },
19 | # Raw PCM data can be loaded as binary with an explicit dtype.
20 | "bin_f32": {
21 | "load_func": lambda x: np.fromfile(x, dtype=np.float32),
22 | "desc": "binary file with float32 format",
23 | },
24 | "bin_f64": {
25 | "load_func": lambda x: np.fromfile(x, dtype=np.float64),
26 | "desc": "binary file with float64 format",
27 | },
28 | "bin_i32": {
29 | "load_func": lambda x: np.fromfile(x, dtype=np.int32),
30 | "desc": "binary file with int32 format",
31 | },
32 | "bin_i16": {
33 | "load_func": lambda x: np.fromfile(x, dtype=np.int16),
34 | "desc": "binary file with int16 format",
35 | },
36 | }
37 |
--------------------------------------------------------------------------------
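
Each entry in DATA_TYPE_DICT maps a data-type key to a loader; consumers dispatch on the key and call load_func. An illustrative round trip (the file name is hypothetical, and the package is assumed importable):

import numpy as np
from kantts.datasets.data_types import DATA_TYPE_DICT

np.save("sample.npy", np.arange(10, dtype=np.float32))
data = DATA_TYPE_DICT["npy"]["load_func"]("sample.npy")   # ndarray of shape (10,)
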
/kantts/models/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn.parallel import DistributedDataParallel
3 | from kantts.models.hifigan.hifigan import ( # NOQA
4 | Generator, # NOQA
5 | MultiScaleDiscriminator, # NOQA
6 | MultiPeriodDiscriminator, # NOQA
7 | MultiSpecDiscriminator, # NOQA
8 | )
9 | import kantts
10 | import kantts.train.scheduler
11 | from kantts.models.sambert.kantts_sambert import KanTtsSAMBERT, KanTtsTextsyBERT # NOQA
12 | from kantts.utils.ling_unit.ling_unit import get_fpdict
13 | from .pqmf import PQMF
14 |
15 |
16 | def optimizer_builder(model_params, opt_name, opt_params):
17 | opt_cls = getattr(torch.optim, opt_name)
18 | optimizer = opt_cls(model_params, **opt_params)
19 | return optimizer
20 |
21 |
22 | def scheduler_builder(optimizer, sche_name, sche_params):
23 | scheduler_cls = getattr(kantts.train.scheduler, sche_name)
24 | scheduler = scheduler_cls(optimizer, **sche_params)
25 | return scheduler
26 |
27 |
28 | def hifigan_model_builder(config, device, rank, distributed):
29 | model = {}
30 | optimizer = {}
31 | scheduler = {}
32 | model["discriminator"] = {}
33 | optimizer["discriminator"] = {}
34 | scheduler["discriminator"] = {}
35 | for model_name in config["Model"].keys():
36 | if model_name == "Generator":
37 | params = config["Model"][model_name]["params"]
38 | model["generator"] = Generator(**params).to(device)
39 | optimizer["generator"] = optimizer_builder(
40 | model["generator"].parameters(),
41 | config["Model"][model_name]["optimizer"].get("type", "Adam"),
42 | config["Model"][model_name]["optimizer"].get("params", {}),
43 | )
44 | scheduler["generator"] = scheduler_builder(
45 | optimizer["generator"],
46 | config["Model"][model_name]["scheduler"].get("type", "StepLR"),
47 | config["Model"][model_name]["scheduler"].get("params", {}),
48 | )
49 | else:
50 | params = config["Model"][model_name]["params"]
51 | model["discriminator"][model_name] = globals()[model_name](**params).to(
52 | device
53 | )
54 | optimizer["discriminator"][model_name] = optimizer_builder(
55 | model["discriminator"][model_name].parameters(),
56 | config["Model"][model_name]["optimizer"].get("type", "Adam"),
57 | config["Model"][model_name]["optimizer"].get("params", {}),
58 | )
59 | scheduler["discriminator"][model_name] = scheduler_builder(
60 | optimizer["discriminator"][model_name],
61 | config["Model"][model_name]["scheduler"].get("type", "StepLR"),
62 | config["Model"][model_name]["scheduler"].get("params", {}),
63 | )
64 |
65 | out_channels = config["Model"]["Generator"]["params"]["out_channels"]
66 | if out_channels > 1:
67 | model["pqmf"] = PQMF(subbands=out_channels, **config.get("pqmf", {})).to(device)
68 |
69 | # FIXME: PyWavelets buffers lead to gradient errors in DDP training
70 | # Solution: https://github.com/pytorch/pytorch/issues/22095
71 | if distributed:
72 | model["generator"] = DistributedDataParallel(
73 | model["generator"],
74 | device_ids=[rank],
75 | output_device=rank,
76 | broadcast_buffers=False,
77 | )
78 | for model_name in model["discriminator"].keys():
79 | model["discriminator"][model_name] = DistributedDataParallel(
80 | model["discriminator"][model_name],
81 | device_ids=[rank],
82 | output_device=rank,
83 | broadcast_buffers=False,
84 | )
85 |
86 | return model, optimizer, scheduler
87 |
88 |
89 | # TODO: some parsing
90 | def sambert_model_builder(config, device, rank, distributed):
91 | model = {}
92 | optimizer = {}
93 | scheduler = {}
94 |
95 | model["KanTtsSAMBERT"] = KanTtsSAMBERT(
96 | config["Model"]["KanTtsSAMBERT"]["params"]
97 | ).to(device)
98 |
99 | fp_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("FP", False)
100 | if fp_enable:
101 | fp_dict = {
102 | k: torch.from_numpy(v).long().unsqueeze(0).to(device)
103 | for k, v in get_fpdict(config).items()
104 | }
105 | model["KanTtsSAMBERT"].fp_dict = fp_dict
106 |
107 | optimizer["KanTtsSAMBERT"] = optimizer_builder(
108 | model["KanTtsSAMBERT"].parameters(),
109 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("type", "Adam"),
110 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("params", {}),
111 | )
112 | scheduler["KanTtsSAMBERT"] = scheduler_builder(
113 | optimizer["KanTtsSAMBERT"],
114 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("type", "StepLR"),
115 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("params", {}),
116 | )
117 |
118 | if distributed:
119 | model["KanTtsSAMBERT"] = DistributedDataParallel(
120 | model["KanTtsSAMBERT"], device_ids=[rank], output_device=rank
121 | )
122 |
123 | return model, optimizer, scheduler
124 |
125 |
126 | def sybert_model_builder(config, device, rank, distributed):
127 | model = {}
128 | optimizer = {}
129 | scheduler = {}
130 |
131 | model["KanTtsTextsyBERT"] = KanTtsTextsyBERT(
132 | config["Model"]["KanTtsTextsyBERT"]["params"]
133 | ).to(device)
134 | optimizer["KanTtsTextsyBERT"] = optimizer_builder(
135 | model["KanTtsTextsyBERT"].parameters(),
136 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("type", "Adam"),
137 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("params", {}),
138 | )
139 | scheduler["KanTtsTextsyBERT"] = scheduler_builder(
140 | optimizer["KanTtsTextsyBERT"],
141 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("type", "StepLR"),
142 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("params", {}),
143 | )
144 |
145 | if distributed:
146 | model["KanTtsTextsyBERT"] = DistributedDataParallel(
147 | model["KanTtsTextsyBERT"], device_ids=[rank], output_device=rank
148 | )
149 |
150 | return model, optimizer, scheduler
151 |
152 |
153 | # TODO: implement a builder for specific model
154 | model_dict = {
155 | "hifigan": hifigan_model_builder,
156 | "sambert": sambert_model_builder,
157 | "sybert": sybert_model_builder,
158 | }
159 |
160 |
161 | def model_builder(config, device="cpu", rank=0, distributed=False):
162 | builder_func = model_dict[config["model_type"]]
163 | model, optimizer, scheduler = builder_func(config, device, rank, distributed)
164 | return model, optimizer, scheduler
165 |
--------------------------------------------------------------------------------
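
model_builder dispatches on model_type to the per-family builder and returns parallel dicts of models, optimizers, and schedulers. An illustrative call, assuming the package and its dependencies are installed (for hifigan configs the model dict instead holds "generator" plus a "discriminator" sub-dict):

import yaml
from kantts.models import model_builder

with open("kantts/configs/sambert_16k.yaml") as f:
    config = yaml.safe_load(f)

model, optimizer, scheduler = model_builder(config, device="cpu")
acoustic_model = model["KanTtsSAMBERT"]
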
/kantts/models/pqmf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Tomoki Hayashi
2 | # MIT License (https://opensource.org/licenses/MIT)
3 |
4 | """Pseudo QMF modules."""
5 |
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 |
10 | from scipy.signal import kaiser
11 |
12 |
13 | def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
14 | """Design prototype filter for PQMF.
15 |
16 | This method is based on `A Kaiser window approach for the design of prototype
17 | filters of cosine modulated filterbanks`_.
18 |
19 | Args:
20 | taps (int): The number of filter taps.
21 | cutoff_ratio (float): Cut-off frequency ratio.
22 | beta (float): Beta coefficient for kaiser window.
23 |
24 | Returns:
25 | ndarray: Impulse response of prototype filter (taps + 1,).
26 |
27 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
28 | https://ieeexplore.ieee.org/abstract/document/681427
29 |
30 | """
31 | # check the arguments are valid
32 | assert taps % 2 == 0, "The number of taps must be an even number."
33 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
34 |
35 | # make initial filter
36 | omega_c = np.pi * cutoff_ratio
37 | with np.errstate(invalid="ignore"):
38 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / (
39 | np.pi * (np.arange(taps + 1) - 0.5 * taps)
40 | )
41 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
42 |
43 | # apply kaiser window
44 | w = kaiser(taps + 1, beta)
45 | h = h_i * w
46 |
47 | return h
48 |
49 |
50 | class PQMF(torch.nn.Module):
51 | """PQMF module.
52 |
53 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
54 |
55 | .. _`Near-perfect-reconstruction pseudo-QMF banks`:
56 | https://ieeexplore.ieee.org/document/258122
57 |
58 | """
59 |
60 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0):
61 | """Initilize PQMF module.
62 |
63 | The cutoff_ratio and beta parameters are optimized for #subbands = 4.
64 | See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
65 |
66 | Args:
67 | subbands (int): The number of subbands.
68 | taps (int): The number of filter taps.
69 | cutoff_ratio (float): Cut-off frequency ratio.
70 | beta (float): Beta coefficient for kaiser window.
71 |
72 | """
73 | super(PQMF, self).__init__()
74 |
75 | # build analysis & synthesis filter coefficients
76 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
77 | h_analysis = np.zeros((subbands, len(h_proto)))
78 | h_synthesis = np.zeros((subbands, len(h_proto)))
79 | for k in range(subbands):
80 | h_analysis[k] = (
81 | 2
82 | * h_proto
83 | * np.cos(
84 | (2 * k + 1)
85 | * (np.pi / (2 * subbands))
86 | * (np.arange(taps + 1) - (taps / 2))
87 | + (-1) ** k * np.pi / 4
88 | )
89 | )
90 | h_synthesis[k] = (
91 | 2
92 | * h_proto
93 | * np.cos(
94 | (2 * k + 1)
95 | * (np.pi / (2 * subbands))
96 | * (np.arange(taps + 1) - (taps / 2))
97 | - (-1) ** k * np.pi / 4
98 | )
99 | )
100 |
101 | # convert to tensor
102 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
103 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
104 |
105 | # register coefficients as buffers
106 | self.register_buffer("analysis_filter", analysis_filter)
107 | self.register_buffer("synthesis_filter", synthesis_filter)
108 |
109 | # filter for downsampling & upsampling
110 | updown_filter = torch.zeros((subbands, subbands, subbands)).float()
111 | for k in range(subbands):
112 | updown_filter[k, k, 0] = 1.0
113 | self.register_buffer("updown_filter", updown_filter)
114 | self.subbands = subbands
115 |
116 | # keep padding info
117 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
118 |
119 | def analysis(self, x):
120 | """Analysis with PQMF.
121 |
122 | Args:
123 | x (Tensor): Input tensor (B, 1, T).
124 |
125 | Returns:
126 | Tensor: Output tensor (B, subbands, T // subbands).
127 |
128 | """
129 | x = F.conv1d(self.pad_fn(x), self.analysis_filter)
130 | return F.conv1d(x, self.updown_filter, stride=self.subbands)
131 |
132 | def synthesis(self, x):
133 | """Synthesis with PQMF.
134 |
135 | Args:
136 | x (Tensor): Input tensor (B, subbands, T // subbands).
137 |
138 | Returns:
139 | Tensor: Output tensor (B, 1, T).
140 |
141 | """
142 | # NOTE(kan-bayashi): Power will be decreased, so multiply by the number of subbands here.
143 | # Not sure this is the correct way, it is better to check again.
144 | # TODO(kan-bayashi): Understand the reconstruction procedure
145 | x = F.conv_transpose1d(
146 | x, self.updown_filter * self.subbands, stride=self.subbands
147 | )
148 | return F.conv1d(self.pad_fn(x), self.synthesis_filter)
149 |
--------------------------------------------------------------------------------
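
A quick round-trip sketch for the module above, assuming the repo's pinned SciPy (scipy.signal.kaiser was removed from newer SciPy releases in favor of scipy.signal.windows.kaiser). Reconstruction is near-perfect by design, not bit-exact:

import torch
from kantts.models.pqmf import PQMF

pqmf = PQMF(subbands=4)
x = torch.randn(1, 1, 2400)        # (B, 1, T); T divisible by subbands
bands = pqmf.analysis(x)           # (1, 4, 600)
x_hat = pqmf.synthesis(bands)      # (1, 1, 2400), close to x up to filter error
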
/kantts/models/sambert/adaptors.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from kantts.models.sambert.fsmn import FsmnEncoderV2
6 | from kantts.models.sambert import Prenet
7 |
8 |
9 | class LengthRegulator(nn.Module):
10 | def __init__(self, r=1):
11 | super(LengthRegulator, self).__init__()
12 |
13 | self.r = r
14 |
15 | def forward(self, inputs, durations, masks=None):
16 | reps = (durations + 0.5).long()
17 | output_lens = reps.sum(dim=1)
18 | max_len = output_lens.max()
19 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[
20 | :, None, :
21 | ]
22 | range_ = torch.arange(max_len).to(inputs.device)[None, :, None]
23 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)
24 | mult = mult.float()
25 | out = torch.matmul(mult, inputs)
26 |
27 | if masks is not None:
28 | out = out.masked_fill(masks.unsqueeze(-1), 0.0)
29 |
30 | seq_len = out.size(1)
31 | padding = self.r - int(seq_len) % self.r
32 | if padding < self.r:
33 | out = F.pad(out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0)
34 | out = out.transpose(1, 2)
35 |
36 | return out, output_lens
37 |
38 |
39 | class VarRnnARPredictor(nn.Module):
40 | def __init__(self, cond_units, prenet_units, rnn_units):
41 | super(VarRnnARPredictor, self).__init__()
42 |
43 | self.prenet = Prenet(1, prenet_units)
44 | self.lstm = nn.LSTM(
45 | prenet_units[-1] + cond_units,
46 | rnn_units,
47 | num_layers=2,
48 | batch_first=True,
49 | bidirectional=False,
50 | )
51 | self.fc = nn.Linear(rnn_units, 1)
52 |
53 | def forward(self, inputs, cond, h=None, masks=None):
54 | x = torch.cat([self.prenet(inputs), cond], dim=-1)
55 | # The input could also be a packed variable-length sequence; we skip that
56 | # here for simplicity, since the mask and the unidirectional LSTM make it unnecessary.
57 | x, h_new = self.lstm(x, h)
58 |
59 | x = self.fc(x).squeeze(-1)
60 | x = F.relu(x)
61 |
62 | if masks is not None:
63 | x = x.masked_fill(masks, 0.0)
64 |
65 | return x, h_new
66 |
67 | def infer(self, cond, masks=None):
68 | batch_size, length = cond.size(0), cond.size(1)
69 |
70 | output = []
71 | x = torch.zeros((batch_size, 1)).to(cond.device)
72 | h = None
73 |
74 | for i in range(length):
75 | x, h = self.forward(x.unsqueeze(1), cond[:, i : i + 1, :], h=h)
76 | output.append(x)
77 |
78 | output = torch.cat(output, dim=-1)
79 |
80 | if masks is not None:
81 | output = output.masked_fill(masks, 0.0)
82 |
83 | return output
84 |
85 |
86 | class VarFsmnRnnNARPredictor(nn.Module):
87 | def __init__(
88 | self,
89 | in_dim,
90 | filter_size,
91 | fsmn_num_layers,
92 | num_memory_units,
93 | ffn_inner_dim,
94 | dropout,
95 | shift,
96 | lstm_units,
97 | ):
98 | super(VarFsmnRnnNARPredictor, self).__init__()
99 |
100 | self.fsmn = FsmnEncoderV2(
101 | filter_size,
102 | fsmn_num_layers,
103 | in_dim,
104 | num_memory_units,
105 | ffn_inner_dim,
106 | dropout,
107 | shift,
108 | )
109 | self.blstm = nn.LSTM(
110 | num_memory_units,
111 | lstm_units,
112 | num_layers=1,
113 | batch_first=True,
114 | bidirectional=True,
115 | )
116 | self.fc = nn.Linear(2 * lstm_units, 1)
117 |
118 | def forward(self, inputs, masks=None):
119 | input_lengths = None
120 | if masks is not None:
121 | input_lengths = torch.sum((~masks).float(), dim=1).long()
122 |
123 | x = self.fsmn(inputs, masks)
124 |
125 | if input_lengths is not None:
126 | x = nn.utils.rnn.pack_padded_sequence(
127 | x, input_lengths.tolist(), batch_first=True, enforce_sorted=False
128 | )
129 | x, _ = self.blstm(x)
130 | x, _ = nn.utils.rnn.pad_packed_sequence(
131 | x, batch_first=True, total_length=inputs.size(1)
132 | )
133 | else:
134 | x, _ = self.blstm(x)
135 |
136 | x = self.fc(x).squeeze(-1)
137 |
138 | if masks is not None:
139 | x = x.masked_fill(masks, 0.0)
140 |
141 | return x
142 |
--------------------------------------------------------------------------------
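
An illustrative use of LengthRegulator: each encoder frame is repeated according to its rounded duration, and with r matching outputs_per_step in the configs above the expanded sequence is right-padded to a multiple of r:

import torch
from kantts.models.sambert.adaptors import LengthRegulator

length_regulator = LengthRegulator(r=3)
enc = torch.randn(1, 4, 8)                         # (B, text_len, dim)
durations = torch.tensor([[2.0, 1.0, 3.0, 1.0]])   # frames per symbol
out, out_lens = length_regulator(enc, durations)   # out_lens == [7]; out padded
                                                   # from 7 to 9 frames (multiple of 3)
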
/kantts/models/sambert/alignment.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numba as nb
3 |
4 |
5 | @nb.jit(nopython=True)
6 | def mas(attn_map, width=1):
7 | # assumes mel x text
8 | opt = np.zeros_like(attn_map)
9 | attn_map = np.log(attn_map)
10 | attn_map[0, 1:] = -np.inf
11 | log_p = np.zeros_like(attn_map)
12 | log_p[0, :] = attn_map[0, :]
13 | prev_ind = np.zeros_like(attn_map, dtype=np.int64)
14 | for i in range(1, attn_map.shape[0]):
15 | for j in range(attn_map.shape[1]): # for each text dim
16 | prev_j = np.arange(max(0, j - width), j + 1)
17 | prev_log = np.array([log_p[i - 1, prev_idx] for prev_idx in prev_j])
18 |
19 | ind = np.argmax(prev_log)
20 | log_p[i, j] = attn_map[i, j] + prev_log[ind]
21 | prev_ind[i, j] = prev_j[ind]
22 |
23 | # now backtrack
24 | curr_text_idx = attn_map.shape[1] - 1
25 | for i in range(attn_map.shape[0] - 1, -1, -1):
26 | opt[i, curr_text_idx] = 1
27 | curr_text_idx = prev_ind[i, curr_text_idx]
28 | opt[0, curr_text_idx] = 1
29 | return opt
30 |
31 |
32 | @nb.jit(nopython=True)
33 | def mas_width1(attn_map):
34 | """mas with hardcoded width=1"""
35 | # assumes mel x text
36 | opt = np.zeros_like(attn_map)
37 | attn_map = np.log(attn_map)
38 | attn_map[0, 1:] = -np.inf
39 | log_p = np.zeros_like(attn_map)
40 | log_p[0, :] = attn_map[0, :]
41 | prev_ind = np.zeros_like(attn_map, dtype=np.int64)
42 | for i in range(1, attn_map.shape[0]):
43 | for j in range(attn_map.shape[1]): # for each text dim
44 | prev_log = log_p[i - 1, j]
45 | prev_j = j
46 |
47 | if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]:
48 | prev_log = log_p[i - 1, j - 1]
49 | prev_j = j - 1
50 |
51 | log_p[i, j] = attn_map[i, j] + prev_log
52 | prev_ind[i, j] = prev_j
53 |
54 | # now backtrack
55 | curr_text_idx = attn_map.shape[1] - 1
56 | for i in range(attn_map.shape[0] - 1, -1, -1):
57 | opt[i, curr_text_idx] = 1
58 | curr_text_idx = prev_ind[i, curr_text_idx]
59 | opt[0, curr_text_idx] = 1
60 | return opt
61 |
62 |
63 | @nb.jit(nopython=True, parallel=True)
64 | def b_mas(b_attn_map, in_lens, out_lens, width=1):
65 | assert width == 1
66 | attn_out = np.zeros_like(b_attn_map)
67 |
68 | for b in nb.prange(b_attn_map.shape[0]):
69 | out = mas_width1(b_attn_map[b, 0, : out_lens[b], : in_lens[b]])
70 | attn_out[b, 0, : out_lens[b], : in_lens[b]] = out
71 | return attn_out
72 |
--------------------------------------------------------------------------------
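
A quick sanity check of the search above (toy numbers, not repo data): feed a positive soft-attention map and get back a binary monotonic path that assigns exactly one text token to every mel frame.

import numpy as np
from kantts.models.sambert.alignment import mas_width1, b_mas

soft = np.random.rand(40, 12) + 1e-3    # (mel, text); must be > 0 for the log
hard = mas_width1(soft)                 # binary (40, 12) monotonic path
assert (hard.sum(axis=1) == 1).all()    # one text index per mel frame

batched = soft[None, None]              # (B, 1, mel, text) as b_mas expects
hard_b = b_mas(batched, np.array([12]), np.array([40]))
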
/kantts/models/sambert/attention.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 |
5 |
6 | class ConvNorm(torch.nn.Module):
7 | def __init__(
8 | self,
9 | in_channels,
10 | out_channels,
11 | kernel_size=1,
12 | stride=1,
13 | padding=None,
14 | dilation=1,
15 | bias=True,
16 | w_init_gain="linear",
17 | ):
18 | super(ConvNorm, self).__init__()
19 | if padding is None:
20 | assert kernel_size % 2 == 1
21 | padding = int(dilation * (kernel_size - 1) / 2)
22 |
23 | self.conv = torch.nn.Conv1d(
24 | in_channels,
25 | out_channels,
26 | kernel_size=kernel_size,
27 | stride=stride,
28 | padding=padding,
29 | dilation=dilation,
30 | bias=bias,
31 | )
32 |
33 | torch.nn.init.xavier_uniform_(
34 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)
35 | )
36 |
37 | def forward(self, signal):
38 | conv_signal = self.conv(signal)
39 | return conv_signal
40 |
41 |
42 | class ConvAttention(torch.nn.Module):
43 | def __init__(
44 | self,
45 | n_mel_channels=80,
46 | n_text_channels=512,
47 | n_att_channels=80,
48 | temperature=1.0,
49 | use_query_proj=True,
50 | ):
51 | super(ConvAttention, self).__init__()
52 | self.temperature = temperature
53 | self.att_scaling_factor = np.sqrt(n_att_channels)
54 | self.softmax = torch.nn.Softmax(dim=3)
55 | self.log_softmax = torch.nn.LogSoftmax(dim=3)
56 | self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1)
57 | self.use_query_proj = bool(use_query_proj)
58 |
59 | self.key_proj = nn.Sequential(
60 | ConvNorm(
61 | n_text_channels,
62 | n_text_channels * 2,
63 | kernel_size=3,
64 | bias=True,
65 | w_init_gain="relu",
66 | ),
67 | torch.nn.ReLU(),
68 | ConvNorm(n_text_channels * 2, n_att_channels, kernel_size=1, bias=True),
69 | )
70 |
71 | self.query_proj = nn.Sequential(
72 | ConvNorm(
73 | n_mel_channels,
74 | n_mel_channels * 2,
75 | kernel_size=3,
76 | bias=True,
77 | w_init_gain="relu",
78 | ),
79 | torch.nn.ReLU(),
80 | ConvNorm(n_mel_channels * 2, n_mel_channels, kernel_size=1, bias=True),
81 | torch.nn.ReLU(),
82 | ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True),
83 | )
84 |
85 | def forward(self, queries, keys, mask=None, attn_prior=None):
86 | """Attention mechanism for flowtron parallel
87 |         Unlike in Flowtron, there are no restrictions such as causality,
88 |         since this module is only needed during training.
89 |
90 | Args:
91 | queries (torch.tensor): B x C x T1 tensor
92 | (probably going to be mel data)
93 | keys (torch.tensor): B x C2 x T2 tensor (text data)
94 | mask (torch.tensor): uint8 binary mask for variable length entries
95 | (should be in the T2 domain)
96 | Output:
97 | attn (torch.tensor): B x 1 x T1 x T2 attention mask.
98 | Final dim T2 should sum to 1
99 | """
100 | keys_enc = self.key_proj(keys) # B x n_attn_dims x T2
101 |
102 |         # Beware: this only works because query_dim = attn_dim = n_mel_channels
103 | if self.use_query_proj:
104 | queries_enc = self.query_proj(queries)
105 | else:
106 | queries_enc = queries
107 |
108 |         # There are different ways of computing attn;
109 |         # this one uses isotropic Gaussians (one per phoneme):
110 |         # simplistic Gaussian isotropic attention.
111 |
112 | # B x n_attn_dims x T1 x T2
113 | attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2
114 | # compute log likelihood from a gaussian
115 | attn = -0.0005 * attn.sum(1, keepdim=True)
116 | if attn_prior is not None:
117 | attn = self.log_softmax(attn) + torch.log(attn_prior[:, None] + 1e-8)
118 |
119 | attn_logprob = attn.clone()
120 |
121 | if mask is not None:
122 | attn.data.masked_fill_(mask.unsqueeze(1).unsqueeze(1), -float("inf"))
123 |
124 | attn = self.softmax(attn) # Softmax along T2
125 | return attn, attn_logprob
126 |
--------------------------------------------------------------------------------
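
A shape-level sketch of the aligner above; the dimensions follow the docstring, and the dummy tensors are purely illustrative.

import torch
from kantts.models.sambert.attention import ConvAttention

aligner = ConvAttention(n_mel_channels=80, n_text_channels=512)
mels = torch.randn(2, 80, 200)                    # B x n_mel_channels x T1
text = torch.randn(2, 512, 50)                    # B x n_text_channels x T2
text_mask = torch.zeros(2, 50, dtype=torch.bool)  # True marks padded tokens
attn, attn_logprob = aligner(mels, text, mask=text_mask)
# attn: (2, 1, 200, 50); each row sums to 1 over the text axis
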
/kantts/models/sambert/fsmn.py:
--------------------------------------------------------------------------------
1 | """
2 | FSMN Pytorch Version
3 | """
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | class FeedForwardNet(nn.Module):
9 | """ A two-feed-forward-layer module """
10 |
11 | def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1):
12 | super().__init__()
13 |
14 | # Use Conv1D
15 | # position-wise
16 | self.w_1 = nn.Conv1d(
17 | d_in,
18 | d_hid,
19 | kernel_size=kernel_size[0],
20 | padding=(kernel_size[0] - 1) // 2,
21 | )
22 | # position-wise
23 | self.w_2 = nn.Conv1d(
24 | d_hid,
25 | d_out,
26 | kernel_size=kernel_size[1],
27 | padding=(kernel_size[1] - 1) // 2,
28 | bias=False,
29 | )
30 |
31 | self.dropout = nn.Dropout(dropout)
32 |
33 | def forward(self, x):
34 | output = x.transpose(1, 2)
35 | output = F.relu(self.w_1(output))
36 | output = self.dropout(output)
37 | output = self.w_2(output)
38 | output = output.transpose(1, 2)
39 |
40 | return output
41 |
42 |
43 | class MemoryBlockV2(nn.Module):
44 | def __init__(self, d, filter_size, shift, dropout=0.0):
45 | super(MemoryBlockV2, self).__init__()
46 |
47 | left_padding = int(round((filter_size - 1) / 2))
48 | right_padding = int((filter_size - 1) / 2)
49 | if shift > 0:
50 | left_padding += shift
51 | right_padding -= shift
52 |
53 | self.lp, self.rp = left_padding, right_padding
54 |
55 | self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False)
56 | self.dropout = nn.Dropout(dropout)
57 |
58 | def forward(self, input, mask=None):
59 | if mask is not None:
60 | input = input.masked_fill(mask.unsqueeze(-1), 0)
61 |
62 | x = F.pad(input, (0, 0, self.lp, self.rp, 0, 0), mode="constant", value=0.0)
63 | output = (
64 | self.conv_dw(x.contiguous().transpose(1, 2)).contiguous().transpose(1, 2)
65 | )
66 | output += input
67 | output = self.dropout(output)
68 |
69 | if mask is not None:
70 | output = output.masked_fill(mask.unsqueeze(-1), 0)
71 |
72 | return output
73 |
74 |
75 | class FsmnEncoderV2(nn.Module):
76 | def __init__(
77 | self,
78 | filter_size,
79 | fsmn_num_layers,
80 | input_dim,
81 | num_memory_units,
82 | ffn_inner_dim,
83 | dropout=0.0,
84 | shift=0,
85 | ):
86 | super(FsmnEncoderV2, self).__init__()
87 |
88 | self.filter_size = filter_size
89 | self.fsmn_num_layers = fsmn_num_layers
90 | self.num_memory_units = num_memory_units
91 | self.ffn_inner_dim = ffn_inner_dim
92 | self.dropout = dropout
93 | self.shift = shift
94 | if not isinstance(shift, list):
95 | self.shift = [shift for _ in range(self.fsmn_num_layers)]
96 |
97 | self.ffn_lst = nn.ModuleList()
98 | self.ffn_lst.append(
99 | FeedForwardNet(input_dim, ffn_inner_dim, num_memory_units, dropout=dropout)
100 | )
101 | for i in range(1, fsmn_num_layers):
102 | self.ffn_lst.append(
103 | FeedForwardNet(
104 | num_memory_units, ffn_inner_dim, num_memory_units, dropout=dropout
105 | )
106 | )
107 |
108 | self.memory_block_lst = nn.ModuleList()
109 | for i in range(fsmn_num_layers):
110 | self.memory_block_lst.append(
111 | MemoryBlockV2(num_memory_units, filter_size, self.shift[i], dropout)
112 | )
113 |
114 | def forward(self, input, mask=None):
115 | x = F.dropout(input, self.dropout, self.training)
116 | for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst):
117 | context = ffn(x)
118 | memory = memory_block(context, mask)
119 | memory = F.dropout(memory, self.dropout, self.training)
120 | if memory.size(-1) == x.size(-1):
121 | memory += x
122 | x = memory
123 |
124 | return x
125 |
--------------------------------------------------------------------------------
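
The encoder above preserves sequence length, so layers can be stacked freely; a positive `shift` moves each memory block's window toward the past (more look-back, less look-ahead). A minimal sketch with assumed sizes:

import torch
from kantts.models.sambert.fsmn import FsmnEncoderV2

encoder = FsmnEncoderV2(
    filter_size=11, fsmn_num_layers=4, input_dim=80,
    num_memory_units=256, ffn_inner_dim=1024, dropout=0.1, shift=0,
)
x = torch.randn(2, 100, 80)                   # (batch, time, dim)
mask = torch.zeros(2, 100, dtype=torch.bool)  # True marks padding
y = encoder(x, mask)                          # (2, 100, 256)
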
/kantts/models/sambert/positions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | import numpy as np
6 |
7 |
8 | class SinusoidalPositionEncoder(nn.Module):
9 | def __init__(self, max_len, depth):
10 | super(SinusoidalPositionEncoder, self).__init__()
11 |
12 | self.max_len = max_len
13 | self.depth = depth
14 | self.position_enc = nn.Parameter(
15 | self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
16 | requires_grad=False,
17 | )
18 |
19 | def forward(self, input):
20 | bz_in, len_in, _ = input.size()
21 | if len_in > self.max_len:
22 | self.max_len = len_in
23 | self.position_enc.data = (
24 | self.get_sinusoid_encoding_table(self.max_len, self.depth)
25 | .unsqueeze(0)
26 | .to(input.device)
27 | )
28 |
29 | output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
30 |
31 | return output
32 |
33 | @staticmethod
34 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
35 | """ Sinusoid position encoding table """
36 |
37 | def cal_angle(position, hid_idx):
38 | return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))
39 |
40 | def get_posi_angle_vec(position):
41 | return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]
42 |
43 | scaled_time_table = np.array(
44 | [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]
45 | )
46 |
47 | sinusoid_table = np.zeros((n_position, d_hid))
48 | sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table)
49 | sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table)
50 |
51 | if padding_idx is not None:
52 | # zero vector for padding dimension
53 | sinusoid_table[padding_idx] = 0.0
54 |
55 | return torch.FloatTensor(sinusoid_table)
56 |
57 |
58 | class DurSinusoidalPositionEncoder(nn.Module):
59 | def __init__(self, depth, outputs_per_step):
60 | super(DurSinusoidalPositionEncoder, self).__init__()
61 |
62 | self.depth = depth
63 | self.outputs_per_step = outputs_per_step
64 |
65 | inv_timescales = [
66 | np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth)
67 | ]
68 | self.inv_timescales = nn.Parameter(
69 | torch.FloatTensor(inv_timescales), requires_grad=False
70 | )
71 |
72 | def forward(self, durations, masks=None):
73 | reps = (durations + 0.5).long()
74 | output_lens = reps.sum(dim=1)
75 | max_len = output_lens.max()
76 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[
77 | :, None, :
78 | ]
79 | range_ = torch.arange(max_len).to(durations.device)[None, :, None]
80 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)
81 | mult = mult.float()
82 | offsets = torch.matmul(mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)).squeeze(-1)
83 | dur_pos = range_[:, :, 0] - offsets + 1
84 |
85 | if masks is not None:
86 | assert masks.size(1) == dur_pos.size(1)
87 | dur_pos = dur_pos.masked_fill(masks, 0.0)
88 |
89 | seq_len = dur_pos.size(1)
90 | padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
91 | if padding < self.outputs_per_step:
92 | dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)
93 |
94 | position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :]
95 | position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2])
96 | position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2])
97 |
98 | return position_embedding
99 |
--------------------------------------------------------------------------------
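
DurSinusoidalPositionEncoder gives every expanded frame its 1-based index within its own phone (via the cumulative-sum bookkeeping above) and then applies a sinusoid to that index, so position information restarts at each phone boundary. A worked toy example:

import torch
from kantts.models.sambert.positions import DurSinusoidalPositionEncoder

encoder = DurSinusoidalPositionEncoder(depth=128, outputs_per_step=1)
durations = torch.tensor([[2.0, 3.0, 1.0]])  # frames per phone
pe = encoder(durations)                      # (1, 6, 128)
# internally dur_pos = [1, 2, 1, 2, 3, 1]: the index restarts at each phone
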
/kantts/models/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from distutils.version import LooseVersion
3 |
4 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
5 |
6 |
7 | def init_weights(m, mean=0.0, std=0.01):
8 | classname = m.__class__.__name__
9 | if classname.find("Conv") != -1:
10 | m.weight.data.normal_(mean, std)
11 |
12 |
13 | def get_mask_from_lengths(lengths, max_len=None):
14 | batch_size = lengths.shape[0]
15 | if max_len is None:
16 | max_len = torch.max(lengths).item()
17 |
18 | ids = (
19 | torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device)
20 | )
21 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
22 |
23 | return mask
24 |
--------------------------------------------------------------------------------
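
get_mask_from_lengths defines the padding convention used by all the models above: True marks padding. For example:

import torch
from kantts.models.utils import get_mask_from_lengths

mask = get_mask_from_lengths(torch.tensor([3, 5]))
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False]])
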
/kantts/preprocess/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/audio_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/audio_processor/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/core/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/fp_processor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import random
4 |
5 |
6 | def is_fp_line(line):
7 | fp_category_list = ["FP", "I", "N", "Q"]
8 | elements = line.strip().split(" ")
9 | res = True
10 | for ele in elements:
11 | if ele not in fp_category_list:
12 | res = False
13 | break
14 | return res
15 |
16 |
17 | class FpProcessor:
18 | def __init__(self):
19 | # TODO: Add more audio processing methods.
20 | self.res = []
21 |
22 | def is_fp_line(line):
23 | fp_category_list = ["FP", "I", "N", "Q"]
24 | elements = line.strip().split(" ")
25 | res = True
26 | for ele in elements:
27 | if ele not in fp_category_list:
28 | res = False
29 | break
30 | return res
31 |
32 | # TODO: adjust idx judgment rule
33 | def addfp(self, voice_output_dir, prosody, raw_metafile_lines):
34 |
35 | fp_category_list = ["FP", "I", "N"]
36 |
37 | f = open(prosody)
38 | prosody_lines = f.readlines()
39 | f.close()
40 |
41 | idx = ""
42 | fp = ""
43 | fp_label_dict = {}
44 | i = 0
45 | while i < len(prosody_lines):
46 | if len(prosody_lines[i].strip().split("\t")) == 2:
47 | idx = prosody_lines[i].strip().split("\t")[0]
48 | i += 1
49 | else:
50 | fp_enable = is_fp_line(prosody_lines[i])
51 | if fp_enable:
52 | fp = prosody_lines[i].strip().split("\t")[0].split(" ")
53 | for label in fp:
54 | if label not in fp_category_list:
55 | logging.warning("fp label not in fp_category_list")
56 | break
57 | i += 4
58 | else:
59 | fp = [
60 | "N"
61 | for _ in range(
62 | len(
63 | prosody_lines[i]
64 | .strip()
65 | .split("\t")[0]
66 | .replace("/ ", "")
67 | .replace(". ", "")
68 | .split(" ")
69 | )
70 | )
71 | ]
72 | i += 1
73 | fp_label_dict[idx] = fp
74 |
75 | fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
76 | f_out = open(fpadd_metafile, "w")
77 | for line in raw_metafile_lines:
78 | tokens = line.strip().split("\t")
79 | if len(tokens) == 2:
80 | uttname = tokens[0]
81 | symbol_sequences = tokens[1].split(" ")
82 |
83 | error_flag = False
84 | idx = 0
85 | out_str = uttname + "\t"
86 |
87 | for this_symbol_sequence in symbol_sequences:
88 | emotion = this_symbol_sequence.split("$")[4]
89 | this_symbol_sequence = this_symbol_sequence.replace(
90 | emotion, "emotion_neutral"
91 | )
92 |
93 | if idx < len(fp_label_dict[uttname]):
94 | if fp_label_dict[uttname][idx] == "FP":
95 | if "none" not in this_symbol_sequence:
96 | this_symbol_sequence = this_symbol_sequence.replace(
97 | "emotion_neutral", "emotion_disgust"
98 | )
99 | syllable_label = this_symbol_sequence.split("$")[2]
100 | if syllable_label == "s_both" or syllable_label == "s_end":
101 | idx += 1
102 | elif idx > len(fp_label_dict[uttname]):
103 |                     logging.warning(uttname + " does not match")
104 | error_flag = True
105 | out_str = out_str + this_symbol_sequence + " "
106 |
107 | # if idx != len(fp_label_dict[uttname]):
108 | # logging.warning(
109 | # "{} length mismatch, length: {} ".format(
110 | # idx, len(fp_label_dict[uttname])
111 | # )
112 | # )
113 |
114 | if not error_flag:
115 | f_out.write(out_str.strip() + "\n")
116 | f_out.close()
117 | return fpadd_metafile
118 |
119 | def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines):
120 |
121 | f = open(fpadd_metafile)
122 | fpadd_metafile_lines = f.readlines()
123 | f.close()
124 |
125 | fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
126 | f_out = open(fprm_metafile, "w")
127 | for i in range(len(raw_metafile_lines)):
128 | tokens = raw_metafile_lines[i].strip().split("\t")
129 | symbol_sequences = tokens[1].split(" ")
130 | fpadd_tokens = fpadd_metafile_lines[i].strip().split("\t")
131 | fpadd_symbol_sequences = fpadd_tokens[1].split(" ")
132 |
133 | error_flag = False
134 | out_str = tokens[0] + "\t"
135 | idx = 0
136 | length = len(symbol_sequences)
137 | while idx < length:
138 | if "$emotion_disgust" in fpadd_symbol_sequences[idx]:
139 | if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]:
140 | idx = idx + 2
141 | else:
142 | idx = idx + 1
143 | continue
144 | out_str = out_str + symbol_sequences[idx] + " "
145 | idx = idx + 1
146 |
147 | if not error_flag:
148 | f_out.write(out_str.strip() + "\n")
149 | f_out.close()
150 |
151 | def process(self, voice_output_dir, prosody, raw_metafile):
152 |
153 | with open(raw_metafile, "r") as f:
154 | lines = f.readlines()
155 | random.shuffle(lines)
156 |
157 | fpadd_metafile = self.addfp(voice_output_dir, prosody, lines)
158 | self.removefp(voice_output_dir, fpadd_metafile, lines)
159 |
--------------------------------------------------------------------------------
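
The module-level is_fp_line above flags prosody lines that consist purely of filled-pause labels. A small illustration (the inputs are made up):

from kantts.preprocess.fp_processor import is_fp_line

assert is_fp_line("N N FP N Q")             # label-only line
assert not is_fp_line("100001\t今天 天气")   # id + text line
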
/kantts/preprocess/languages/PinYin/En2ChPhoneMap.txt:
--------------------------------------------------------------------------------
1 | wu w
2 | yi y
3 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/PinYin/PosSet.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 1
5 | a
6 | todo
7 |
8 |
9 | 2
10 | b
11 | todo
12 |
13 |
14 | 3
15 | c
16 | todo
17 |
18 |
19 | 4
20 | d
21 | todo
22 |
23 |
24 | 5
25 | e
26 | todo
27 |
28 |
29 | 6
30 | f
31 | todo
32 |
33 |
34 | 7
35 | g
36 | todo
37 |
38 |
39 | 8
40 | gb
41 | todo
42 |
43 |
44 |
45 |
46 | 9
47 | h
48 | todo
49 |
50 |
51 | 10
52 | i
53 | todo
54 |
55 |
56 | 11
57 | j
58 | todo
59 |
60 |
61 | 12
62 | k
63 | todo
64 |
65 |
66 | 13
67 | l
68 | todo
69 |
70 |
71 | 14
72 | m
73 | todo
74 |
75 |
76 | 15
77 | n
78 | todo
79 |
80 |
81 | 16
82 | nz
83 | todo
84 |
85 |
86 |
87 |
88 | 17
89 | o
90 | todo
91 |
92 |
93 | 18
94 | p
95 | todo
96 |
97 |
98 | 19
99 | q
100 | todo
101 |
102 |
103 | 20
104 | r
105 | todo
106 |
107 |
108 | 21
109 | s
110 | todo
111 |
112 |
113 | 22
114 | t
115 | todo
116 |
117 |
118 | 23
119 | u
120 | todo
121 |
122 |
123 | 24
124 | v
125 | todo
126 |
127 |
128 | 25
129 | w
130 | todo
131 |
132 |
133 | 26
134 | x
135 | todo
136 |
137 |
138 | 27
139 | y
140 | todo
141 |
142 |
143 | 28
144 | z
145 | todo
146 |
147 |
148 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/PinYin/tonelist.txt:
--------------------------------------------------------------------------------
1 | 1
2 |
3 | 4
4 | 2
5 | 3
6 | 5
7 | 0
8 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/Sichuan/En2ChPhoneMap.txt:
--------------------------------------------------------------------------------
1 | wu w
2 | yi y
3 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/Sichuan/PosSet.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 1
5 | a
6 | todo
7 |
8 |
9 | 2
10 | b
11 | todo
12 |
13 |
14 | 3
15 | c
16 | todo
17 |
18 |
19 | 4
20 | d
21 | todo
22 |
23 |
24 | 5
25 | e
26 | todo
27 |
28 |
29 | 6
30 | f
31 | todo
32 |
33 |
34 | 7
35 | g
36 | todo
37 |
38 |
39 | 8
40 | gb
41 | todo
42 |
43 |
44 |
45 |
46 | 9
47 | h
48 | todo
49 |
50 |
51 | 10
52 | i
53 | todo
54 |
55 |
56 | 11
57 | j
58 | todo
59 |
60 |
61 | 12
62 | k
63 | todo
64 |
65 |
66 | 13
67 | l
68 | todo
69 |
70 |
71 | 14
72 | m
73 | todo
74 |
75 |
76 | 15
77 | n
78 | todo
79 |
80 |
81 | 16
82 | nz
83 | todo
84 |
85 |
86 |
87 |
88 | 17
89 | o
90 | todo
91 |
92 |
93 | 18
94 | p
95 | todo
96 |
97 |
98 | 19
99 | q
100 | todo
101 |
102 |
103 | 20
104 | r
105 | todo
106 |
107 |
108 | 21
109 | s
110 | todo
111 |
112 |
113 | 22
114 | t
115 | todo
116 |
117 |
118 | 23
119 | u
120 | todo
121 |
122 |
123 | 24
124 | v
125 | todo
126 |
127 |
128 | 25
129 | w
130 | todo
131 |
132 |
133 | 26
134 | x
135 | todo
136 |
137 |
138 | 27
139 | y
140 | todo
141 |
142 |
143 | 28
144 | z
145 | todo
146 |
147 |
148 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/Sichuan/tonelist.txt:
--------------------------------------------------------------------------------
1 | 1
2 |
3 | 4
4 | 2
5 | 3
6 | 5
7 | 0
8 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/WuuShanghai/En2ChPhoneMap.txt:
--------------------------------------------------------------------------------
1 | wu w
2 | yi y
3 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/WuuShanghai/tonelist.txt:
--------------------------------------------------------------------------------
1 | 6
2 | 0
3 | 3
4 | 4
5 | 2
6 | 5
7 | 1
8 | 7
9 | 8
10 |
11 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/ZhHK/En2ChPhoneMap.txt:
--------------------------------------------------------------------------------
1 | wu w
2 | yi y
3 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/ZhHK/tonelist.txt:
--------------------------------------------------------------------------------
1 | 6
2 | 0
3 | 3
4 | 4
5 | 2
6 | 5
7 | 7
8 | 1
9 | 8
10 | 9
11 |
12 |
--------------------------------------------------------------------------------
/kantts/preprocess/languages/__init__.py:
--------------------------------------------------------------------------------
1 | languages = {
2 | "PinYin": {
3 | "phoneset_path": "PhoneSet.xml",
4 | "posset_path": "PosSet.xml",
5 | "f2t_map_path": "En2ChPhoneMap.txt",
6 | "s2p_map_path": "py2phoneMap.txt",
7 | "tonelist_path": "tonelist.txt",
8 | },
9 | "ZhHK": {
10 | "phoneset_path": "PhoneSet.xml",
11 | "posset_path": "PosSet.xml",
12 | "f2t_map_path": "En2ChPhoneMap.txt",
13 | "s2p_map_path": "py2phoneMap.txt",
14 | "tonelist_path": "tonelist.txt",
15 | },
16 | "WuuShanghai": {
17 | "phoneset_path": "PhoneSet.xml",
18 | "posset_path": "PosSet.xml",
19 | "f2t_map_path": "En2ChPhoneMap.txt",
20 | "s2p_map_path": "py2phoneMap.txt",
21 | "tonelist_path": "tonelist.txt",
22 | },
23 | "Sichuan": {
24 | "phoneset_path": "PhoneSet.xml",
25 | "posset_path": "PosSet.xml",
26 | "f2t_map_path": "En2ChPhoneMap.txt",
27 | "s2p_map_path": "py2phoneMap.txt",
28 | "tonelist_path": "tonelist.txt",
29 | },
30 | }
31 |
--------------------------------------------------------------------------------
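
The table above only names each language's resource files; callers are expected to join them with the language directory. A sketch (the join convention is an assumption about usage, not repo code):

import os
from kantts.preprocess.languages import languages

lang_dir = os.path.join("kantts", "preprocess", "languages", "PinYin")
paths = {key: os.path.join(lang_dir, fname)
         for key, fname in languages["PinYin"].items()}
# e.g. paths["phoneset_path"] -> kantts/preprocess/languages/PinYin/PhoneSet.xml
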
/kantts/preprocess/script_convertor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/Phone.py:
--------------------------------------------------------------------------------
1 | from .XmlObj import XmlObj
2 | from .core_types import PhoneCVType, PhoneIFType, PhoneUVType, PhoneAPType, PhoneAMType
3 |
4 |
5 | class Phone(XmlObj):
6 | def __init__(self):
7 | self.m_id = None
8 | self.m_name = None
9 | self.m_cv_type = PhoneCVType.NULL
10 | self.m_if_type = PhoneIFType.NULL
11 | self.m_uv_type = PhoneUVType.NULL
12 | self.m_ap_type = PhoneAPType.NULL
13 | self.m_am_type = PhoneAMType.NULL
14 | self.m_bnd = False
15 |
16 | def __str__(self):
17 | return self.m_name
18 |
19 | def Save(self):
20 | pass
21 |
22 | def Load(self, phone_node):
23 | ns = "{http://schemas.alibaba-inc.com/tts}"
24 |
25 | id_node = phone_node.find(ns + "id")
26 | self.m_id = int(id_node.text)
27 |
28 | name_node = phone_node.find(ns + "name")
29 | self.m_name = name_node.text
30 |
31 | cv_node = phone_node.find(ns + "cv")
32 | self.m_cv_type = PhoneCVType.parse(cv_node.text)
33 |
34 | if_node = phone_node.find(ns + "if")
35 | self.m_if_type = PhoneIFType.parse(if_node.text)
36 |
37 | uv_node = phone_node.find(ns + "uv")
38 | self.m_uv_type = PhoneUVType.parse(uv_node.text)
39 |
40 | ap_node = phone_node.find(ns + "ap")
41 | self.m_ap_type = PhoneAPType.parse(ap_node.text)
42 |
43 | am_node = phone_node.find(ns + "am")
44 | self.m_am_type = PhoneAMType.parse(am_node.text)
45 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/PhoneSet.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import logging
3 |
4 | from .XmlObj import XmlObj
5 | from .Phone import Phone
6 |
7 |
8 | class PhoneSet(XmlObj):
9 | def __init__(self, phoneset_path):
10 | self.m_phone_list = []
11 | self.m_id_map = {}
12 | self.m_name_map = {}
13 | self.Load(phoneset_path)
14 |
15 | def Load(self, file_path):
16 | # alibaba tts xml namespace
17 | ns = "{http://schemas.alibaba-inc.com/tts}"
18 |
19 | phoneset_root = ET.parse(file_path).getroot()
20 | for phone_node in phoneset_root.findall(ns + "phone"):
21 | phone = Phone()
22 | phone.Load(phone_node)
23 | self.m_phone_list.append(phone)
24 | if phone.m_id in self.m_id_map:
25 | logging.error("PhoneSet.Load: duplicate id: %d", phone.m_id)
26 | self.m_id_map[phone.m_id] = phone
27 |
28 | if phone.m_name in self.m_name_map:
29 |                 logging.error("PhoneSet.Load: duplicate name: %s", phone.m_name)
30 | self.m_name_map[phone.m_name] = phone
31 |
32 | def Save(self):
33 | pass
34 |
35 |
36 | # if __name__ == "__main__":
37 | # import os
38 | # import sys
39 | #
40 | # phoneset = PhoneSet()
41 | # phoneset.Load(sys.argv[1])
42 | #
43 | # for phone in phoneset.m_phone_list:
44 | # print(phone)
45 | # print(phone.m_id)
46 | # print(phone.m_name)
47 | # print(phone.m_cv_type)
48 | # print(phone.m_if_type)
49 | # print(phone.m_uv_type)
50 | # print(phone.m_ap_type)
51 | # print(phone.m_am_type)
52 | # print(phone.m_bnd)
53 |
--------------------------------------------------------------------------------
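
Loading a phone inventory end to end; the path points at the PinYin resources shipped with the repo (relative to the repo root, which is an assumption about the working directory):

from kantts.preprocess.script_convertor.core.PhoneSet import PhoneSet

phoneset = PhoneSet("kantts/preprocess/languages/PinYin/PhoneSet.xml")
print(len(phoneset.m_phone_list))     # phone inventory size
print(list(phoneset.m_name_map)[:5])  # a few phone names
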
/kantts/preprocess/script_convertor/core/Pos.py:
--------------------------------------------------------------------------------
1 | from .XmlObj import XmlObj
2 |
3 |
4 | class Pos(XmlObj):
5 | def __init__(self):
6 | self.m_id = None
7 | self.m_name = None
8 | self.m_desc = None
9 | self.m_level = 1
10 | self.m_parent = None
11 | self.m_sub_pos_list = []
12 |
13 | def __str__(self):
14 | return self.m_name
15 |
16 | def Save(self):
17 | pass
18 |
19 | def Load(self, pos_node):
20 | ns = "{http://schemas.alibaba-inc.com/tts}"
21 |
22 | id_node = pos_node.find(ns + "id")
23 | self.m_id = int(id_node.text)
24 |
25 | name_node = pos_node.find(ns + "name")
26 | self.m_name = name_node.text
27 |
28 | desc_node = pos_node.find(ns + "desc")
29 | self.m_desc = desc_node.text
30 |
31 | sub_node = pos_node.find(ns + "sub")
32 | if sub_node is not None:
33 | for sub_pos_node in sub_node.findall(ns + "pos"):
34 | sub_pos = Pos()
35 | sub_pos.Load(sub_pos_node)
36 | sub_pos.m_parent = self
37 | sub_pos.m_level = self.m_level + 1
38 | self.m_sub_pos_list.append(sub_pos)
39 |
40 | return
41 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/PosSet.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import logging
3 |
4 | from .XmlObj import XmlObj
5 | from .Pos import Pos
6 |
7 |
8 | class PosSet(XmlObj):
9 | def __init__(self, posset_path):
10 | self.m_pos_list = []
11 | self.m_id_map = {}
12 | self.m_name_map = {}
13 | self.Load(posset_path)
14 |
15 | def Load(self, file_path):
16 | # alibaba tts xml namespace
17 | ns = "{http://schemas.alibaba-inc.com/tts}"
18 |
19 | posset_root = ET.parse(file_path).getroot()
20 | for pos_node in posset_root.findall(ns + "pos"):
21 | pos = Pos()
22 | pos.Load(pos_node)
23 | self.m_pos_list.append(pos)
24 | if pos.m_id in self.m_id_map:
25 | logging.error("PosSet.Load: duplicate id: %d", pos.m_id)
26 | self.m_id_map[pos.m_id] = pos
27 |
28 | if pos.m_name in self.m_name_map:
29 |                 logging.error("PosSet.Load: duplicate name: %s", pos.m_name)
30 | self.m_name_map[pos.m_name] = pos
31 |
32 | if len(pos.m_sub_pos_list) > 0:
33 | for sub_pos in pos.m_sub_pos_list:
34 | self.m_pos_list.append(sub_pos)
35 | if sub_pos.m_id in self.m_id_map:
36 | logging.error("PosSet.Load: duplicate id: %d", sub_pos.m_id)
37 | self.m_id_map[sub_pos.m_id] = sub_pos
38 |
39 | if sub_pos.m_name in self.m_name_map:
40 | logging.error(
41 |                             "PosSet.Load: duplicate name: %s", sub_pos.m_name
42 | )
43 | self.m_name_map[sub_pos.m_name] = sub_pos
44 |
45 | def Save(self):
46 | pass
47 |
48 |
49 | # if __name__ == "__main__":
50 | # import os
51 | # import sys
52 | #
53 | # posset = PosSet()
54 | # posset.Load(sys.argv[1])
55 | #
56 | # for pos in posset.m_pos_list:
57 | # print(pos)
58 | # print(pos.m_id)
59 | # print(pos.m_name)
60 | # print(pos.m_desc)
61 | # print(pos.m_level)
62 | # print(pos.m_parent)
63 | # if pos.m_sub_pos_list:
64 | # print("sub pos list:")
65 | # for sub_pos in pos.m_sub_pos_list:
66 | # print(sub_pos)
67 | # print(sub_pos.m_id)
68 | # print(sub_pos.m_name)
69 | # print(sub_pos.m_desc)
70 | # print(sub_pos.m_level)
71 | # print(sub_pos.m_parent)
72 | # print("sub pos list end")
73 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/Script.py:
--------------------------------------------------------------------------------
1 | from .XmlObj import XmlObj
2 |
3 | import xml.etree.ElementTree as ET
4 | from xml.dom import minidom
5 |
6 |
7 | class Script(XmlObj):
8 | def __init__(self, phoneset, posset):
9 | self.m_phoneset = phoneset
10 | self.m_posset = posset
11 | self.m_items = []
12 |
13 | def Save(self, outputXMLPath):
14 | root = ET.Element("script")
15 |
16 | root.set("uttcount", str(len(self.m_items)))
17 | root.set("xmlns", "http://schemas.alibaba-inc.com/tts")
18 | for item in self.m_items:
19 | item.Save(root)
20 |
21 | xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(
22 | indent=" ", encoding="utf-8"
23 | )
24 | with open(outputXMLPath, "wb") as f:
25 | f.write(xmlstr)
26 |
27 | def SaveMetafile(self):
28 | meta_lines = []
29 |
30 | for item in self.m_items:
31 | meta_lines.append(item.SaveMetafile())
32 |
33 | return meta_lines
34 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/ScriptItem.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | from .XmlObj import XmlObj
4 |
5 |
6 | class ScriptItem(XmlObj):
7 | def __init__(self, phoneset, posset):
8 | if phoneset is None or posset is None:
9 | raise Exception("ScriptItem.__init__: phoneset or posset is None")
10 | self.m_phoneset = phoneset
11 | self.m_posset = posset
12 |
13 | self.m_id = None
14 | self.m_text = ""
15 | self.m_scriptSentence_list = []
16 | self.m_status = None
17 |
18 | def Load(self):
19 | pass
20 |
21 | def Save(self, parent_node):
22 | utterance_node = ET.SubElement(parent_node, "utterance")
23 | utterance_node.set("id", self.m_id)
24 |
25 | text_node = ET.SubElement(utterance_node, "text")
26 | text_node.text = self.m_text
27 |
28 | for sentence in self.m_scriptSentence_list:
29 | sentence.Save(utterance_node)
30 |
31 | def SaveMetafile(self):
32 | meta_line = self.m_id + "\t"
33 |
34 | for sentence in self.m_scriptSentence_list:
35 | meta_line += sentence.SaveMetafile()
36 |
37 | return meta_line
38 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/ScriptSentence.py:
--------------------------------------------------------------------------------
1 | from .XmlObj import XmlObj
2 |
3 | import xml.etree.ElementTree as ET
4 |
5 |
6 | # TODO(jin): Not referenced, temporarily commented
7 | class WrittenSentence(XmlObj):
8 | def __init__(self, posset):
9 | self.m_written_word_list = []
10 | self.m_written_mark_list = []
11 | self.m_posset = posset
12 | self.m_align_list = []
13 | self.m_alignCursor = 0
14 | self.m_accompanyIndex = 0
15 | self.m_sequence = ""
16 | self.m_text = ""
17 |
18 | def AddHost(self, writtenWord):
19 | self.m_written_word_list.append(writtenWord)
20 | self.m_align_list.append(self.m_alignCursor)
21 |
22 | def LoadHost(self):
23 | pass
24 |
25 | def SaveHost(self):
26 | pass
27 |
28 | def AddAccompany(self, writtenMark):
29 | self.m_written_mark_list.append(writtenMark)
30 | self.m_alignCursor += 1
31 | self.m_accompanyIndex += 1
32 |
33 | def SaveAccompany(self):
34 | pass
35 |
36 | def LoadAccompany(self):
37 | pass
38 |
39 | # Get the mark span corresponding to specific spoken word
40 | def GetAccompanySpan(self, host_index):
41 | if host_index == -1:
42 | return (0, self.m_align_list[0])
43 |
44 | accompany_begin = self.m_align_list[host_index]
45 | accompany_end = (
46 | self.m_align_list[host_index + 1]
47 | if host_index + 1 < len(self.m_written_word_list)
48 | else len(self.m_written_mark_list)
49 | )
50 |
51 | return (accompany_begin, accompany_end)
52 |
53 | # TODO: iterable
54 | def GetElements(self):
55 | accompany_begin, accompany_end = self.GetAccompanySpan(-1)
56 | res_lst = [
57 | self.m_written_mark_list[i] for i in range(accompany_begin, accompany_end)
58 | ]
59 |
60 | for j in range(len(self.m_written_word_list)):
61 | accompany_begin, accompany_end = self.GetAccompanySpan(j)
62 | res_lst.extend([self.m_written_word_list[j]])
63 | res_lst.extend(
64 | [
65 | self.m_written_mark_list[i]
66 | for i in range(accompany_begin, accompany_end)
67 | ]
68 | )
69 |
70 | return res_lst
71 |
72 | def BuildSequence(self):
73 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()])
74 |
75 | def BuildText(self):
76 | self.m_text = "".join([str(ele) for ele in self.GetElements()])
77 |
78 |
79 | class SpokenSentence(XmlObj):
80 | def __init__(self, phoneset):
81 | self.m_spoken_word_list = []
82 | self.m_spoken_mark_list = []
83 | self.m_phoneset = phoneset
84 | self.m_align_list = []
85 | self.m_alignCursor = 0
86 | self.m_accompanyIndex = 0
87 | self.m_sequence = ""
88 | self.m_text = ""
89 |
90 | def __len__(self):
91 | return len(self.m_spoken_word_list)
92 |
93 | def AddHost(self, spokenWord):
94 | self.m_spoken_word_list.append(spokenWord)
95 | self.m_align_list.append(self.m_alignCursor)
96 |
97 | def SaveHost(self):
98 | pass
99 |
100 | def LoadHost(self):
101 | pass
102 |
103 | def AddAccompany(self, spokenMark):
104 | self.m_spoken_mark_list.append(spokenMark)
105 | self.m_alignCursor += 1
106 | self.m_accompanyIndex += 1
107 |
108 | def SaveAccompany(self):
109 | pass
110 |
111 | # Get the mark span corresponding to specific spoken word
112 | def GetAccompanySpan(self, host_index):
113 | if host_index == -1:
114 | return (0, self.m_align_list[0])
115 |
116 | accompany_begin = self.m_align_list[host_index]
117 | accompany_end = (
118 | self.m_align_list[host_index + 1]
119 | if host_index + 1 < len(self.m_spoken_word_list)
120 | else len(self.m_spoken_mark_list)
121 | )
122 |
123 | return (accompany_begin, accompany_end)
124 |
125 | # TODO: iterable
126 | def GetElements(self):
127 | accompany_begin, accompany_end = self.GetAccompanySpan(-1)
128 | res_lst = [
129 | self.m_spoken_mark_list[i] for i in range(accompany_begin, accompany_end)
130 | ]
131 |
132 | for j in range(len(self.m_spoken_word_list)):
133 | accompany_begin, accompany_end = self.GetAccompanySpan(j)
134 | res_lst.extend([self.m_spoken_word_list[j]])
135 | res_lst.extend(
136 | [
137 | self.m_spoken_mark_list[i]
138 | for i in range(accompany_begin, accompany_end)
139 | ]
140 | )
141 |
142 | return res_lst
143 |
144 | def LoadAccompany(self):
145 | pass
146 |
147 | def BuildSequence(self):
148 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()])
149 |
150 | def BuildText(self):
151 | self.m_text = "".join([str(ele) for ele in self.GetElements()])
152 |
153 | def Save(self, parent_node):
154 | spoken_node = ET.SubElement(parent_node, "spoken")
155 | spoken_node.set("wordcount", str(len(self.m_spoken_word_list)))
156 |
157 | text_node = ET.SubElement(spoken_node, "text")
158 | text_node.text = self.m_sequence
159 |
160 | # TODO: spoken mark might be used
161 | for word in self.m_spoken_word_list:
162 | word.Save(spoken_node)
163 |
164 | def SaveMetafile(self):
165 | meta_line_list = [word.SaveMetafile() for word in self.m_spoken_word_list]
166 |
167 | return " ".join(meta_line_list)
168 |
169 |
170 | class ScriptSentence(XmlObj):
171 | def __init__(self, phoneset, posset):
172 | self.m_phoneset = phoneset
173 | self.m_posset = posset
174 | self.m_writtenSentence = WrittenSentence(posset)
175 | self.m_spokenSentence = SpokenSentence(phoneset)
176 | self.m_text = ""
177 |
178 | def Save(self, parent_node):
179 | if len(self.m_spokenSentence) > 0:
180 | self.m_spokenSentence.Save(parent_node)
181 |
182 | def SaveMetafile(self):
183 | if len(self.m_spokenSentence) > 0:
184 | return self.m_spokenSentence.SaveMetafile()
185 | else:
186 | return ""
187 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/ScriptWord.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | from .XmlObj import XmlObj
4 | from .core_types import Language
5 | from .Syllable import SyllableList
6 |
7 |
8 | # TODO(Jin): Not referenced, temporarily commented
9 | class WrittenWord(XmlObj):
10 | def __init__(self):
11 | self.m_name = None
12 | self.m_POS = None
13 |
14 | def __str__(self):
15 | return self.m_name
16 |
17 | def Load(self):
18 | pass
19 |
20 | def Save(self):
21 | pass
22 |
23 |
24 | class WrittenMark(XmlObj):
25 | def __init__(self):
26 | self.m_punctuation = None
27 |
28 | def __str__(self):
29 | return self.m_punctuation
30 |
31 | def Load(self):
32 | pass
33 |
34 | def Save(self):
35 | pass
36 |
37 |
38 | class SpokenWord(XmlObj):
39 | def __init__(self):
40 | self.m_name = None
41 | self.m_language = None
42 | self.m_syllable_list = []
43 | self.m_breakText = "1"
44 | self.m_POS = "0"
45 |
46 | def __str__(self):
47 | return self.m_name
48 |
49 | def Load(self):
50 | pass
51 |
52 | def Save(self, parent_node):
53 |
54 | word_node = ET.SubElement(parent_node, "word")
55 |
56 | name_node = ET.SubElement(word_node, "name")
57 | name_node.text = self.m_name
58 |
59 | if (
60 | len(self.m_syllable_list) > 0
61 | and self.m_syllable_list[0].m_language != Language.Neutral
62 | ):
63 | language_node = ET.SubElement(word_node, "lang")
64 | language_node.text = self.m_syllable_list[0].m_language.name
65 |
66 | SyllableList(self.m_syllable_list).Save(word_node)
67 |
68 | break_node = ET.SubElement(word_node, "break")
69 | break_node.text = self.m_breakText
70 |
71 | POS_node = ET.SubElement(word_node, "POS")
72 | POS_node.text = self.m_POS
73 |
74 | return
75 |
76 | def SaveMetafile(self):
77 | word_phone_cnt = sum(
78 | [syllable.PhoneCount() for syllable in self.m_syllable_list]
79 | )
80 | word_syllable_cnt = len(self.m_syllable_list)
81 | single_syllable_word = word_syllable_cnt == 1
82 | meta_line_list = []
83 |
84 | for idx, syll in enumerate(self.m_syllable_list):
85 | if word_phone_cnt == 1:
86 | word_pos = "word_both"
87 | elif idx == 0:
88 | word_pos = "word_begin"
89 | elif idx == len(self.m_syllable_list) - 1:
90 | word_pos = "word_end"
91 | else:
92 | word_pos = "word_middle"
93 | meta_line_list.append(
94 | syll.SaveMetafile(word_pos, single_syllable_word=single_syllable_word)
95 | )
96 |
97 | if self.m_breakText != "0" and self.m_breakText is not None:
98 | meta_line_list.append(
99 | "{{#{}$tone_none$s_none$word_none}}".format(self.m_breakText)
100 | )
101 |
102 | return " ".join(meta_line_list)
103 |
104 |
105 | class SpokenMark(XmlObj):
106 | def __init__(self):
107 | self.m_breakLevel = None
108 |
109 | def BreakLevel2Text(self):
110 | return "#" + str(self.m_breakLevel.value)
111 |
112 | def __str__(self):
113 | return self.BreakLevel2Text()
114 |
115 | def Load(self):
116 | pass
117 |
118 | def Save(self):
119 | pass
120 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/Syllable.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 |
3 | from .XmlObj import XmlObj
4 |
5 |
6 | class Syllable(XmlObj):
7 | def __init__(self):
8 | self.m_phone_list = []
9 | self.m_tone = None
10 | self.m_language = None
11 | self.m_breaklevel = None
12 |
13 | def PronunciationText(self):
14 | return " ".join([str(phone) for phone in self.m_phone_list])
15 |
16 | def PhoneCount(self):
17 | return len(self.m_phone_list)
18 |
19 | def ToneText(self):
20 | return str(self.m_tone.value)
21 |
22 | def Save(self):
23 | pass
24 |
25 | def Load(self):
26 | pass
27 |
28 | def GetPhoneMeta(
29 | self, phone_name, word_pos, syll_pos, tone_text, single_syllable_word=False
30 | ):
31 | # Special case: word with single syllable, the last phone's word_pos should be "word_end"
32 | if word_pos == "word_begin" and syll_pos == "s_end" and single_syllable_word:
33 | word_pos = "word_end"
34 | elif word_pos == "word_begin" and syll_pos not in [
35 | "s_begin",
36 | "s_both",
37 | ]: # FIXME: keep accord with Engine logic
38 | word_pos = "word_middle"
39 | elif word_pos == "word_end" and syll_pos not in ["s_end", "s_both"]:
40 | word_pos = "word_middle"
41 | else:
42 | pass
43 |
44 | return "{{{}$tone{}${}${}}}".format(phone_name, tone_text, syll_pos, word_pos)
45 |
46 | def SaveMetafile(self, word_pos, single_syllable_word=False):
47 | syllable_phone_cnt = len(self.m_phone_list)
48 |
49 | meta_line_list = []
50 |
51 | for idx, phone in enumerate(self.m_phone_list):
52 | if syllable_phone_cnt == 1:
53 | syll_pos = "s_both"
54 | elif idx == 0:
55 | syll_pos = "s_begin"
56 | elif idx == len(self.m_phone_list) - 1:
57 | syll_pos = "s_end"
58 | else:
59 | syll_pos = "s_middle"
60 | meta_line_list.append(
61 | self.GetPhoneMeta(
62 | phone,
63 | word_pos,
64 | syll_pos,
65 | self.ToneText(),
66 | single_syllable_word=single_syllable_word,
67 | )
68 | )
69 |
70 | return " ".join(meta_line_list)
71 |
72 |
73 | class SyllableList(XmlObj):
74 | def __init__(self, syllables):
75 | self.m_syllable_list = syllables
76 |
77 | def __len__(self):
78 | return len(self.m_syllable_list)
79 |
80 |     def __getitem__(self, index):
81 | return self.m_syllable_list[index]
82 |
83 | def PronunciationText(self):
84 | return " - ".join(
85 | [syllable.PronunciationText() for syllable in self.m_syllable_list]
86 | )
87 |
88 | def ToneText(self):
89 | return "".join([syllable.ToneText() for syllable in self.m_syllable_list])
90 |
91 | def Save(self, parent_node):
92 | syllable_node = ET.SubElement(parent_node, "syllable")
93 | syllable_node.set("syllcount", str(len(self.m_syllable_list)))
94 |
95 | phone_node = ET.SubElement(syllable_node, "phone")
96 | phone_node.text = self.PronunciationText()
97 |
98 | tone_node = ET.SubElement(syllable_node, "tone")
99 | tone_node.text = self.ToneText()
100 |
101 | return
102 |
103 | def Load(self):
104 | pass
105 |
--------------------------------------------------------------------------------
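
The symbol format written to metafiles is {phone$toneT$syllable-pos$word-pos}. Calling GetPhoneMeta directly shows the single-syllable-word special case promoting "word_begin" to "word_end":

from kantts.preprocess.script_convertor.core.Syllable import Syllable

syll = Syllable()
print(syll.GetPhoneMeta("ni", "word_begin", "s_end", "3",
                        single_syllable_word=True))
# {ni$tone3$s_end$word_end}
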
/kantts/preprocess/script_convertor/core/XmlObj.py:
--------------------------------------------------------------------------------
1 | class XmlObj:
2 | def __init__(self):
3 | pass
4 |
5 | def Load(self):
6 | pass
7 |
8 | def Save(self):
9 | pass
10 |
11 | def LoadData(self):
12 | pass
13 |
14 | def SaveData(self):
15 | pass
16 |
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/core/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/script_convertor/core/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import unicodedata
3 | import codecs
4 |
5 | WordPattern = r"((?P<word>\w+)(\(\w+\))?)"
6 | BreakPattern = r"(?P<break>(\*?#(?P<break_level>[0-4])))"
7 | MarkPattern = r"(?P<mark>[、,。!?:“”《》·])"
8 | POSPattern = r"(?P<pos>(\*?\|(?P<pos_id>[1-9])))"
9 | PhraseTonePattern = r"(?P<phrase_tone>(\*?%([L|H])))"
10 |
11 | NgBreakPattern = r"^ng(?P<ng_break>\d)"
12 |
13 |
14 | RegexWord = re.compile(WordPattern + r"\s*")
15 | RegexBreak = re.compile(BreakPattern + r"\s*")
16 | RegexID = re.compile(r"^(?P<id>.*?)\s")
17 | RegexSentence = re.compile(
18 | r"({}|{}|{}|{}|{})\s*".format(
19 | WordPattern, BreakPattern, MarkPattern, POSPattern, PhraseTonePattern
20 | )
21 | )
22 | RegexForeignLang = re.compile(r"[A-Z@]")
23 | RegexSpace = re.compile(r"^\s*")
24 | RegexNeutralTone = re.compile(r"[1-5]5")
25 |
26 |
27 | def do_character_normalization(line):
28 | return unicodedata.normalize("NFKC", line)
29 |
30 |
31 | def do_prosody_text_normalization(line):
32 | tokens = line.split("\t")
33 | text = tokens[1]
34 | # Remove punctuations
35 | text = text.replace(u"。", " ")
36 | text = text.replace(u"、", " ")
37 | text = text.replace(u"“", " ")
38 | text = text.replace(u"”", " ")
39 | text = text.replace(u"‘", " ")
40 | text = text.replace(u"’", " ")
41 | text = text.replace(u"|", " ")
42 | text = text.replace(u"《", " ")
43 | text = text.replace(u"》", " ")
44 | text = text.replace(u"【", " ")
45 | text = text.replace(u"】", " ")
46 | text = text.replace(u"—", " ")
47 | text = text.replace(u"―", " ")
48 | text = text.replace(".", " ")
49 | text = text.replace("!", " ")
50 | text = text.replace("?", " ")
51 | text = text.replace("(", " ")
52 | text = text.replace(")", " ")
53 | text = text.replace("[", " ")
54 | text = text.replace("]", " ")
55 | text = text.replace("{", " ")
56 | text = text.replace("}", " ")
57 | text = text.replace("~", " ")
58 | text = text.replace(":", " ")
59 | text = text.replace(";", " ")
60 | text = text.replace("+", " ")
61 | text = text.replace(",", " ")
62 | # text = text.replace('·', ' ')
63 | text = text.replace('"', " ")
64 | text = text.replace(
65 | "-", ""
66 | ) # don't replace by space because compond word like two-year-old
67 | text = text.replace(
68 | "'", ""
69 | ) # don't replace by space because English word like that's
70 |
71 | # Replace break
72 | text = text.replace("/", "#2")
73 | text = text.replace("%", "#3")
74 | # Remove useless spaces surround #2 #3 #4
75 | text = re.sub(r"(#\d)[ ]+", r"\1", text)
76 | text = re.sub(r"[ ]+(#\d)", r"\1", text)
77 | # Replace space by #1
78 | text = re.sub("[ ]+", "#1", text)
79 |
80 | # Remove break at the end of the text
81 | text = re.sub(r"#\d$", "", text)
82 |
83 | # Add #1 between target language and foreign language
84 | text = re.sub(r"([a-zA-Z])([^a-zA-Z\d\#\s\'\%\/\-])", r"\1#1\2", text)
85 | text = re.sub(r"([^a-zA-Z\d\#\s\'\%\/\-])([a-zA-Z])", r"\1#1\2", text)
86 |
87 | return tokens[0] + "\t" + text
88 |
89 |
90 | def is_fp_line(line):
91 | fp_category_list = ["FP", "I", "N", "Q"]
92 | elements = line.strip().split(" ")
93 | res = True
94 | for ele in elements:
95 | if ele not in fp_category_list:
96 | res = False
97 | break
98 | return res
99 |
100 |
101 | def format_prosody(src_prosody):
102 | formatted_lines = []
103 | with codecs.open(src_prosody, "r", "utf-8") as f:
104 | lines = f.readlines()
105 |
106 | idx = 0
107 | while idx < len(lines):
108 | line = do_character_normalization(lines[idx])
109 |
110 | if len(line.strip().split("\t")) == 2:
111 | line = do_prosody_text_normalization(line)
112 | else:
113 | fp_enable = is_fp_line(line)
114 | if fp_enable:
115 | idx += 3
116 | continue
117 | formatted_lines.append(line)
118 | idx += 1
119 | # with codecs.open(tgt_prosody, 'w', 'utf-8') as f:
120 | # f.writelines(formatted_lines)
121 | return formatted_lines
122 |
--------------------------------------------------------------------------------
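
do_prosody_text_normalization rewrites annotated prosody text into "#N" break marks: "/" becomes #2, "%" becomes #3, leftover spaces become #1, punctuation is stripped, and a trailing break is dropped. A worked example (the id and text are made up):

from kantts.preprocess.script_convertor.core.utils import do_prosody_text_normalization

line = "100001\t今天 / 天气 % 不错 。"
print(do_prosody_text_normalization(line))
# -> "100001\t今天#2天气#3不错"
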
/kantts/preprocess/se_processor/D_TDNN.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 |
7 | from .layers import (DenseLayer, DenseTDNNBlock, StatsPool, TDNNLayer, SEDenseTDNNBlock,
8 | TransitLayer)
9 |
10 | class BasicBlock(nn.Module):
11 | expansion = 1
12 |
13 | def __init__(self, in_planes, planes, stride=1):
14 | super(BasicBlock, self).__init__()
15 | self.conv1 = nn.Conv2d(in_planes,
16 | planes,
17 | kernel_size=3,
18 | stride=(stride, 1),
19 | padding=1,
20 | bias=False)
21 | self.bn1 = nn.BatchNorm2d(planes)
22 | self.conv2 = nn.Conv2d(planes,
23 | planes,
24 | kernel_size=3,
25 | stride=1,
26 | padding=1,
27 | bias=False)
28 | self.bn2 = nn.BatchNorm2d(planes)
29 |
30 | self.shortcut = nn.Sequential()
31 | if stride != 1 or in_planes != self.expansion * planes:
32 | self.shortcut = nn.Sequential(
33 | nn.Conv2d(in_planes,
34 | self.expansion * planes,
35 | kernel_size=1,
36 | stride=(stride, 1),
37 | bias=False),
38 | nn.BatchNorm2d(self.expansion * planes))
39 |
40 | def forward(self, x):
41 | out = F.relu(self.bn1(self.conv1(x)))
42 | out = self.bn2(self.conv2(out))
43 | out += self.shortcut(x)
44 | out = F.relu(out)
45 | return out
46 |
47 | class CNN_Head(nn.Module):
48 | def __init__(self,
49 | block=BasicBlock,
50 | num_blocks=[2, 2],
51 | m_channels=32,
52 | feat_dim=80):
53 | super(CNN_Head, self).__init__()
54 | self.in_planes = m_channels
55 | self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
56 | self.bn1 = nn.BatchNorm2d(m_channels)
57 |
58 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
59 |         self.layer2 = self._make_layer(block, m_channels, num_blocks[1], stride=2)
60 |
61 | self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
62 | self.bn2 = nn.BatchNorm2d(m_channels)
63 | self.out_channels = m_channels * (feat_dim // 8)
64 |
65 | def _make_layer(self, block, planes, num_blocks, stride):
66 | strides = [stride] + [1] * (num_blocks - 1)
67 | layers = []
68 | for stride in strides:
69 | layers.append(block(self.in_planes, planes, stride))
70 | self.in_planes = planes * block.expansion
71 | return nn.Sequential(*layers)
72 |
73 | def forward(self, x):
74 | x = x.unsqueeze_(1)
75 | out = F.relu(self.bn1(self.conv1(x)))
76 | out = self.layer1(out)
77 | out = self.layer2(out)
78 | out = F.relu(self.bn2(self.conv2(out)))
79 |
80 | out = out.reshape(out.shape[0], out.shape[1]*out.shape[2], out.shape[3])
81 | return out
82 |
83 | class DTDNN(nn.Module):
84 | def __init__(self,
85 | feat_dim=80,
86 | embedding_size=192,
87 | growth_rate=32,
88 | bn_size=4,
89 | init_channels=128,
90 | config_str='batchnorm-relu',
91 | memory_efficient=True):
92 | super(DTDNN, self).__init__()
93 |
94 | self.head = CNN_Head()
95 | feat_dim = self.head.out_channels
96 |
97 | self.xvector = nn.Sequential(
98 | OrderedDict([
99 | ('tdnn',
100 | TDNNLayer(feat_dim,
101 | init_channels,
102 | 5,
103 | stride=2,
104 | dilation=1,
105 | padding=-1,
106 | config_str=config_str)),
107 | ]))
108 | channels = init_channels
109 | for i, (num_layers, kernel_size,
110 | dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 3))):
111 | block = SEDenseTDNNBlock(num_layers=num_layers,
112 | in_channels=channels,
113 | out_channels=growth_rate,
114 | bn_channels=bn_size * growth_rate,
115 | kernel_size=kernel_size,
116 | dilation=dilation,
117 | config_str=config_str,
118 | memory_efficient=memory_efficient)
119 | self.xvector.add_module('block%d' % (i + 1), block)
120 | channels = channels + num_layers * growth_rate
121 | self.xvector.add_module(
122 | 'transit%d' % (i + 1),
123 | TransitLayer(channels,
124 | channels // 2,
125 | bias=False,
126 | config_str=config_str))
127 | channels //= 2
128 |
129 | self.bn = nn.BatchNorm1d(channels)
130 | self.relu = nn.ReLU(inplace=True)
131 |
132 | self.xvector.add_module('stats', StatsPool())
133 | self.xvector.add_module(
134 | 'dense',
135 | DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
136 |
137 | for m in self.modules():
138 | if isinstance(m, (nn.Conv1d, nn.Linear)):
139 | nn.init.kaiming_normal_(m.weight.data)
140 | if m.bias is not None:
141 | nn.init.zeros_(m.bias)
142 |
143 | def forward(self, x):
144 | x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
145 | x = self.head(x)
146 | x = self.xvector.tdnn(x)
147 |
148 | x = self.xvector.block1(x)
149 | x = self.xvector.transit1(x)
150 |
151 | x = self.xvector.block2(x)
152 | x = self.xvector.transit2(x)
153 |
154 | x = self.xvector.block3(x)
155 | x = self.xvector.transit3(x)
156 | x = self.relu(self.bn(x))
157 |
158 | x = self.xvector.stats(x)
159 | x = self.xvector.dense(x)
160 | return x
161 |
162 |
--------------------------------------------------------------------------------
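
The extractor above maps a batch of 80-dim fbank frames to one fixed-size speaker embedding per utterance. A shape-level sketch (batch and frame counts arbitrary):

import torch
from kantts.preprocess.se_processor.D_TDNN import DTDNN

model = DTDNN().eval()
feats = torch.randn(1, 300, 80)   # (batch, frames, num_mel_bins)
with torch.no_grad():
    emb = model(feats)            # (1, 192) speaker embedding
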
/kantts/preprocess/se_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/se_processor/__init__.py
--------------------------------------------------------------------------------
/kantts/preprocess/se_processor/se_processor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 | import numpy as np
4 | import os, sys
5 | import torchaudio.compliance.kaldi as Kaldi
6 | from .D_TDNN import DTDNN
7 | import logging
8 | import argparse
9 | from glob import glob
10 |
11 |
12 | logging.basicConfig(
13 | format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
14 | datefmt="%Y-%m-%d:%H:%M:%S",
15 | level=logging.DEBUG,
16 | )
17 |
18 | class SpeakerEmbeddingProcessor:
19 | def __init__(self, sample_rate=16000):
20 | self.sample_rate = sample_rate
21 | self.min_wav_length = self.sample_rate * 30 * 10 / 1000
22 |
23 | self.pcm_dict = {}
24 | self.mfcc_dict = {}
25 | self.se_list = []
26 |
27 | def process(self, src_voice_dir, se_model):
28 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extractor started")
29 |
30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31 | model = DTDNN()
32 | try:
33 | if os.path.basename(se_model) == "se.model":
34 | model.load_state_dict(torch.load(se_model, map_location=device))
35 | else:
36 |                 raise Exception("[SpeakerEmbeddingProcessor] se model loading error: expected a file named se.model")
37 |         except Exception as e:
38 |             logging.error(e)
39 |             if os.path.basename(se_model) == 'se.onnx':
40 |                 logging.error("[SpeakerEmbeddingProcessor] se.onnx is no longer supported; please update your se model to version 1.0.5 or later, which provides se.model")
41 |             sys.exit(1)
42 | model.eval()
43 | model.to(device)
44 |
45 | wav_dir = os.path.join(src_voice_dir, "wav")
46 | se_dir = os.path.join(src_voice_dir, "se")
47 | se_average_file = os.path.join(se_dir, "se.npy")
48 |
49 | os.makedirs(se_dir, exist_ok=True)
50 |
51 | wav_files = glob(os.path.join(wav_dir, '*.wav'))
52 |
53 |
54 | for wav_file in wav_files:
55 | basename = os.path.splitext(os.path.basename(wav_file))[0]
56 | se_file = os.path.join(se_dir, basename + '.npy')
57 |
58 | wav, fs = torchaudio.load(wav_file)
59 | assert wav.shape[0] == 1
60 |             assert fs == self.sample_rate
61 |
62 | if wav.shape[1] < self.min_wav_length:
63 | continue
64 |
65 | fbank_feat = Kaldi.fbank(wav, num_mel_bins=80)
66 |
67 | feat = fbank_feat - fbank_feat.mean(dim=0, keepdim=True)
68 | feat = feat.unsqueeze(0).to(device)
69 |
70 | speaker_embedding = model(feat)
71 | speaker_embedding = speaker_embedding.squeeze().cpu().detach().numpy()
72 | speaker_embedding = np.expand_dims(speaker_embedding, axis=0)
73 |
74 |
75 | np.save(se_file, speaker_embedding)
76 | self.se_list.append(speaker_embedding)
77 | self.se_average = np.expand_dims(
78 | np.mean(
79 | np.concatenate(self.se_list, axis=0),
80 | axis=0
81 | ),
82 | axis=0
83 | )
84 | np.save(se_average_file, self.se_average)
85 |
86 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extracted successfully!")
87 |
88 |
89 | if __name__ == '__main__':
90 | parser = argparse.ArgumentParser(description="Speaker Embedding Processor")
91 | parser.add_argument("--src_voice_dir", type=str, required=True)
92 | parser.add_argument('--se_model', required=True)
93 | args = parser.parse_args()
94 |
95 | sep = SpeakerEmbeddingProcessor()
96 |     sep.process(args.src_voice_dir, args.se_model)
--------------------------------------------------------------------------------
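Invocation sketch for the script above (paths are placeholders): it expects 16 kHz mono WAVs under <src_voice_dir>/wav, writes one embedding per utterance to <src_voice_dir>/se/<basename>.npy, and saves the utterance-averaged embedding to <src_voice_dir>/se/se.npy.

    python kantts/preprocess/se_processor/se_processor.py \
        --src_voice_dir /path/to/voice_dir \
        --se_model /path/to/se.model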
/kantts/preprocess/text_process.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | import argparse
5 | import yaml
6 | import time
7 | import zipfile
8 |
9 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402
10 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402
11 |
12 | try:
13 | from kantts.datasets.dataset import BERT_Text_Dataset
14 | from kantts.utils.log import logging_to_file, get_git_revision_hash
15 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols
16 | except ImportError:
17 | raise ImportError("Please install kantts.")
18 |
19 | logging.basicConfig(
20 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
21 | datefmt="%Y-%m-%d:%H:%M:%S",
22 | level=logging.INFO,
23 | )
24 |
25 |
26 | def gen_metafile(
27 | output_dir,
28 | split_ratio=0.98,
29 | ):
30 | raw_metafile = os.path.join(output_dir, "raw_metafile.txt")
31 | bert_train_meta = os.path.join(output_dir, "bert_train.lst")
32 | bert_valid_meta = os.path.join(output_dir, "bert_valid.lst")
33 | if not os.path.exists(
34 | bert_train_meta) or not os.path.exists(bert_valid_meta):
35 | BERT_Text_Dataset.gen_metafile(raw_metafile, output_dir, split_ratio)
36 | logging.info("BERT Text metafile generated.")
37 |
38 | # TODO: Zh-CN as default
39 | def process_mit_style_data(
40 | text_file,
41 | resources_zip_file,
42 | output_dir,
43 | ):
44 | os.makedirs(output_dir, exist_ok=True)
45 | logging_to_file(os.path.join(output_dir, "data_process_stdout.log"))
46 |
47 | resource_root_dir = os.path.dirname(resources_zip_file)
48 | resource_dir = os.path.join(resource_root_dir, "resource")
49 |
50 | if not os.path.exists(resource_dir):
51 | logging.info("Extracting resources...")
52 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
53 | zip_ref.extractall(resource_root_dir)
54 |
55 | with open(text_file, "r") as text_data:
56 | texts = text_data.readlines()
57 |
58 | logging.info("Converting text to symbols...")
59 | symbols_lst = text_to_symbols(texts, resource_dir, "F7")
60 | symbols_file = os.path.join(output_dir, "raw_metafile.txt")
61 | with open(symbols_file, "w") as symbol_data:
62 | for symbol in symbols_lst:
63 | symbol_data.write(symbol)
64 |
65 | logging.info("Processing done.")
66 |
67 | # Generate BERT Text metafile
68 | # TODO: train/valid ratio setting
69 | gen_metafile(output_dir)
70 |
71 |
72 | if __name__ == "__main__":
73 | parser = argparse.ArgumentParser(description="Dataset preprocessor")
74 | parser.add_argument("--text_file", type=str, required=True)
75 | parser.add_argument("--resources_zip_file", type=str, required=True)
76 | parser.add_argument("--output_dir", type=str, required=True)
77 |
78 | args = parser.parse_args()
79 |
80 | process_mit_style_data(
81 | args.text_file,
82 | args.resources_zip_file,
83 | args.output_dir,
84 | )
85 |
86 |
--------------------------------------------------------------------------------
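Invocation sketch for the preprocessor above (paths are placeholders): on first run it extracts the ttsfrd resource zip into the zip's parent directory, converts each input line to MIT-style symbols in raw_metafile.txt, and splits that metafile into bert_train.lst / bert_valid.lst with the default 0.98 train ratio.

    python kantts/preprocess/text_process.py \
        --text_file /path/to/text.txt \
        --resources_zip_file /path/to/resource.zip \
        --output_dir /path/to/output_dir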
/kantts/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/train/__init__.py
--------------------------------------------------------------------------------
/kantts/train/scheduler.py:
--------------------------------------------------------------------------------
1 | from torch.optim.lr_scheduler import * # NOQA
2 | from torch.optim.lr_scheduler import _LRScheduler # NOQA
3 |
4 | """Noam Scheduler."""
5 |
6 |
7 | class FindLR(_LRScheduler):
8 | """
9 | inspired by fast.ai @https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html
10 | """
11 |
12 | def __init__(self, optimizer, max_steps, max_lr=10):
13 | self.max_steps = max_steps
14 | self.max_lr = max_lr
15 | super().__init__(optimizer)
16 |
17 | def get_lr(self):
18 | return [
19 | base_lr
20 | * ((self.max_lr / base_lr) ** (self.last_epoch / (self.max_steps - 1)))
21 | for base_lr in self.base_lrs
22 | ]
23 |
24 |
25 | class NoamLR(_LRScheduler):
26 | """
27 | Implements the Noam Learning rate schedule. This corresponds to increasing the learning rate
28 | linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally
29 | to the inverse square root of the step number, scaled by the inverse square root of the
30 | dimensionality of the model. Time will tell if this is just madness or it's actually important.
31 | Parameters
32 | ----------
33 | warmup_steps: ``int``, required.
34 | The number of steps to linearly increase the learning rate.
35 | """
36 |
37 | def __init__(self, optimizer, warmup_steps):
38 | self.warmup_steps = warmup_steps
39 | super().__init__(optimizer)
40 |
41 | def get_lr(self):
42 | last_epoch = max(1, self.last_epoch)
43 | scale = self.warmup_steps ** 0.5 * min(
44 | last_epoch ** (-0.5), last_epoch * self.warmup_steps ** (-1.5)
45 | )
46 | return [base_lr * scale for base_lr in self.base_lrs]
47 |
--------------------------------------------------------------------------------
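A short sketch of driving NoamLR (the model and optimizer here are placeholders). By the formula in get_lr, the rate climbs linearly to the optimizer's base LR at step warmup_steps and then decays proportionally to step ** -0.5:

    import torch

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = NoamLR(optimizer, warmup_steps=4000)

    for step in range(10000):
        optimizer.step()   # ...training step elided...
        scheduler.step()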
/kantts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/utils/__init__.py
--------------------------------------------------------------------------------
/kantts/utils/audio_torch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import librosa
3 | from distutils.version import LooseVersion
4 |
5 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
6 |
7 |
8 | def stft(x, fft_size, hop_size, win_length, window):
9 | """Perform STFT and convert to magnitude spectrogram.
10 |
11 | Args:
12 | x (Tensor): Input signal tensor (B, T).
13 | fft_size (int): FFT size.
14 | hop_size (int): Hop size.
15 | win_length (int): Window length.
16 | window (str): Window function type.
17 |
18 | Returns:
19 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 |
21 | """
22 | if is_pytorch_17plus:
23 | x_stft = torch.stft(
24 | x, fft_size, hop_size, win_length, window, return_complex=False
25 | )
26 | else:
27 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
28 | real = x_stft[..., 0]
29 | imag = x_stft[..., 1]
30 |
31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
32 |
33 |
34 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
35 | return 20 * torch.log10(torch.clamp(x, min=clip_val) * C)
36 |
37 |
38 | def dynamic_range_decompression_torch(x, C=1):
39 | return torch.pow(10.0, x * 0.05) / C
40 |
41 |
42 | def spectral_normalize_torch(
43 | magnitudes,
44 | min_level_db=-100.0,
45 | ref_level_db=20.0,
46 | norm_abs_value=4.0,
47 | symmetric=True,
48 | ):
49 | output = dynamic_range_compression_torch(magnitudes) - ref_level_db
50 |
51 | if symmetric:
52 | return torch.clamp(
53 | 2 * norm_abs_value * ((output - min_level_db) / (-min_level_db))
54 | - norm_abs_value,
55 | min=-norm_abs_value,
56 | max=norm_abs_value,
57 | )
58 | else:
59 | return torch.clamp(
60 | norm_abs_value * ((output - min_level_db) / (-min_level_db)),
61 | min=0.0,
62 | max=norm_abs_value,
63 | )
64 |
65 |
66 | def spectral_de_normalize_torch(
67 | magnitudes,
68 | min_level_db=-100.0,
69 | ref_level_db=20.0,
70 | norm_abs_value=4.0,
71 | symmetric=True,
72 | ):
73 | if symmetric:
74 | magnitudes = torch.clamp(magnitudes, min=-norm_abs_value, max=norm_abs_value)
75 | magnitudes = (magnitudes + norm_abs_value) * (-min_level_db) / (
76 | 2 * norm_abs_value
77 | ) + min_level_db
78 | else:
79 | magnitudes = torch.clamp(magnitudes, min=0.0, max=norm_abs_value)
80 | magnitudes = (magnitudes) * (-min_level_db) / (norm_abs_value) + min_level_db
81 |
82 | output = dynamic_range_decompression_torch(magnitudes + ref_level_db)
83 | return output
84 |
85 |
86 | class MelSpectrogram(torch.nn.Module):
87 | """Calculate Mel-spectrogram."""
88 |
89 | def __init__(
90 | self,
91 | fs=22050,
92 | fft_size=1024,
93 | hop_size=256,
94 | win_length=None,
95 | window="hann",
96 | num_mels=80,
97 | fmin=80,
98 | fmax=7600,
99 | center=True,
100 | normalized=False,
101 | onesided=True,
102 | eps=1e-10,
103 | log_base=10.0,
104 | pad_mode="constant",
105 | ):
106 | """Initialize MelSpectrogram module."""
107 | super().__init__()
108 | self.fft_size = fft_size
109 | if win_length is None:
110 | self.win_length = fft_size
111 | else:
112 | self.win_length = win_length
113 | self.hop_size = hop_size
114 | self.center = center
115 | self.normalized = normalized
116 | self.onesided = onesided
117 | if window is not None and not hasattr(torch, f"{window}_window"):
118 | raise ValueError(f"{window} window is not implemented")
119 | self.window = window
120 | self.eps = eps
121 | self.pad_mode = pad_mode
122 |
123 | fmin = 0 if fmin is None else fmin
124 | fmax = fs / 2 if fmax is None else fmax
125 | melmat = librosa.filters.mel(
126 | sr=fs,
127 | n_fft=fft_size,
128 | n_mels=num_mels,
129 | fmin=fmin,
130 | fmax=fmax,
131 | )
132 | self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
133 | self.stft_params = {
134 | "n_fft": self.fft_size,
135 | "win_length": self.win_length,
136 | "hop_length": self.hop_size,
137 | "center": self.center,
138 | "normalized": self.normalized,
139 | "onesided": self.onesided,
140 | "pad_mode": self.pad_mode,
141 | }
142 | if is_pytorch_17plus:
143 | self.stft_params["return_complex"] = False
144 |
145 | self.log_base = log_base
146 | if self.log_base is None:
147 | self.log = torch.log
148 | elif self.log_base == 2.0:
149 | self.log = torch.log2
150 | elif self.log_base == 10.0:
151 | self.log = torch.log10
152 | else:
153 | raise ValueError(f"log_base: {log_base} is not supported.")
154 |
155 | def forward(self, x):
156 | """Calculate Mel-spectrogram.
157 |
158 | Args:
159 | x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
160 |
161 | Returns:
162 | Tensor: Mel-spectrogram (B, #mels, #frames).
163 |
164 | """
165 | if x.dim() == 3:
166 | # (B, C, T) -> (B*C, T)
167 | x = x.reshape(-1, x.size(2))
168 |
169 | if self.window is not None:
170 | window_func = getattr(torch, f"{self.window}_window")
171 | window = window_func(self.win_length, dtype=x.dtype, device=x.device)
172 | else:
173 | window = None
174 |
175 | x_stft = torch.stft(x, window=window, **self.stft_params)
176 |         # (B, #freqs, #frames, 2) -> (B, #frames, #freqs, 2)
177 | x_stft = x_stft.transpose(1, 2)
178 | x_power = x_stft[..., 0] ** 2 + x_stft[..., 1] ** 2
179 | x_amp = torch.sqrt(torch.clamp(x_power, min=self.eps))
180 |
181 | x_mel = torch.matmul(x_amp, self.melmat)
182 | x_mel = torch.clamp(x_mel, min=self.eps)
183 | x_mel = spectral_normalize_torch(x_mel)
184 |
185 | # return self.log(x_mel).transpose(1, 2)
186 | return x_mel.transpose(1, 2)
187 |
--------------------------------------------------------------------------------
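A minimal sketch of the MelSpectrogram module above (random audio, illustrative parameters). With the defaults, the output is spectrally normalized into [-4, 4] by spectral_normalize_torch:

    import torch

    mel_fn = MelSpectrogram(fs=16000)   # defaults: 1024 FFT, 256 hop, 80 mels
    wav = torch.randn(1, 16000)         # (B, T): one second of audio
    mel = mel_fn(wav)                   # (B, num_mels, #frames)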
/kantts/utils/ling_unit/__init__.py:
--------------------------------------------------------------------------------
1 | import ttsfrd
2 |
3 | ENG_LANG_MAPPING = {
4 | "PinYin": "zh-cn",
5 | "English": "en-us",
6 | "British": "en-gb",
7 | "ZhHK": "hk_cantonese",
8 | "Sichuan": "sichuan",
9 | "Japanese": "japanese",
10 | "WuuShangHai": "shanghai",
11 | "Indonesian": "indonesian",
12 | "Malay": "malay",
13 | "Filipino": "filipino",
14 | "Vietnamese": "vietnamese",
15 | "Korean": "korean",
16 | "Russian": "russian",
17 | }
18 |
19 |
20 | def text_to_mit_symbols(texts, resources_dir, speaker, lang="PinYin"):
21 | fe = ttsfrd.TtsFrontendEngine()
22 | fe.initialize(resources_dir)
23 | fe.set_lang_type(ENG_LANG_MAPPING[lang])
24 |
25 | symbols_lst = []
26 | for idx, text in enumerate(texts):
27 | text = text.strip()
28 | res = fe.gen_tacotron_symbols(text)
29 | res = res.replace("F7", speaker)
30 | sentences = res.split("\n")
31 | for sentence in sentences:
32 | arr = sentence.split("\t")
33 | # skip the empty line
34 | if len(arr) != 2:
35 | continue
36 |             sub_index, symbols = arr
37 | symbol_str = "{}_{}\t{}\n".format(idx, sub_index, symbols)
38 | symbols_lst.append(symbol_str)
39 |
40 | return symbols_lst
41 |
--------------------------------------------------------------------------------
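A hypothetical call to the frontend above (the resource directory must hold a ttsfrd resource set, e.g. the one extracted by kantts/preprocess/text_process.py; the path and speaker code here are placeholders):

    symbols = text_to_mit_symbols(
        ["你好，世界。"], "/path/to/resource", "F7", lang="PinYin"
    )
    for line in symbols:
        print(line, end="")   # "<idx>_<sub_index>\t<symbols>" per sentence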
/kantts/utils/ling_unit/cleaners.py:
--------------------------------------------------------------------------------
1 | """
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 |
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 | 1. "english_cleaners" for English text
7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 | the symbols in symbols.py to match your data).
11 | """
12 |
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 |
17 |
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r"\s+")
20 |
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [
23 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
24 | for x in [
25 | ("mrs", "misess"),
26 | ("mr", "mister"),
27 | ("dr", "doctor"),
28 | ("st", "saint"),
29 | ("co", "company"),
30 | ("jr", "junior"),
31 | ("maj", "major"),
32 | ("gen", "general"),
33 | ("drs", "doctors"),
34 | ("rev", "reverend"),
35 | ("lt", "lieutenant"),
36 | ("hon", "honorable"),
37 | ("sgt", "sergeant"),
38 | ("capt", "captain"),
39 | ("esq", "esquire"),
40 | ("ltd", "limited"),
41 | ("col", "colonel"),
42 | ("ft", "fort"),
43 | ]
44 | ]
45 |
46 |
47 | def expand_abbreviations(text):
48 | for regex, replacement in _abbreviations:
49 | text = re.sub(regex, replacement, text)
50 | return text
51 |
52 |
53 | def expand_numbers(text):
54 | return normalize_numbers(text)
55 |
56 |
57 | def lowercase(text):
58 | return text.lower()
59 |
60 |
61 | def collapse_whitespace(text):
62 | return re.sub(_whitespace_re, " ", text)
63 |
64 |
65 | def convert_to_ascii(text):
66 | return unidecode(text)
67 |
68 |
69 | def basic_cleaners(text):
70 | """Basic pipeline that lowercases and collapses whitespace without transliteration."""
71 | text = lowercase(text)
72 | text = collapse_whitespace(text)
73 | return text
74 |
75 |
76 | def transliteration_cleaners(text):
77 | """Pipeline for non-English text that transliterates to ASCII."""
78 | text = convert_to_ascii(text)
79 | text = lowercase(text)
80 | text = collapse_whitespace(text)
81 | return text
82 |
83 |
84 | def english_cleaners(text):
85 | """Pipeline for English text, including number and abbreviation expansion."""
86 | text = convert_to_ascii(text)
87 | text = lowercase(text)
88 | text = expand_numbers(text)
89 | text = expand_abbreviations(text)
90 | text = collapse_whitespace(text)
91 | return text
92 |
--------------------------------------------------------------------------------
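An illustrative run of english_cleaners (the expected output is traced by hand from the pipeline above, not taken from project docs):

    print(english_cleaners("Dr. Smith spent $3.50"))
    # -> doctor smith spent three dollars, fifty cents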
/kantts/utils/ling_unit/emotion_types.py:
--------------------------------------------------------------------------------
1 | emotion_types = [
2 | "emotion_none",
3 | "emotion_neutral",
4 | "emotion_angry",
5 | "emotion_disgust",
6 | "emotion_fear",
7 | "emotion_happy",
8 | "emotion_sad",
9 | "emotion_surprise",
10 | "emotion_calm",
11 | "emotion_gentle",
12 | "emotion_relax",
13 | "emotion_lyrical",
14 | "emotion_serious",
15 | "emotion_disgruntled",
16 | "emotion_satisfied",
17 | "emotion_disappointed",
18 | "emotion_excited",
19 | "emotion_anxiety",
20 | "emotion_jealousy",
21 | "emotion_hate",
22 | "emotion_pity",
23 | "emotion_pleasure",
24 | "emotion_arousal",
25 | "emotion_dominance",
26 | "emotion_placeholder1",
27 | "emotion_placeholder2",
28 | "emotion_placeholder3",
29 | "emotion_placeholder4",
30 | "emotion_placeholder5",
31 | "emotion_placeholder6",
32 | "emotion_placeholder7",
33 | "emotion_placeholder8",
34 | "emotion_placeholder9",
35 | ]
36 |
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/lang_symbols.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | from kantts.preprocess.languages import languages
3 | import logging
4 | import os
5 |
6 | syllable_flags = [
7 | "s_begin",
8 | "s_end",
9 | "s_none",
10 | "s_both",
11 | "s_middle",
12 | ]
13 |
14 | word_segments = [
15 | "word_begin",
16 | "word_end",
17 | "word_middle",
18 | "word_both",
19 | "word_none",
20 | ]
21 |
22 | LANGUAGES_DIR = os.path.join(
23 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
24 | "preprocess",
25 | "languages",
26 | )
27 |
28 |
29 | def parse_phoneset(phoneset_file):
30 | """Parse a phoneset file and return a list of symbols.
31 | Args:
32 | phoneset_file (str): Path to the phoneset file.
33 |
34 | Returns:
35 | list: A list of phones.
36 | """
37 | ns = "{http://schemas.alibaba-inc.com/tts}"
38 |
39 | phone_lst = []
40 | phoneset_root = ET.parse(phoneset_file).getroot()
41 | for phone_node in phoneset_root.findall(ns + "phone"):
42 | phone_lst.append(phone_node.find(ns + "name").text)
43 |
44 | for i in range(1, 5):
45 | phone_lst.append("#{}".format(i))
46 |
47 | return phone_lst
48 |
49 |
50 | def parse_tonelist(tonelist_file):
51 | """Parse a tonelist file and return a list of tones.
52 | Args:
53 | tonelist_file (str): Path to the tonelist file.
54 |
55 | Returns:
56 |         list: A list of tones.
57 | """
58 | tone_lst = []
59 | with open(tonelist_file, "r") as f:
60 | lines = f.readlines()
61 | for line in lines:
62 | tone = line.strip()
63 | if tone != "":
64 | tone_lst.append("tone{}".format(tone))
65 | else:
66 | tone_lst.append("tone_none")
67 |
68 | return tone_lst
69 |
70 |
71 | def get_language_symbols(language):
72 | """Get symbols of a language.
73 | Args:
74 | language (str): Language name.
75 | """
76 | language_dict = languages.get(language, None)
77 | if language_dict is None:
78 | logging.error("Language %s not supported. Using PinYin as default", language)
79 | language_dict = languages["PinYin"]
80 | language = "PinYin"
81 |
82 | language_dir = os.path.join(LANGUAGES_DIR, language)
83 | phoneset_file = os.path.join(language_dir, language_dict["phoneset_path"])
84 | tonelist_file = os.path.join(language_dir, language_dict["tonelist_path"])
85 | phones = parse_phoneset(phoneset_file)
86 | tones = parse_tonelist(tonelist_file)
87 |
88 | return phones, tones, syllable_flags, word_segments
89 |
--------------------------------------------------------------------------------
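A quick sketch of the lookup above (get_language_symbols falls back to PinYin for unknown languages):

    phones, tones, flags, segments = get_language_symbols("PinYin")
    # phones ends with the break symbols "#1".."#4" appended by parse_phoneset;
    # tones are "tone<X>" entries, with "tone_none" for blank tonelist lines.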
/kantts/utils/ling_unit/numbers.py:
--------------------------------------------------------------------------------
1 | import inflect
2 | import re
3 |
4 |
5 | _inflect = inflect.engine()
6 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
7 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
8 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
9 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
10 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
11 | _number_re = re.compile(r"[0-9]+")
12 |
13 |
14 | def _remove_commas(m):
15 | return m.group(1).replace(",", "")
16 |
17 |
18 | def _expand_decimal_point(m):
19 | return m.group(1).replace(".", " point ")
20 |
21 |
22 | def _expand_dollars(m):
23 | match = m.group(1)
24 | parts = match.split(".")
25 | if len(parts) > 2:
26 | return match + " dollars" # Unexpected format
27 | dollars = int(parts[0]) if parts[0] else 0
28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29 | if dollars and cents:
30 | dollar_unit = "dollar" if dollars == 1 else "dollars"
31 | cent_unit = "cent" if cents == 1 else "cents"
32 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
33 | elif dollars:
34 | dollar_unit = "dollar" if dollars == 1 else "dollars"
35 | return "%s %s" % (dollars, dollar_unit)
36 | elif cents:
37 | cent_unit = "cent" if cents == 1 else "cents"
38 | return "%s %s" % (cents, cent_unit)
39 | else:
40 | return "zero dollars"
41 |
42 |
43 | def _expand_ordinal(m):
44 | return _inflect.number_to_words(m.group(0))
45 |
46 |
47 | def _expand_number(m):
48 | num = int(m.group(0))
49 | if num > 1000 and num < 3000:
50 | if num == 2000:
51 | return "two thousand"
52 | elif num > 2000 and num < 2010:
53 | return "two thousand " + _inflect.number_to_words(num % 100)
54 | elif num % 100 == 0:
55 | return _inflect.number_to_words(num // 100) + " hundred"
56 | else:
57 | return _inflect.number_to_words(
58 | num, andword="", zero="oh", group=2
59 | ).replace(", ", " ")
60 | else:
61 | return _inflect.number_to_words(num, andword="")
62 |
63 |
64 | def normalize_numbers(text):
65 | text = re.sub(_comma_number_re, _remove_commas, text)
66 | text = re.sub(_pounds_re, r"\1 pounds", text)
67 | text = re.sub(_dollars_re, _expand_dollars, text)
68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 | text = re.sub(_ordinal_re, _expand_ordinal, text)
70 | text = re.sub(_number_re, _expand_number, text)
71 | return text
72 |
--------------------------------------------------------------------------------
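A worked example of the year-style branch in _expand_number (the output is traced from the rules above; inflect's grouped reading of 1969 is an assumption about its formatting):

    print(normalize_numbers("In 1969, 2000, and 2005"))
    # -> In nineteen sixty-nine, two thousand, and two thousand five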
/kantts/utils/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import subprocess
3 |
4 |
5 | def logging_to_file(log_file):
6 | logger = logging.getLogger()
7 | handler = logging.FileHandler(log_file)
8 | formatter = logging.Formatter(
9 | "%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
10 | datefmt="%Y-%m-%d:%H:%M:%S",
11 | )
12 | handler.setFormatter(formatter)
13 | logger.addHandler(handler)
14 | logger.setLevel(logging.INFO)
15 |
16 |
17 | def get_git_revision_short_hash():
18 | return (
19 | subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
20 | .decode("ascii")
21 | .strip()
22 | )
23 |
24 |
25 | def get_git_revision_hash():
26 | return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()
27 |
--------------------------------------------------------------------------------
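A two-line usage sketch for the helpers above (the log path is a placeholder):

    import logging

    logging_to_file("/tmp/train.log")
    logging.info("git revision: %s", get_git_revision_short_hash())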
/kantts/utils/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 |
3 | matplotlib.use("Agg") # NOQA: E402
4 | try:
5 | import matplotlib.pyplot as plt
6 | except ImportError:
7 | raise ImportError("Please install matplotlib.")
8 |
9 |
10 | def plot_spectrogram(spectrogram):
11 | fig, ax = plt.subplots(figsize=(12, 8))
12 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
13 |     fig.colorbar(im, ax=ax)
14 |
15 | fig.canvas.draw()
16 | plt.close()
17 |
18 | return fig
19 |
20 |
21 | def plot_alignment(alignment, info=None):
22 | fig, ax = plt.subplots()
23 | im = ax.imshow(alignment, aspect="auto", origin="lower", interpolation="none")
24 | fig.colorbar(im, ax=ax)
25 | xlabel = "Input timestep"
26 | if info is not None:
27 | xlabel += "\t" + info
28 | plt.xlabel(xlabel)
29 | plt.ylabel("Output timestep")
30 | fig.canvas.draw()
31 | plt.close()
32 |
33 | return fig
34 |
--------------------------------------------------------------------------------
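A small sketch of handing these figures to TensorBoard, the usual consumer of closed matplotlib figures like the ones returned above (the log directory and dummy data are placeholders):

    import numpy as np
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("runs/demo")
    fig = plot_spectrogram(np.random.rand(80, 200))  # (#mels, #frames)
    writer.add_figure("mel", fig, global_step=0)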
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # How to run notebook examples?
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | autopep8
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | version = "0.0.1"
4 |
5 | with open("README.md", "r", encoding="utf-8") as readme_file:
6 | README = readme_file.read()
7 |
8 | setup(
9 | name="kantts",
10 | version=version,
11 | url="https://github.com/AlibabaResearch/KAN-TTS",
12 | author="Jin",
13 |     description="Alibaba DAMO Speech-Lab Text to Speech deep learning toolchain",
14 | long_description=README,
15 | long_description_content_type="text/markdown",
16 | license="MIT",
17 | # cython
18 | # include_dirs=numpy.get_include(),
19 | # ext_modules=find_cython_extensions(),
20 | # package
21 | include_package_data=True,
22 | packages=find_packages(include=["kantts*"]),
23 | project_urls={
24 | "Documentation": "https://github.com/AlibabaResearch/KAN-TTS/wiki",
25 | "Tracker": "",
26 | "Repository": "https://github.com/AlibabaResearch/KAN-TTS",
27 | "Discussions": "",
28 | },
29 | python_requires=">=3.7.0, <3.9",
30 | classifiers=[
31 | "Programming Language :: Python",
32 | "Programming Language :: Python :: 3",
33 | "Programming Language :: Python :: 3.7",
34 | "Programming Language :: Python :: 3.8",
35 | "Development Status :: 3 - Alpha",
36 | "Intended Audience :: Science/Research",
37 | "Intended Audience :: Developers",
38 | "Operating System :: POSIX :: Linux",
39 | "License :: OSI Approved :: MIT License",
40 | "Topic :: Software Development",
41 | "Topic :: Software Development :: Libraries :: Python Modules",
42 | "Topic :: Multimedia :: Sound/Audio :: Speech",
43 | "Topic :: Multimedia :: Sound/Audio",
44 | "Topic :: Multimedia",
45 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
46 | ],
47 | zip_safe=False,
48 | )
49 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/test/__init__.py
--------------------------------------------------------------------------------