├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── environment.yaml ├── kantts ├── __init__.py ├── bin │ ├── __init__.py │ ├── infer_hifigan.py │ ├── infer_sambert.py │ ├── text_to_wav.py │ ├── train_hifigan.py │ ├── train_sambert.py │ └── train_sybert.py ├── configs │ ├── audio_config_16k.yaml │ ├── audio_config_24k.yaml │ ├── audio_config_48k.yaml │ ├── audio_config_8k.yaml │ ├── audio_config_se_16k.yaml │ ├── hifigan_noncausal_nsf_global_v1_16k.yaml │ ├── hifigan_noncausal_nsf_v1_16k.yaml │ ├── hifigan_noncausal_v1_16k.yaml │ ├── hifigan_v1_16k.yaml │ ├── hifigan_v1_24k.yaml │ ├── hifigan_v1_48k.yaml │ ├── hifigan_v1_8k.yaml │ ├── hifigan_v1_nsf_24k.yaml │ ├── sambert_16k.yaml │ ├── sambert_16k_MAS.yaml │ ├── sambert_16k_MAS_byte.yaml │ ├── sambert_24k.yaml │ ├── sambert_48k.yaml │ ├── sambert_fp_8k.yaml │ ├── sambert_nsf_16k.yaml │ ├── sambert_nsf_24k.yaml │ ├── sambert_se_nsf_global_16k.yaml │ ├── sambert_sichuan_16k.yaml │ └── sybert.yaml ├── datasets │ ├── __init__.py │ ├── data_types.py │ └── dataset.py ├── models │ ├── __init__.py │ ├── hifigan │ │ ├── hifigan.py │ │ └── layers.py │ ├── pqmf.py │ ├── sambert │ │ ├── __init__.py │ │ ├── adaptors.py │ │ ├── alignment.py │ │ ├── attention.py │ │ ├── fsmn.py │ │ ├── kantts_sambert.py │ │ └── positions.py │ └── utils.py ├── preprocess │ ├── __init__.py │ ├── audio_processor │ │ ├── __init__.py │ │ ├── audio_processor.py │ │ └── core │ │ │ ├── __init__.py │ │ │ ├── dsp.py │ │ │ └── utils.py │ ├── data_process.py │ ├── fp_processor.py │ ├── languages │ │ ├── PinYin │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── Sichuan │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── WuuShanghai │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ ├── ZhHK │ │ │ ├── En2ChPhoneMap.txt │ │ │ ├── PhoneSet.xml │ │ │ ├── PosSet.xml │ │ │ ├── py2phoneMap.txt │ │ │ └── tonelist.txt │ │ └── __init__.py │ ├── script_convertor │ │ ├── TextScriptConvertor.py │ │ ├── __init__.py │ │ └── core │ │ │ ├── Phone.py │ │ │ ├── PhoneSet.py │ │ │ ├── Pos.py │ │ │ ├── PosSet.py │ │ │ ├── Script.py │ │ │ ├── ScriptItem.py │ │ │ ├── ScriptSentence.py │ │ │ ├── ScriptWord.py │ │ │ ├── Syllable.py │ │ │ ├── SyllableFormatter.py │ │ │ ├── XmlObj.py │ │ │ ├── __init__.py │ │ │ ├── core_types.py │ │ │ └── utils.py │ ├── se_processor │ │ ├── D_TDNN.py │ │ ├── __init__.py │ │ ├── layers.py │ │ └── se_processor.py │ └── text_process.py ├── train │ ├── __init__.py │ ├── loss.py │ ├── scheduler.py │ └── trainer.py └── utils │ ├── __init__.py │ ├── audio_torch.py │ ├── ling_unit │ ├── __init__.py │ ├── cleaners.py │ ├── emotion_types.py │ ├── lang_symbols.py │ ├── ling_unit.py │ └── numbers.py │ ├── log.py │ └── plot.py ├── notebooks └── README.md ├── requirements.txt ├── setup.py └── test └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually 
these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | # 162 | # MISC 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 20.8b1 4 | hooks: 5 | - id: black 6 | additional_dependencies: ['click==8.0.4'] 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.8.4 9 | hooks: 10 | - id: flake8 11 | args: ['--max-line-length=120', '--extend-ignore=E203'] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Alibaba Research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KAN-TTS 2 | 3 | With KAN-TTS you can train your own TTS model from zero to hero :). 4 | 5 | ## Models 6 | Currently we support SAMBERT and HiFi-GAN; other models are coming soon. 7 | 8 | ## Supported Languages 9 | | Language | Model Links | 10 | | :---: | :---: | 11 | | Mandarin | https://modelscope.cn/models?name=zhcn&page=1&tasks=text-to-speech&type=audio | 12 | | English | https://modelscope.cn/models?name=enus&page=1&tasks=text-to-speech&type=audio | 13 | | British | https://modelscope.cn/models?name=engb&page=1&tasks=text-to-speech&type=audio | 14 | | Shanghainese | https://modelscope.cn/models?name=WuuShanghai&page=1&tasks=text-to-speech&type=audio | 15 | | Sichuanese | https://modelscope.cn/models?name=Sichuan&page=1&tasks=text-to-speech&type=audio | 16 | | Cantonese | https://modelscope.cn/models?name=Cantonese&page=1&tasks=text-to-speech&type=audio | 17 | | Italian | https://modelscope.cn/models?name=itit&page=1&tasks=text-to-speech&type=audio | 18 | | Spanish | https://modelscope.cn/models?name=eses&page=1&tasks=text-to-speech&type=audio | 19 | | Russian | https://modelscope.cn/models?name=ruru&page=1&tasks=text-to-speech&type=audio | 20 | | Korean | https://modelscope.cn/models?name=kokr&page=1&tasks=text-to-speech&type=audio | 21 | More languages are coming soon. 22 | 23 | ## Training Tutorial 24 | You can find the training tutorial on our wiki page [KAN-TTS Wiki](https://github.com/AlibabaResearch/KAN-TTS/wiki).
25 | 26 | ## ModelScope Demo 27 | Try our demo on ModelScope [KAN-TTS Demo](https://modelscope.cn/models?page=1&tasks=text-to-speech). 28 | 29 | ## Contribute to this repo 30 | 31 | ```shell 32 | pip install -r requirements.txt 33 | pre-commit install 34 | ``` 35 | 36 | ## Contact us 37 | If you have any questions, please feel free to contact us. 38 | 39 | Scan the QR code to join our DingTalk group. 40 | 41 | 42 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: maas 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - audioread 8 | - cudatoolkit=10.1 9 | - ffmpeg 10 | - lame 11 | - librosa=0.9.2 12 | - libsndfile 13 | - matplotlib=3.5.1 14 | - matplotlib-base=3.5.1 15 | - numba 16 | - numpy 17 | - unidecode 18 | - inflect 19 | - numpy-base 20 | - pip 21 | - protobuf=3.20.1 22 | - pysocks=1.7.1 23 | - pysoundfile 24 | - python=3.7.13 25 | - python-dateutil=2.8.2 26 | - python_abi=3.7 27 | - pytorch=1.7.0 28 | - pywavelets=1.3.0 29 | - pyyaml=6.0 30 | - readline 31 | - scikit-learn=1.0.2 32 | - scipy=1.7.3 33 | - setuptools=61.2.0 34 | - six=1.16.0 35 | - sqlite=3.38.5 36 | - tensorboardx=2.2 37 | - threadpoolctl=3.1.0 38 | - tk=8.6.12 39 | - torchaudio=0.7.0 40 | - torchvision=0.8.0 41 | - tqdm 42 | - urllib3 43 | - wheel 44 | - yaml=0.2.5 45 | - pip: 46 | - appnope==0.1.3 47 | - backcall==0.2.0 48 | - cython==0.29.30 49 | - dataclasses==0.6 50 | - future==0.18.2 51 | - greenlet==1.1.2 52 | - ipdb 53 | - ipython 54 | - jedi==0.18.1 55 | - matplotlib-inline==0.1.3 56 | - msgpack==1.0.4 57 | - parso==0.8.3 58 | - pexpect==4.8.0 59 | - pickleshare==0.7.5 60 | - prompt-toolkit==3.0.30 61 | - ptyprocess==0.7.0 62 | - pygments==2.12.0 63 | - pysptk 64 | - git+https://github.com/fbcotter/pytorch_wavelets.git 65 | - sox 66 | - toml==0.10.2 67 | - traitlets==5.3.0 68 | - wcwidth==0.2.5 69 | - bitstring==3.1.6 70 | - --find-links https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 71 | - ttsfrd 72 | -------------------------------------------------------------------------------- /kantts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/__init__.py -------------------------------------------------------------------------------- /kantts/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/bin/__init__.py -------------------------------------------------------------------------------- /kantts/bin/infer_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | import soundfile as sf 6 | import yaml 7 | import logging 8 | import numpy as np 9 | import time 10 | import glob 11 | 12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 14 | 15 | try: 16 | from kantts.utils.log import logging_to_file 17 | except ImportError: 18 | raise ImportError("Please install kantts.") 19 | 20 | logging.basicConfig( 21 | # filename=os.path.join(stage_dir, 'stdout.log'), 22 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 
23 | datefmt="%Y-%m-%d:%H:%M:%S", 24 | level=logging.INFO, 25 | ) 26 | 27 | 28 | def count_parameters(model): 29 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 30 | 31 | 32 | def load_model(ckpt, config=None): 33 | # load config if not provided 34 | if config is None: 35 | dirname = os.path.dirname(os.path.dirname(ckpt)) 36 | config = os.path.join(dirname, "config.yaml") 37 | with open(config) as f: 38 | config = yaml.load(f, Loader=yaml.Loader) 39 | 40 | # lazy load for circular error 41 | from kantts.models.hifigan.hifigan import Generator 42 | 43 | model = Generator(**config["Model"]["Generator"]["params"]) 44 | states = torch.load(ckpt, map_location="cpu") 45 | model.load_state_dict(states["model"]["generator"]) 46 | 47 | # add pqmf if needed 48 | if config["Model"]["Generator"]["params"]["out_channels"] > 1: 49 | # lazy load for circular error 50 | from kantts.models.pqmf import PQMF 51 | 52 | model.pqmf = PQMF() 53 | 54 | return model 55 | 56 | 57 | def binarize(mel, threshold=0.6): 58 | # vuv binarize 59 | res_mel = mel.copy() 60 | index = np.where(mel[:, -1] < threshold)[0] 61 | res_mel[:, -1] = 1.0 62 | res_mel[:, -1][index] = 0.0 63 | return res_mel 64 | 65 | 66 | def hifigan_infer(input_mel, ckpt_path, output_dir, config=None): 67 | if not torch.cuda.is_available(): 68 | device = torch.device("cpu") 69 | else: 70 | torch.backends.cudnn.benchmark = True 71 | device = torch.device("cuda", 0) 72 | 73 | if config is not None: 74 | with open(config, "r") as f: 75 | config = yaml.load(f, Loader=yaml.Loader) 76 | else: 77 | config_path = os.path.join( 78 | os.path.dirname(os.path.dirname(ckpt_path)), "config.yaml" 79 | ) 80 | if not os.path.exists(config_path): 81 | raise ValueError("config file not found: {}".format(config_path)) 82 | with open(config_path, "r") as f: 83 | config = yaml.load(f, Loader=yaml.Loader) 84 | 85 | for key, value in config.items(): 86 | logging.info(f"{key} = {value}") 87 | 88 | # check directory existence 89 | if not os.path.exists(output_dir): 90 | os.makedirs(output_dir) 91 | 92 | logging_to_file(os.path.join(output_dir, "stdout.log")) 93 | 94 | if os.path.isfile(input_mel): 95 | mel_lst = [input_mel] 96 | elif os.path.isdir(input_mel): 97 | mel_lst = glob.glob(os.path.join(input_mel, "*.npy")) 98 | else: 99 | raise ValueError("input_mel should be a file or a directory") 100 | 101 | model = load_model(ckpt_path, config) 102 | 103 | logging.info(f"Loaded model parameters from {ckpt_path}.") 104 | model.remove_weight_norm() 105 | model = model.eval().to(device) 106 | 107 | with torch.no_grad(): 108 | start = time.time() 109 | pcm_len = 0 110 | for mel in mel_lst: 111 | utt_id = os.path.splitext(os.path.basename(mel))[0] 112 | mel_data = np.load(mel) 113 | if model.nsf_enable: 114 | mel_data = binarize(mel_data) 115 | # generate 116 | mel_data = torch.tensor(mel_data, dtype=torch.float).to(device) 117 | # (T, C) -> (B, C, T) 118 | mel_data = mel_data.transpose(1, 0).unsqueeze(0) 119 | y = model(mel_data) 120 | if hasattr(model, "pqmf"): 121 | y = model.pqmf.synthesis(y) 122 | y = y.view(-1).cpu().numpy() 123 | pcm_len += len(y) 124 | 125 | # save as PCM 16 bit wav file 126 | sf.write( 127 | os.path.join(output_dir, f"{utt_id}_gen.wav"), 128 | y, 129 | config["audio_config"]["sampling_rate"], 130 | "PCM_16", 131 | ) 132 | rtf = (time.time() - start) / ( 133 | pcm_len / config["audio_config"]["sampling_rate"] 134 | ) 135 | 136 | # report average RTF 137 | logging.info( 138 | f"Finished generation of {len(mel_lst)} utterances (RTF = 
{rtf:.03f})." 139 | ) 140 | 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser(description="Infer hifigan model") 144 | parser.add_argument( 145 | "--ckpt", type=str, required=True, help="Path to model checkpoint" 146 | ) 147 | parser.add_argument( 148 | "--input_mel", 149 | type=str, 150 | required=True, 151 | help="Path to input mel file or directory containing mel files", 152 | ) 153 | parser.add_argument( 154 | "--output_dir", type=str, required=True, help="Path to output directory" 155 | ) 156 | parser.add_argument("--config", type=str, default=None, help="Path to config file") 157 | args = parser.parse_args() 158 | hifigan_infer( 159 | args.input_mel, 160 | args.ckpt, 161 | args.output_dir, 162 | args.config, 163 | ) 164 | -------------------------------------------------------------------------------- /kantts/bin/text_to_wav.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import yaml 5 | import logging 6 | import zipfile 7 | from glob import glob 8 | import soundfile as sf 9 | import numpy as np 10 | 11 | 12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 13 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 14 | 15 | try: 16 | from kantts.bin.infer_sambert import am_infer 17 | from kantts.bin.infer_hifigan import hifigan_infer 18 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols 19 | except ImportError: 20 | raise ImportError("Please install kantts.") 21 | 22 | logging.basicConfig( 23 | # filename=os.path.join(stage_dir, 'stdout.log'), 24 | format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 25 | datefmt="%Y-%m-%d:%H:%M:%S", 26 | level=logging.INFO, 27 | ) 28 | 29 | 30 | def concat_process(chunked_dir, output_dir): 31 | wav_files = sorted(glob(os.path.join(chunked_dir, "*.wav"))) 32 | print(wav_files) 33 | sentence_sil = 0.28 # seconds 34 | end_sil = 0.05 # seconds 35 | 36 | cnt = 0 37 | wav_concat = None 38 | main_id, sub_id = 0, 0 39 | 40 | while cnt < len(wav_files): 41 | wav_file = os.path.join( 42 | chunked_dir, "{}_{}_mel_gen.wav".format(main_id, sub_id) 43 | ) 44 | if os.path.exists(wav_file): 45 | wav, sr = sf.read(wav_file) 46 | sentence_sil_samples = int(sentence_sil * sr) 47 | end_sil_samples = int(end_sil * sr) 48 | if sub_id == 0: 49 | wav_concat = wav 50 | else: 51 | wav_concat = np.concatenate( 52 | (wav_concat, np.zeros(sentence_sil_samples), wav), axis=0 53 | ) 54 | 55 | sub_id += 1 56 | cnt += 1 57 | else: 58 | if wav_concat is not None: 59 | wav_concat = np.concatenate( 60 | (wav_concat, np.zeros(end_sil_samples)), axis=0 61 | ) 62 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr) 63 | 64 | main_id += 1 65 | sub_id = 0 66 | wav_concat = None 67 | 68 | if cnt == len(wav_files): 69 | wav_concat = np.concatenate((wav_concat, np.zeros(end_sil_samples)), axis=0) 70 | sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr) 71 | 72 | 73 | def text_to_wav( 74 | text_file, 75 | output_dir, 76 | resources_zip_file, 77 | am_ckpt, 78 | voc_ckpt, 79 | speaker=None, 80 | se_file=None, 81 | lang="PinYin", 82 | ): 83 | os.makedirs(output_dir, exist_ok=True) 84 | os.makedirs(os.path.join(output_dir, "res_wavs"), exist_ok=True) 85 | 86 | resource_root_dir = os.path.dirname(resources_zip_file) 87 | resource_dir = os.path.join(resource_root_dir, "resource") 88 | 89 | if not os.path.exists(resource_dir): 90 | logging.info("Extracting 
resources...") 91 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref: 92 | zip_ref.extractall(resource_root_dir) 93 | 94 | with open(text_file, "r") as text_data: 95 | texts = text_data.readlines() 96 | 97 | logging.info("Converting text to symbols...") 98 | am_config = os.path.join(os.path.dirname(os.path.dirname(am_ckpt)), "config.yaml") 99 | with open(am_config, "r") as f: 100 | config = yaml.load(f, Loader=yaml.Loader) 101 | if speaker is None: 102 | speaker = config["linguistic_unit"]["speaker_list"].split(",")[0] 103 | symbols_lst = text_to_symbols(texts, resource_dir, speaker, lang) 104 | symbols_file = os.path.join(output_dir, "symbols.lst") 105 | with open(symbols_file, "w") as symbol_data: 106 | for symbol in symbols_lst: 107 | symbol_data.write(symbol) 108 | 109 | logging.info("AM is inferring...") 110 | am_infer(symbols_file, am_ckpt, output_dir, se_file) 111 | 112 | logging.info("Vocoder is inferring...") 113 | hifigan_infer(os.path.join(output_dir, "feat"), voc_ckpt, output_dir) 114 | 115 | concat_process(output_dir, os.path.join(output_dir, "res_wavs")) 116 | 117 | logging.info("Text to wav finished!") 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser(description="Text to wav") 122 | parser.add_argument("--txt", type=str, required=True, help="Path to text file") 123 | parser.add_argument( 124 | "--output_dir", type=str, required=True, help="Path to output directory" 125 | ) 126 | parser.add_argument( 127 | "--res_zip", type=str, required=True, help="Path to resource zip file" 128 | ) 129 | parser.add_argument( 130 | "--am_ckpt", type=str, required=True, help="Path to am ckpt file" 131 | ) 132 | parser.add_argument( 133 | "--voc_ckpt", type=str, required=True, help="Path to voc ckpt file" 134 | ) 135 | parser.add_argument( 136 | "--speaker", 137 | type=str, 138 | required=False, 139 | default=None, 140 | help="The speaker name, default is the first speaker", 141 | ) 142 | parser.add_argument( 143 | "--se_file", 144 | type=str, 145 | required=False, 146 | default=None, 147 | help="The speaker embedding file, default is None", 148 | ) 149 | parser.add_argument( 150 | "--lang", 151 | type=str, 152 | default="PinYin", 153 | help="""The language of the text, default is PinYin, other options are: 154 | English, 155 | British, 156 | ZhHK, 157 | WuuShanghai, 158 | Sichuan, 159 | Indonesian, 160 | Malay, 161 | Filipino, 162 | Vietnamese, 163 | Korean, 164 | Russian 165 | """, 166 | ) 167 | args = parser.parse_args() 168 | text_to_wav( 169 | args.txt, 170 | args.output_dir, 171 | args.res_zip, 172 | args.am_ckpt, 173 | args.voc_ckpt, 174 | args.speaker, 175 | args.se_file, 176 | args.lang, 177 | ) 178 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_16k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 16000 12 | hop_length: 200 13 | win_length: 1000 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | --------------------------------------------------------------------------------
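As a quick orientation for the audio configs above and below: hop_length and win_length are given in samples, so the 16 kHz config produces one 80-dim mel frame every 12.5 ms. A minimal sketch of loading and sanity-checking it (the path and printout are illustrative, not part of the repo):

```python
import yaml

# Load the 16 kHz audio config; the path is illustrative.
with open("kantts/configs/audio_config_16k.yaml") as f:
    audio_config = yaml.safe_load(f)["audio_config"]

sr = audio_config["sampling_rate"]   # 16000
hop = audio_config["hop_length"]     # 200 samples -> 12.5 ms per mel frame
win = audio_config["win_length"]     # 1000 samples, zero-padded up to n_fft
assert win <= audio_config["n_fft"]  # the analysis window must fit in the FFT
print(f"{sr / hop:.0f} mel frames per second")  # 80
```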
/kantts/configs/audio_config_24k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 24000 12 | hop_length: 240 13 | win_length: 1024 14 | n_fft: 1024 15 | n_mels: 80 16 | fmin: 50.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_48k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 48000 12 | hop_length: 600 13 | win_length: 2400 14 | n_fft: 4096 15 | n_mels: 128 16 | fmin: 0.0 17 | fmax: 12000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_8k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 8000 12 | hop_length: 100 13 | win_length: 600 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 4000.0 18 | phone_level_feature: True 19 | 20 | # Normalization 21 | norm_type: "mean_std" # "mean_std" or "global" 22 | max_norm: 1.0 23 | symmetric: False 24 | min_level_db: -100.0 25 | ref_level_db: 20 26 | 27 | num_workers: 16 28 | 29 | -------------------------------------------------------------------------------- /kantts/configs/audio_config_se_16k.yaml: -------------------------------------------------------------------------------- 1 | # Audio processing configs 2 | 3 | audio_config: 4 | # Preprocess 5 | wav_normalize: True 6 | trim_silence: True 7 | trim_silence_threshold_db: 60 8 | preemphasize: False 9 | 10 | # Feature extraction 11 | sampling_rate: 16000 12 | hop_length: 200 13 | win_length: 1000 14 | n_fft: 2048 15 | n_mels: 80 16 | fmin: 0.0 17 | fmax: 8000.0 18 | phone_level_feature: True 19 | se_feature: True 20 | 21 | # Normalization 22 | norm_type: "mean_std" # "mean_std" or "global" 23 | max_norm: 1.0 24 | symmetric: False 25 | min_level_db: -100.0 26 | ref_level_db: 20 27 | 28 | num_workers: 16 29 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_noncausal_v1_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 
12 | upsample_scales: [10, 5, 2, 2] 13 | upsample_kernal_sizes: [20, 11, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: false 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 16000 135 | fft_size: 2048 136 | hop_size: 200 137 | win_length: 1000 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 12 | upsample_scales: [10, 5, 2, 2] 13 | upsample_kernal_sizes: [20, 10, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0
71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 16000 135 | fft_size: 2048 136 | hop_size: 200 137 | win_length: 1000 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
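A constraint implicit in the generator settings above: the upsample_scales must multiply out to the mel hop size (10 * 5 * 2 * 2 = 200 = hop_length in audio_config_16k.yaml), so that one mel frame expands to exactly one hop of waveform, and batch_max_steps must be a multiple of that hop, as its comment says. A small sketch of the check (functools.reduce keeps it compatible with the Python 3.7 pinned in environment.yaml):

```python
from functools import reduce
from operator import mul

upsample_scales = [10, 5, 2, 2]  # from hifigan_v1_16k.yaml
hop_length = 200                 # from audio_config_16k.yaml
batch_max_steps = 9600           # from hifigan_v1_16k.yaml

# One mel frame must be upsampled to exactly one hop of waveform.
assert reduce(mul, upsample_scales) == hop_length
# Each training clip must cover a whole number of mel frames.
assert batch_max_steps % hop_length == 0
print(batch_max_steps // hop_length, "mel frames per clip")  # 48
```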
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [8, 5, 3, 2] 13 | upsample_kernal_sizes: [16, 10, 6, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5] 17 | - [1, 3, 5] 18 | - [1, 3, 5] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 24000 135 | fft_size: 1024 136 | hop_size: 240 137 | win_length: 1024 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 8000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 9600 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
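All the HiFi-GAN configs in this directory share the same optimization schedule: Adam at 2.0e-4, halved by MultiStepLR (gamma: 0.5) at each of the four milestones. A sketch of the resulting learning rate over training, assuming standard PyTorch MultiStepLR semantics:

```python
base_lr, gamma = 2.0e-4, 0.5
milestones = [200000, 400000, 600000, 800000]

def lr_at(step):
    # MultiStepLR multiplies the base LR by gamma once per milestone reached.
    return base_lr * gamma ** sum(step >= m for m in milestones)

for step in (0, 200_000, 400_000, 800_000, 2_500_000):
    print(step, lr_at(step))  # 2e-4, 1e-4, 5e-5, 1.25e-5, 1.25e-5
```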
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_48k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 128 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [10, 5, 3, 2, 2] 13 | upsample_kernal_sizes: [20, 10, 6, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 48000 135 | fft_size: 4096 136 | hop_size: 600 137 | win_length: 2400 138 | window: "hann" 139 | num_mels: 128 140 | fmin: 0 141 | fmax: 12000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 19200 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
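Note how the mel_loss block mirrors audio_config_48k.yaml (4096-point FFT, hop 600, 128 mels up to 12 kHz), so adversarial training is scored in the same feature space the acoustic model emits. A sketch of the corresponding analysis filterbank with librosa (pinned at 0.9.2 in environment.yaml); the variable names are illustrative:

```python
import librosa

# Mel filterbank matching the 48 kHz mel_loss / audio_config settings.
mel_basis = librosa.filters.mel(
    sr=48000, n_fft=4096, n_mels=128, fmin=0.0, fmax=12000.0
)
print(mel_basis.shape)  # (128, 2049) = (n_mels, n_fft // 2 + 1)
```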
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_8k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 256 11 | kernel_size: 7 12 | upsample_scales: [5, 5, 2, 2] 13 | upsample_kernal_sizes: [10, 10, 4, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5, 7] 17 | - [1, 3, 5, 7] 18 | - [1, 3, 5, 7] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | optimizer: 26 | type: Adam 27 | params: 28 | lr: 2.0e-4 29 | betas: [0.5, 0.9] 30 | weight_decay: 0.0 31 | scheduler: 32 | type: MultiStepLR 33 | params: 34 | gamma: 0.5 35 | milestones: 36 | - 200000 37 | - 400000 38 | - 600000 39 | - 800000 40 | 41 | ########################################################### 42 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 43 | ########################################################### 44 | MultiScaleDiscriminator: 45 | params: 46 | scales: 3 47 | downsample_pooling: "DWT" 48 | downsample_pooling_params: 49 | kernel_size: 4 50 | stride: 2 51 | padding: 2 52 | discriminator_params: 53 | in_channels: 1 54 | out_channels: 1 55 | kernel_sizes: [15, 41, 5, 3] 56 | channels: 128 57 | max_downsample_channels: 1024 58 | max_groups: 16 59 | bias: true 60 | downsample_scales: [4, 4, 4, 4, 1] 61 | nonlinear_activation: "LeakyReLU" 62 | nonlinear_activation_params: 63 | negative_slope: 0.1 64 | follow_official_norm: true 65 | optimizer: 66 | type: Adam 67 | params: 68 | lr: 2.0e-4 69 | betas: [0.5, 0.9] 70 | weight_decay: 0.0 71 | scheduler: 72 | type: MultiStepLR 73 | params: 74 | gamma: 0.5 75 | milestones: 76 | - 200000 77 | - 400000 78 | - 600000 79 | - 800000 80 | 81 | MultiPeriodDiscriminator: 82 | params: 83 | periods: [2, 3, 5, 7, 11] 84 | discriminator_params: 85 | in_channels: 1 86 | out_channels: 1 87 | kernel_sizes: [5, 3] 88 | channels: 32 89 | downsample_scales: [3, 3, 3, 3, 1] 90 | max_downsample_channels: 1024 91 | bias: true 92 | nonlinear_activation: "LeakyReLU" 93 | nonlinear_activation_params: 94 | negative_slope: 0.1 95 | use_spectral_norm: false 96 | optimizer: 97 | type: Adam 98 | params: 99 | lr: 2.0e-4 100 | betas: [0.5, 0.9] 101 | weight_decay: 0.0 102 | scheduler: 103 | type: MultiStepLR 104 | params: 105 | gamma: 0.5 106 | milestones: 107 | - 200000 108 | - 400000 109 | - 600000 110 | - 800000 111 | 112 | #################################################### 113 | # LOSS SETTING # 114 | #################################################### 115 | Loss: 116 | generator_adv_loss: 117 | enable: True 118 | params: 119 | average_by_discriminators: False 120 | weights: 1.0 121 | 122 | discriminator_adv_loss: 123 | enable: True 124 | params: 125 | average_by_discriminators: False 126 | weights: 1.0 127 | 128 | stft_loss: 129 | enable: False # Whether to use multi-resolution STFT loss. 
130 | 131 | mel_loss: 132 | enable: True 133 | params: 134 | fs: 8000 135 | fft_size: 2048 136 | hop_size: 100 137 | win_length: 600 138 | window: "hann" 139 | num_mels: 80 140 | fmin: 0 141 | fmax: 4000 142 | log_base: null 143 | weights: 45.0 144 | 145 | subband_stft_loss: 146 | enable: False 147 | params: 148 | fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. 149 | hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss. 150 | win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 151 | window: "hann_window" # Window function for STFT-based loss. 152 | 153 | feat_match_loss: 154 | enable: True 155 | params: 156 | average_by_discriminators: false 157 | average_by_layers: false 158 | weights: 2.0 159 | 160 | 161 | ########################################################### 162 | # DATA LOADER SETTING # 163 | ########################################################### 164 | batch_size: 16 165 | batch_max_steps: 6000 # Length of each audio in batch. Make sure divisible by hop_size. 166 | pin_memory: True 167 | num_workers: 2 # FIXME: set > 0 may get stuck on macOS 168 | remove_short_samples: False 169 | allow_cache: True 170 | 171 | generator_grad_norm: -1 172 | 173 | discriminator_grad_norm: -1 174 | 175 | ########################################################### 176 | # INTERVAL SETTING # 177 | ########################################################### 178 | generator_train_start_steps: 1 # Number of steps to start to train generator. 179 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 180 | train_max_steps: 2500000 # Number of training steps. 181 | save_interval_steps: 20000 # Interval steps to save checkpoint. 182 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 183 | log_interval_steps: 1000 # Interval steps to record the training log. 184 | 185 | ########################################################### 186 | # OTHER SETTING # 187 | ########################################################### 188 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
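The MultiPeriodDiscriminator block repeated across these configs follows the HiFi-GAN recipe: each of the five sub-discriminators folds the waveform into a 2-D grid whose width is its period (2, 3, 5, 7, or 11), so periodic structure lines up in columns for 2-D convolutions. A minimal sketch of that fold, assuming a (batch, 1, samples) tensor; the helper name is hypothetical:

```python
import torch
import torch.nn.functional as F

def fold_by_period(wav, period):
    """Reshape (B, 1, T) audio to (B, 1, T // period, period), padding T up."""
    b, c, t = wav.shape
    if t % period != 0:
        # Right-pad so the length is a multiple of the period.
        wav = F.pad(wav, (0, period - t % period), mode="reflect")
        t = wav.shape[-1]
    return wav.view(b, c, t // period, period)

clip = torch.randn(1, 1, 6000)  # one 8 kHz training clip (batch_max_steps)
for p in (2, 3, 5, 7, 11):
    print(p, tuple(fold_by_period(clip, p).shape))
```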
189 | -------------------------------------------------------------------------------- /kantts/configs/hifigan_v1_nsf_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: hifigan 2 | Model: 3 | ########################################################### 4 | # GENERATOR NETWORK ARCHITECTURE SETTING # 5 | ########################################################### 6 | Generator: 7 | params: 8 | in_channels: 80 9 | out_channels: 1 10 | channels: 512 11 | kernel_size: 7 12 | upsample_scales: [8, 5, 3, 2] 13 | upsample_kernal_sizes: [16, 10, 6, 4] 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilations: 16 | - [1, 3, 5] 17 | - [1, 3, 5] 18 | - [1, 3, 5] 19 | bias: true 20 | causal: true 21 | nonlinear_activation: "LeakyReLU" 22 | nonlinear_activation_params: 23 | negative_slope: 0.1 24 | use_weight_norm: true 25 | nsf_params: 26 | nb_harmonics: 7 27 | sampling_rate: 24000 28 | optimizer: 29 | type: Adam 30 | params: 31 | lr: 2.0e-4 32 | betas: [0.5, 0.9] 33 | weight_decay: 0.0 34 | scheduler: 35 | type: MultiStepLR 36 | params: 37 | gamma: 0.5 38 | milestones: 39 | - 200000 40 | - 400000 41 | - 600000 42 | - 800000 43 | 44 | ########################################################### 45 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 46 | ########################################################### 47 | MultiScaleDiscriminator: 48 | params: 49 | scales: 3 50 | downsample_pooling: "DWT" 51 | downsample_pooling_params: 52 | kernel_size: 4 53 | stride: 2 54 | padding: 2 55 | discriminator_params: 56 | in_channels: 1 57 | out_channels: 1 58 | kernel_sizes: [15, 41, 5, 3] 59 | channels: 128 60 | max_downsample_channels: 1024 61 | max_groups: 16 62 | bias: true 63 | downsample_scales: [4, 4, 4, 4, 1] 64 | nonlinear_activation: "LeakyReLU" 65 | nonlinear_activation_params: 66 | negative_slope: 0.1 67 | follow_official_norm: true 68 | optimizer: 69 | type: Adam 70 | params: 71 | lr: 2.0e-4 72 | betas: [0.5, 0.9] 73 | weight_decay: 0.0 74 | scheduler: 75 | type: MultiStepLR 76 | params: 77 | gamma: 0.5 78 | milestones: 79 | - 200000 80 | - 400000 81 | - 600000 82 | - 800000 83 | 84 | MultiPeriodDiscriminator: 85 | params: 86 | periods: [2, 3, 5, 7, 11] 87 | discriminator_params: 88 | in_channels: 1 89 | out_channels: 1 90 | kernel_sizes: [5, 3] 91 | channels: 32 92 | downsample_scales: [3, 3, 3, 3, 1] 93 | max_downsample_channels: 1024 94 | bias: true 95 | nonlinear_activation: "LeakyReLU" 96 | nonlinear_activation_params: 97 | negative_slope: 0.1 98 | use_spectral_norm: false 99 | optimizer: 100 | type: Adam 101 | params: 102 | lr: 2.0e-4 103 | betas: [0.5, 0.9] 104 | weight_decay: 0.0 105 | scheduler: 106 | type: MultiStepLR 107 | params: 108 | gamma: 0.5 109 | milestones: 110 | - 200000 111 | - 400000 112 | - 600000 113 | - 800000 114 | 115 | #################################################### 116 | # LOSS SETTING # 117 | #################################################### 118 | Loss: 119 | generator_adv_loss: 120 | enable: True 121 | params: 122 | average_by_discriminators: False 123 | weights: 1.0 124 | 125 | discriminator_adv_loss: 126 | enable: True 127 | params: 128 | average_by_discriminators: False 129 | weights: 1.0 130 | 131 | stft_loss: 132 | enable: False # Whether to use multi-resolution STFT loss. 
133 | 
134 |     mel_loss:
135 |         enable: True
136 |         params:
137 |             fs: 24000
138 |             fft_size: 1024
139 |             hop_size: 240
140 |             win_length: 1024
141 |             window: "hann"
142 |             num_mels: 80
143 |             fmin: 0
144 |             fmax: 8000
145 |             log_base: null
146 |         weights: 45.0
147 | 
148 |     subband_stft_loss:
149 |         enable: False
150 |         params:
151 |             fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
152 |             hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss.
153 |             win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
154 |             window: "hann_window" # Window function for STFT-based loss.
155 | 
156 |     feat_match_loss:
157 |         enable: True
158 |         params:
159 |             average_by_discriminators: false
160 |             average_by_layers: false
161 |         weights: 2.0
162 | 
163 | 
164 | ###########################################################
165 | #                  DATA LOADER SETTING                    #
166 | ###########################################################
167 | batch_size: 16
168 | batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
169 | pin_memory: False
170 | num_workers: 2 # FIXME: setting > 0 may hang on macOS
171 | remove_short_samples: False
172 | allow_cache: True
173 | 
174 | generator_grad_norm: -1
175 | 
176 | discriminator_grad_norm: -1
177 | 
178 | ###########################################################
179 | #                    INTERVAL SETTING                     #
180 | ###########################################################
181 | generator_train_start_steps: 1 # Number of steps to start to train generator.
182 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
183 | train_max_steps: 2500000 # Number of training steps.
184 | save_interval_steps: 20000 # Interval steps to save checkpoint.
185 | eval_interval_steps: 10000 # Interval steps to evaluate the network.
186 | log_interval_steps: 1000 # Interval steps to record the training log.
187 | 
188 | ###########################################################
189 | #                      OTHER SETTING                      #
190 | ###########################################################
191 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
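A related invariant for this 24 kHz NSF config: the Generator's upsample_scales must multiply out to the mel hop_size, so that one input frame expands to exactly one hop of waveform samples. A quick check with the values above:

import math

upsample_scales = [8, 5, 3, 2]  # from the Generator params above
hop_size = 240                  # from the mel_loss params above
assert math.prod(upsample_scales) == hop_size  # 8 * 5 * 3 * 2 == 240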
192 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | ########################################################### 85 | # DATA LOADER SETTING # 86 | ########################################################### 87 | batch_size: 32 88 | pin_memory: False 89 | num_workers: 4 # FIXME: set > 0 may stuck on macos 90 | remove_short_samples: False 91 | allow_cache: True 92 | grad_norm: 1.0 93 | 94 | ########################################################### 95 | # INTERVAL SETTING # 96 | ########################################################### 97 | train_max_steps: 1000000 # Number of training steps. 98 | save_interval_steps: 20000 # Interval steps to save checkpoint. 99 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 100 | log_interval_steps: 1000 # Interval steps to record the training log. 101 | 102 | ########################################################### 103 | # OTHER SETTING # 104 | ########################################################### 105 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
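The NoamLR scheduler named above lives in kantts/train/scheduler.py; the sketch below is the standard Noam warmup rule from "Attention Is All You Need" (up to a constant factor), shown only to illustrate what warmup_steps: 4000 controls. The repo's exact scaling may differ.

def noam_scale(step: int, warmup_steps: int = 4000) -> float:
    # The learning rate grows linearly for warmup_steps, then decays
    # proportionally to step ** -0.5; the peak is at step == warmup_steps.
    step = max(step, 1)
    return min(step ** -0.5, step * warmup_steps ** -1.5)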
106 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k_MAS.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | 53 | MAS: True 54 | 55 | 56 | optimizer: 57 | type: Adam 58 | params: 59 | lr: 0.001 60 | betas: [0.9, 0.98] 61 | eps: 1.0e-9 62 | weight_decay: 0.0 63 | scheduler: 64 | type: NoamLR 65 | params: 66 | warmup_steps: 4000 67 | 68 | linguistic_unit: 69 | cleaners: english_cleaners 70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 71 | speaker_list: F7 72 | #################################################### 73 | # LOSS SETTING # 74 | #################################################### 75 | Loss: 76 | MelReconLoss: 77 | enable: True 78 | params: 79 | loss_type: mae 80 | 81 | ProsodyReconLoss: 82 | enable: True 83 | params: 84 | loss_type: mae 85 | 86 | AttentionCTCLoss: 87 | enable: True 88 | 89 | AttentionBinarizationLoss: 90 | enable: True 91 | params: 92 | start_epoch: 0 93 | warmup_epoch: 100 94 | 95 | 96 | ########################################################### 97 | # DATA LOADER SETTING # 98 | ########################################################### 99 | batch_size: 32 100 | pin_memory: False 101 | num_workers: 4 # FIXME: set > 0 may stuck on macos 102 | remove_short_samples: False 103 | allow_cache: True 104 | 105 | grad_norm: 1.0 106 | 107 | ########################################################### 108 | # INTERVAL SETTING # 109 | ########################################################### 110 | train_max_steps: 1000000 # Number of training steps. 111 | save_interval_steps: 20000 # Interval steps to save checkpoint. 112 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 113 | log_interval_steps: 1000 # Interval steps to record the training log. 114 | 115 | ########################################################### 116 | # OTHER SETTING # 117 | ########################################################### 118 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
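Compared with sambert_16k.yaml, this MAS variant adds AttentionCTCLoss and AttentionBinarizationLoss for learned alignments. The start_epoch/warmup_epoch pair suggests the binarization term is ramped in gradually; one plausible ramp is sketched below (the authoritative schedule is in kantts/train/loss.py and may differ):

def binarization_weight(epoch: int, start_epoch: int = 0,
                        warmup_epoch: int = 100, max_weight: float = 1.0) -> float:
    # Ramp the penalty that pulls soft attention toward the hard MAS
    # alignment from 0 up to max_weight over warmup_epoch epochs.
    if epoch < start_epoch:
        return 0.0
    return max_weight * min(1.0, (epoch - start_epoch) / warmup_epoch)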
119 | -------------------------------------------------------------------------------- /kantts/configs/sambert_16k_MAS_byte.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | 53 | MAS: True 54 | using_byte: True 55 | 56 | 57 | optimizer: 58 | type: Adam 59 | params: 60 | lr: 0.001 61 | betas: [0.9, 0.98] 62 | eps: 1.0e-9 63 | weight_decay: 0.0 64 | scheduler: 65 | type: NoamLR 66 | params: 67 | warmup_steps: 4000 68 | 69 | linguistic_unit: 70 | cleaners: english_cleaners 71 | lfeat_type_list: byte_index,emo_category,speaker_category 72 | speaker_list: F7 73 | #################################################### 74 | # LOSS SETTING # 75 | #################################################### 76 | Loss: 77 | MelReconLoss: 78 | enable: True 79 | params: 80 | loss_type: mae 81 | 82 | ProsodyReconLoss: 83 | enable: True 84 | params: 85 | loss_type: mae 86 | 87 | AttentionCTCLoss: 88 | enable: True 89 | 90 | AttentionBinarizationLoss: 91 | enable: True 92 | params: 93 | start_epoch: 0 94 | warmup_epoch: 100 95 | 96 | 97 | ########################################################### 98 | # DATA LOADER SETTING # 99 | ########################################################### 100 | batch_size: 8 101 | pin_memory: False 102 | num_workers: 4 # FIXME: set > 0 may stuck on macos 103 | remove_short_samples: False 104 | allow_cache: True 105 | 106 | grad_norm: 1.0 107 | 108 | ########################################################### 109 | # INTERVAL SETTING # 110 | ########################################################### 111 | train_max_steps: 1000000 # Number of training steps. 112 | save_interval_steps: 20000 # Interval steps to save checkpoint. 113 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 114 | log_interval_steps: 1000 # Interval steps to record the training log. 115 | 116 | ########################################################### 117 | # OTHER SETTING # 118 | ########################################################### 119 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
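With using_byte: True the lfeat_type_list collapses to byte_index,emo_category,speaker_category: the encoder consumes raw text bytes instead of phoneme/tone/syllable symbols, which plausibly also explains the smaller batch_size of 8 (byte sequences run longer than phoneme sequences). A minimal sketch of byte tokenization; the repo's actual mapping, including any reserved pad/eos ids, lives under kantts/utils/ling_unit and may differ:

def text_to_byte_ids(text: str, offset: int = 0) -> list:
    # UTF-8 bytes give a fixed 256-symbol vocabulary for any language;
    # `offset` is a placeholder for reserved special ids, if any.
    return [b + offset for b in text.encode("utf-8")]

print(text_to_byte_ids("hi"))  # -> [104, 105]; CJK characters yield 3 ids each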
120 | -------------------------------------------------------------------------------- /kantts/configs/sambert_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 1000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
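outputs_per_step: 3, shared by all of these sambert configs, means the decoder emits mel frames in groups of three per step, so target frame counts get padded up to a multiple of 3 (presumably what the LengthRegulator in kantts/models/sambert/adaptors.py does via its r parameter). The padding arithmetic in one line:

def pad_to_multiple(num_frames: int, r: int = 3) -> int:
    # Round the frame count up to the next multiple of r (outputs_per_step).
    return num_frames + (-num_frames) % r

print(pad_to_multiple(100))  # -> 102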
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_48k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 900 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 128 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: F7 70 | #################################################### 71 | # LOSS SETTING # 72 | #################################################### 73 | Loss: 74 | MelReconLoss: 75 | enable: True 76 | params: 77 | loss_type: mae 78 | 79 | ProsodyReconLoss: 80 | enable: True 81 | params: 82 | loss_type: mae 83 | 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 1000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_fp_8k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | FP: True 54 | 55 | optimizer: 56 | type: Adam 57 | params: 58 | lr: 0.001 59 | betas: [0.9, 0.98] 60 | eps: 1.0e-9 61 | weight_decay: 0.0 62 | scheduler: 63 | type: NoamLR 64 | params: 65 | warmup_steps: 4000 66 | 67 | linguistic_unit: 68 | cleaners: english_cleaners 69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 70 | speaker_list: F7,F74,M7,FBYN,FRXL,xiaoyu 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | FpCELoss: 86 | enable: True 87 | params: 88 | loss_type: ce 89 | weight: [1,4,4,8] 90 | 91 | ########################################################### 92 | # DATA LOADER SETTING # 93 | ########################################################### 94 | batch_size: 16 95 | pin_memory: False 96 | num_workers: 4 # FIXME: set > 0 may stuck on macos 97 | remove_short_samples: False 98 | allow_cache: True 99 | 100 | grad_norm: 1.0 101 | 102 | ########################################################### 103 | # INTERVAL SETTING # 104 | ########################################################### 105 | train_max_steps: 1000000 # Number of training steps. 106 | save_interval_steps: 20000 # Interval steps to save checkpoint. 107 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 108 | log_interval_steps: 1000 # Interval steps to record the training log. 109 | 110 | ########################################################### 111 | # OTHER SETTING # 112 | ########################################################### 113 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
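FpCELoss above reads as a class-weighted cross entropy over four filled-pause classes, with weight: [1,4,4,8] up-weighting the rarer classes; the number and meaning of the classes is inferred from the weight vector, not stated here. A sketch of applying such weights:

import torch

fp_weights = torch.tensor([1.0, 4.0, 4.0, 8.0])  # from the config above
criterion = torch.nn.CrossEntropyLoss(weight=fp_weights)

logits = torch.randn(16, 4)            # (batch, num_fp_classes) -- toy values
targets = torch.randint(0, 4, (16,))   # gold FP class per position
loss = criterion(logits, targets)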
114 | -------------------------------------------------------------------------------- /kantts/configs/sambert_nsf_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | 55 | 56 | optimizer: 57 | type: Adam 58 | params: 59 | lr: 0.001 60 | betas: [0.9, 0.98] 61 | eps: 1.0e-9 62 | weight_decay: 0.0 63 | scheduler: 64 | type: NoamLR 65 | params: 66 | warmup_steps: 4000 67 | 68 | linguistic_unit: 69 | cleaners: english_cleaners 70 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 71 | speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu 72 | #################################################### 73 | # LOSS SETTING # 74 | #################################################### 75 | Loss: 76 | MelReconLoss: 77 | enable: True 78 | params: 79 | loss_type: mae 80 | 81 | ProsodyReconLoss: 82 | enable: True 83 | params: 84 | loss_type: mae 85 | 86 | ########################################################### 87 | # DATA LOADER SETTING # 88 | ########################################################### 89 | batch_size: 32 90 | pin_memory: False 91 | num_workers: 4 # FIXME: set > 0 may stuck on macos 92 | remove_short_samples: False 93 | allow_cache: True 94 | grad_norm: 1.0 95 | 96 | ########################################################### 97 | # INTERVAL SETTING # 98 | ########################################################### 99 | train_max_steps: 10000000 # Number of training steps. 100 | save_interval_steps: 20000 # Interval steps to save checkpoint. 101 | eval_interval_steps: 2300500 # Interval steps to evaluate the network. 102 | log_interval_steps: 1000 # Interval steps to record the training log. 103 | 104 | ########################################################### 105 | # OTHER SETTING # 106 | ########################################################### 107 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
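Note that num_mels grows from 80 to 82 exactly in the NSF variants: a natural reading is that the acoustic target appends two extra per-frame streams (e.g. F0 and a voiced/unvoiced flag) for the neural source-filter vocoder, but that layout is an assumption; the authoritative packing is in the preprocessing code. Under that assumption:

import numpy as np

feats = np.zeros((100, 82), dtype=np.float32)  # (frames, num_mels) per this config
mel, f0, uv = feats[:, :80], feats[:, 80], feats[:, 81]  # assumed 80 + 1 + 1 layout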
108 | -------------------------------------------------------------------------------- /kantts/configs/sambert_nsf_24k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | 55 | optimizer: 56 | type: Adam 57 | params: 58 | lr: 0.001 59 | betas: [0.9, 0.98] 60 | eps: 1.0e-9 61 | weight_decay: 0.0 62 | scheduler: 63 | type: NoamLR 64 | params: 65 | warmup_steps: 4000 66 | 67 | linguistic_unit: 68 | cleaners: english_cleaners 69 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 70 | speaker_list: F7 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | 86 | ########################################################### 87 | # DATA LOADER SETTING # 88 | ########################################################### 89 | batch_size: 32 90 | pin_memory: False 91 | num_workers: 4 # FIXME: set > 0 may stuck on macos 92 | remove_short_samples: False 93 | allow_cache: True 94 | 95 | grad_norm: 1.0 96 | 97 | ########################################################### 98 | # INTERVAL SETTING # 99 | ########################################################### 100 | train_max_steps: 1000000 # Number of training steps. 101 | save_interval_steps: 20000 # Interval steps to save checkpoint. 102 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 103 | log_interval_steps: 1000 # Interval steps to record the training log. 104 | 105 | ########################################################### 106 | # OTHER SETTING # 107 | ########################################################### 108 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
109 | -------------------------------------------------------------------------------- /kantts/configs/sambert_se_nsf_global_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 192 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 82 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | NSF: True 54 | nsf_norm_type: global 55 | nsf_f0_global_minimum: 30.0 56 | nsf_f0_global_maximum: 730.0 57 | SE: True 58 | 59 | 60 | optimizer: 61 | type: Adam 62 | params: 63 | lr: 0.001 64 | betas: [0.9, 0.98] 65 | eps: 1.0e-9 66 | weight_decay: 0.0 67 | scheduler: 68 | type: NoamLR 69 | params: 70 | warmup_steps: 4000 71 | 72 | linguistic_unit: 73 | cleaners: english_cleaners 74 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 75 | speaker_list: F7 76 | #################################################### 77 | # LOSS SETTING # 78 | #################################################### 79 | Loss: 80 | MelReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | ProsodyReconLoss: 86 | enable: True 87 | params: 88 | loss_type: mae 89 | 90 | ########################################################### 91 | # DATA LOADER SETTING # 92 | ########################################################### 93 | batch_size: 32 94 | pin_memory: False 95 | num_workers: 4 # FIXME: set > 0 may stuck on macos 96 | remove_short_samples: False 97 | allow_cache: False 98 | grad_norm: 1.0 99 | 100 | ########################################################### 101 | # INTERVAL SETTING # 102 | ########################################################### 103 | train_max_steps: 1760101 # Number of training steps. 104 | save_interval_steps: 100 # Interval steps to save checkpoint. 105 | eval_interval_steps: 1000000000000 # Interval steps to evaluate the network. 106 | log_interval_steps: 10 # Interval steps to record the training log. 107 | 108 | ########################################################### 109 | # OTHER SETTING # 110 | ########################################################### 111 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
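nsf_norm_type: global with the fixed bounds above points to speaker-independent F0 normalization against a global range rather than per-utterance statistics. One plausible min-max transform under that reading (the repo's exact transform, e.g. linear versus log-domain, may differ):

import numpy as np

def normalize_f0_global(f0, f0_min=30.0, f0_max=730.0):
    # Clip voiced F0 to the global range and scale it to [0, 1];
    # unvoiced frames (f0 == 0) are kept at 0.
    f0 = np.asarray(f0, dtype=np.float32)
    out = np.zeros_like(f0)
    voiced = f0 > 0
    out[voiced] = (np.clip(f0[voiced], f0_min, f0_max) - f0_min) / (f0_max - f0_min)
    return out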
112 | -------------------------------------------------------------------------------- /kantts/configs/sambert_sichuan_16k.yaml: -------------------------------------------------------------------------------- 1 | model_type: sambert 2 | Model: 3 | ######################################################### 4 | # SAMBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsSAMBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | speaker_units: 32 21 | emotion_units: 32 22 | 23 | predictor_filter_size: 41 24 | predictor_fsmn_num_layers: 3 25 | predictor_num_memory_units: 128 26 | predictor_ffn_inner_dim: 256 27 | predictor_dropout: 0.1 28 | predictor_shift: 0 29 | predictor_lstm_units: 128 30 | dur_pred_prenet_units: [128, 128] 31 | dur_pred_lstm_units: 128 32 | 33 | decoder_prenet_units: [256, 256] 34 | decoder_num_layers: 12 35 | decoder_num_heads: 8 36 | decoder_num_units: 128 37 | decoder_ffn_inner_dim: 1024 38 | decoder_dropout: 0.1 39 | decoder_attention_dropout: 0.1 40 | decoder_relu_dropout: 0.1 41 | 42 | outputs_per_step: 3 43 | num_mels: 80 44 | 45 | postnet_filter_size: 41 46 | postnet_fsmn_num_layers: 4 47 | postnet_num_memory_units: 256 48 | postnet_ffn_inner_dim: 512 49 | postnet_dropout: 0.1 50 | postnet_shift: 17 51 | postnet_lstm_units: 128 52 | MAS: False 53 | 54 | optimizer: 55 | type: Adam 56 | params: 57 | lr: 0.001 58 | betas: [0.9, 0.98] 59 | eps: 1.0e-9 60 | weight_decay: 0.0 61 | scheduler: 62 | type: NoamLR 63 | params: 64 | warmup_steps: 4000 65 | 66 | linguistic_unit: 67 | cleaners: english_cleaners 68 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 69 | speaker_list: xiaoyue 70 | language: Sichuan 71 | #################################################### 72 | # LOSS SETTING # 73 | #################################################### 74 | Loss: 75 | MelReconLoss: 76 | enable: True 77 | params: 78 | loss_type: mae 79 | 80 | ProsodyReconLoss: 81 | enable: True 82 | params: 83 | loss_type: mae 84 | 85 | ########################################################### 86 | # DATA LOADER SETTING # 87 | ########################################################### 88 | batch_size: 32 89 | pin_memory: False 90 | num_workers: 4 # FIXME: set > 0 may stuck on macos 91 | remove_short_samples: False 92 | allow_cache: True 93 | grad_norm: 1.0 94 | 95 | ########################################################### 96 | # INTERVAL SETTING # 97 | ########################################################### 98 | train_max_steps: 1000000 # Number of training steps. 99 | save_interval_steps: 20000 # Interval steps to save checkpoint. 100 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 101 | log_interval_steps: 1000 # Interval steps to record the training log. 102 | 103 | ########################################################### 104 | # OTHER SETTING # 105 | ########################################################### 106 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
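This is the only sambert config here that sets a language key; Sichuan matches the resource directory kantts/preprocess/languages/Sichuan in the tree (PhoneSet.xml, PosSet.xml, py2phoneMap.txt, tonelist.txt). A sketch of how such a key can select frontend resources; the helper is illustrative, not the repo's actual loader:

import os

def frontend_resources(language: str = "PinYin") -> dict:
    # Map the config's `language` field onto the per-language resources
    # shipped under kantts/preprocess/languages/.
    base = os.path.join("kantts", "preprocess", "languages", language)
    return {
        "phoneset": os.path.join(base, "PhoneSet.xml"),
        "posset": os.path.join(base, "PosSet.xml"),
        "py2phone": os.path.join(base, "py2phoneMap.txt"),
        "tonelist": os.path.join(base, "tonelist.txt"),
    }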
107 | -------------------------------------------------------------------------------- /kantts/configs/sybert.yaml: -------------------------------------------------------------------------------- 1 | model_type: sybert 2 | Model: 3 | ######################################################### 4 | # TextsyBERT NETWORK ARCHITECTURE SETTING # 5 | ######################################################### 6 | KanTtsTextsyBERT: 7 | params: 8 | max_len: 800 9 | 10 | embedding_dim: 512 11 | encoder_num_layers: 8 12 | encoder_num_heads: 8 13 | encoder_num_units: 128 14 | encoder_ffn_inner_dim: 1024 15 | encoder_dropout: 0.1 16 | encoder_attention_dropout: 0.1 17 | encoder_relu_dropout: 0.1 18 | encoder_projection_units: 32 19 | 20 | mask_ratio: 0.3 21 | 22 | optimizer: 23 | type: Adam 24 | params: 25 | lr: 0.0001 26 | betas: [0.9, 0.98] 27 | eps: 1.0e-9 28 | weight_decay: 0.0 29 | scheduler: 30 | type: NoamLR 31 | params: 32 | warmup_steps: 10000 33 | 34 | linguistic_unit: 35 | cleaners: english_cleaners 36 | lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category 37 | speaker_list: F7 38 | #################################################### 39 | # LOSS SETTING # 40 | #################################################### 41 | Loss: 42 | SeqCELoss: 43 | enable: True 44 | params: 45 | loss_type: ce 46 | 47 | ########################################################### 48 | # DATA LOADER SETTING # 49 | ########################################################### 50 | batch_size: 32 51 | pin_memory: False 52 | num_workers: 4 # FIXME: set > 0 may stuck on macos 53 | remove_short_samples: False 54 | allow_cache: True 55 | 56 | grad_norm: 1.0 57 | 58 | ########################################################### 59 | # INTERVAL SETTING # 60 | ########################################################### 61 | train_max_steps: 1000000 # Number of training steps. 62 | save_interval_steps: 20000 # Interval steps to save checkpoint. 63 | eval_interval_steps: 10000 # Interval steps to evaluate the network. 64 | log_interval_steps: 1000 # Interval steps to record the training log. 65 | 66 | ########################################################### 67 | # OTHER SETTING # 68 | ########################################################### 69 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 70 | -------------------------------------------------------------------------------- /kantts/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/datasets/__init__.py -------------------------------------------------------------------------------- /kantts/datasets/data_types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io import wavfile 3 | 4 | 5 | # TODO: add your own data type here as you need. 
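# For example, a uint8 PCM entry would follow the same shape as the
# entries below ("bin_u8" is a hypothetical illustration, not part of
# the shipped dict):
#
#     "bin_u8": {
#         "load_func": lambda x: np.fromfile(x, dtype=np.uint8),
#         "desc": "binary file with uint8 format",
#     },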
6 | DATA_TYPE_DICT = { 7 | "txt": { 8 | "load_func": np.loadtxt, 9 | "desc": "plain txt file or readable by np.loadtxt", 10 | }, 11 | "wav": { 12 | "load_func": lambda x: wavfile.read(x)[1], 13 | "desc": "wav file or readable by soundfile.read", 14 | }, 15 | "npy": { 16 | "load_func": np.load, 17 | "desc": "any .npy format file", 18 | }, 19 | # PCM data type can be loaded by binary format 20 | "bin_f32": { 21 | "load_func": lambda x: np.fromfile(x, dtype=np.float32), 22 | "desc": "binary file with float32 format", 23 | }, 24 | "bin_f64": { 25 | "load_func": lambda x: np.fromfile(x, dtype=np.float64), 26 | "desc": "binary file with float64 format", 27 | }, 28 | "bin_i32": { 29 | "load_func": lambda x: np.fromfile(x, dtype=np.int32), 30 | "desc": "binary file with int32 format", 31 | }, 32 | "bin_i16": { 33 | "load_func": lambda x: np.fromfile(x, dtype=np.int16), 34 | "desc": "binary file with int16 format", 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /kantts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parallel import DistributedDataParallel 3 | from kantts.models.hifigan.hifigan import ( # NOQA 4 | Generator, # NOQA 5 | MultiScaleDiscriminator, # NOQA 6 | MultiPeriodDiscriminator, # NOQA 7 | MultiSpecDiscriminator, # NOQA 8 | ) 9 | import kantts 10 | import kantts.train.scheduler 11 | from kantts.models.sambert.kantts_sambert import KanTtsSAMBERT, KanTtsTextsyBERT # NOQA 12 | from kantts.utils.ling_unit.ling_unit import get_fpdict 13 | from .pqmf import PQMF 14 | 15 | 16 | def optimizer_builder(model_params, opt_name, opt_params): 17 | opt_cls = getattr(torch.optim, opt_name) 18 | optimizer = opt_cls(model_params, **opt_params) 19 | return optimizer 20 | 21 | 22 | def scheduler_builder(optimizer, sche_name, sche_params): 23 | scheduler_cls = getattr(kantts.train.scheduler, sche_name) 24 | scheduler = scheduler_cls(optimizer, **sche_params) 25 | return scheduler 26 | 27 | 28 | def hifigan_model_builder(config, device, rank, distributed): 29 | model = {} 30 | optimizer = {} 31 | scheduler = {} 32 | model["discriminator"] = {} 33 | optimizer["discriminator"] = {} 34 | scheduler["discriminator"] = {} 35 | for model_name in config["Model"].keys(): 36 | if model_name == "Generator": 37 | params = config["Model"][model_name]["params"] 38 | model["generator"] = Generator(**params).to(device) 39 | optimizer["generator"] = optimizer_builder( 40 | model["generator"].parameters(), 41 | config["Model"][model_name]["optimizer"].get("type", "Adam"), 42 | config["Model"][model_name]["optimizer"].get("params", {}), 43 | ) 44 | scheduler["generator"] = scheduler_builder( 45 | optimizer["generator"], 46 | config["Model"][model_name]["scheduler"].get("type", "StepLR"), 47 | config["Model"][model_name]["scheduler"].get("params", {}), 48 | ) 49 | else: 50 | params = config["Model"][model_name]["params"] 51 | model["discriminator"][model_name] = globals()[model_name](**params).to( 52 | device 53 | ) 54 | optimizer["discriminator"][model_name] = optimizer_builder( 55 | model["discriminator"][model_name].parameters(), 56 | config["Model"][model_name]["optimizer"].get("type", "Adam"), 57 | config["Model"][model_name]["optimizer"].get("params", {}), 58 | ) 59 | scheduler["discriminator"][model_name] = scheduler_builder( 60 | optimizer["discriminator"][model_name], 61 | config["Model"][model_name]["scheduler"].get("type", "StepLR"), 62 | 
config["Model"][model_name]["scheduler"].get("params", {}), 63 | ) 64 | 65 | out_channels = config["Model"]["Generator"]["params"]["out_channels"] 66 | if out_channels > 1: 67 | model["pqmf"] = PQMF(subbands=out_channels, **config.get("pqmf", {})).to(device) 68 | 69 | # FIXME: pywavelets buffer leads to gradient error in DDP training 70 | # Solution: https://github.com/pytorch/pytorch/issues/22095 71 | if distributed: 72 | model["generator"] = DistributedDataParallel( 73 | model["generator"], 74 | device_ids=[rank], 75 | output_device=rank, 76 | broadcast_buffers=False, 77 | ) 78 | for model_name in model["discriminator"].keys(): 79 | model["discriminator"][model_name] = DistributedDataParallel( 80 | model["discriminator"][model_name], 81 | device_ids=[rank], 82 | output_device=rank, 83 | broadcast_buffers=False, 84 | ) 85 | 86 | return model, optimizer, scheduler 87 | 88 | 89 | # TODO: some parsing 90 | def sambert_model_builder(config, device, rank, distributed): 91 | model = {} 92 | optimizer = {} 93 | scheduler = {} 94 | 95 | model["KanTtsSAMBERT"] = KanTtsSAMBERT( 96 | config["Model"]["KanTtsSAMBERT"]["params"] 97 | ).to(device) 98 | 99 | fp_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("FP", False) 100 | if fp_enable: 101 | fp_dict = { 102 | k: torch.from_numpy(v).long().unsqueeze(0).to(device) 103 | for k, v in get_fpdict(config).items() 104 | } 105 | model["KanTtsSAMBERT"].fp_dict = fp_dict 106 | 107 | optimizer["KanTtsSAMBERT"] = optimizer_builder( 108 | model["KanTtsSAMBERT"].parameters(), 109 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("type", "Adam"), 110 | config["Model"]["KanTtsSAMBERT"]["optimizer"].get("params", {}), 111 | ) 112 | scheduler["KanTtsSAMBERT"] = scheduler_builder( 113 | optimizer["KanTtsSAMBERT"], 114 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("type", "StepLR"), 115 | config["Model"]["KanTtsSAMBERT"]["scheduler"].get("params", {}), 116 | ) 117 | 118 | if distributed: 119 | model["KanTtsSAMBERT"] = DistributedDataParallel( 120 | model["KanTtsSAMBERT"], device_ids=[rank], output_device=rank 121 | ) 122 | 123 | return model, optimizer, scheduler 124 | 125 | 126 | def sybert_model_builder(config, device, rank, distributed): 127 | model = {} 128 | optimizer = {} 129 | scheduler = {} 130 | 131 | model["KanTtsTextsyBERT"] = KanTtsTextsyBERT( 132 | config["Model"]["KanTtsTextsyBERT"]["params"] 133 | ).to(device) 134 | optimizer["KanTtsTextsyBERT"] = optimizer_builder( 135 | model["KanTtsTextsyBERT"].parameters(), 136 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("type", "Adam"), 137 | config["Model"]["KanTtsTextsyBERT"]["optimizer"].get("params", {}), 138 | ) 139 | scheduler["KanTtsTextsyBERT"] = scheduler_builder( 140 | optimizer["KanTtsTextsyBERT"], 141 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("type", "StepLR"), 142 | config["Model"]["KanTtsTextsyBERT"]["scheduler"].get("params", {}), 143 | ) 144 | 145 | if distributed: 146 | model["KanTtsTextsyBERT"] = DistributedDataParallel( 147 | model["KanTtsTextsyBERT"], device_ids=[rank], output_device=rank 148 | ) 149 | 150 | return model, optimizer, scheduler 151 | 152 | 153 | # TODO: implement a builder for specific model 154 | model_dict = { 155 | "hifigan": hifigan_model_builder, 156 | "sambert": sambert_model_builder, 157 | "sybert": sybert_model_builder, 158 | } 159 | 160 | 161 | def model_builder(config, device="cpu", rank=0, distributed=False): 162 | builder_func = model_dict[config["model_type"]] 163 | model, optimizer, scheduler = builder_func(config, device, rank, 
distributed)
164 |     return model, optimizer, scheduler
165 | 
--------------------------------------------------------------------------------
/kantts/models/pqmf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Tomoki Hayashi
2 | #  MIT License (https://opensource.org/licenses/MIT)
3 | 
4 | """Pseudo QMF modules."""
5 | 
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 | 
10 | from scipy.signal import kaiser
11 | 
12 | 
13 | def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
14 |     """Design prototype filter for PQMF.
15 | 
16 |     This method is based on `A Kaiser window approach for the design of prototype
17 |     filters of cosine modulated filterbanks`_.
18 | 
19 |     Args:
20 |         taps (int): The number of filter taps.
21 |         cutoff_ratio (float): Cut-off frequency ratio.
22 |         beta (float): Beta coefficient for kaiser window.
23 | 
24 |     Returns:
25 |         ndarray: Impulse response of prototype filter (taps + 1,).
26 | 
27 |     .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
28 |         https://ieeexplore.ieee.org/abstract/document/681427
29 | 
30 |     """
31 |     # check the arguments are valid
32 |     assert taps % 2 == 0, "The number of taps must be an even number."
33 |     assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
34 | 
35 |     # make initial filter
36 |     omega_c = np.pi * cutoff_ratio
37 |     with np.errstate(invalid="ignore"):
38 |         h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / (
39 |             np.pi * (np.arange(taps + 1) - 0.5 * taps)
40 |         )
41 |     h_i[taps // 2] = np.cos(0) * cutoff_ratio  # fix nan due to indeterminate form
42 | 
43 |     # apply kaiser window
44 |     w = kaiser(taps + 1, beta)
45 |     h = h_i * w
46 | 
47 |     return h
48 | 
49 | 
50 | class PQMF(torch.nn.Module):
51 |     """PQMF module.
52 | 
53 |     This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
54 | 
55 |     .. _`Near-perfect-reconstruction pseudo-QMF banks`:
56 |         https://ieeexplore.ieee.org/document/258122
57 | 
58 |     """
59 | 
60 |     def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0):
61 |         """Initialize PQMF module.
62 | 
63 |         The cutoff_ratio and beta parameters are optimized for #subbands = 4.
64 |         See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
65 | 
66 |         Args:
67 |             subbands (int): The number of subbands.
68 |             taps (int): The number of filter taps.
69 |             cutoff_ratio (float): Cut-off frequency ratio.
70 |             beta (float): Beta coefficient for kaiser window.
71 | 
72 |         """
73 |         super(PQMF, self).__init__()
74 | 
75 |         # build analysis & synthesis filter coefficients
76 |         h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
77 |         h_analysis = np.zeros((subbands, len(h_proto)))
78 |         h_synthesis = np.zeros((subbands, len(h_proto)))
79 |         for k in range(subbands):
80 |             h_analysis[k] = (
81 |                 2
82 |                 * h_proto
83 |                 * np.cos(
84 |                     (2 * k + 1)
85 |                     * (np.pi / (2 * subbands))
86 |                     * (np.arange(taps + 1) - (taps / 2))
87 |                     + (-1) ** k * np.pi / 4
88 |                 )
89 |             )
90 |             h_synthesis[k] = (
91 |                 2
92 |                 * h_proto
93 |                 * np.cos(
94 |                     (2 * k + 1)
95 |                     * (np.pi / (2 * subbands))
96 |                     * (np.arange(taps + 1) - (taps / 2))
97 |                     - (-1) ** k * np.pi / 4
98 |                 )
99 |             )
100 | 
101 |         # convert to tensor
102 |         analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
103 |         synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
104 | 
105 |         # register coefficients as buffer
106 |         self.register_buffer("analysis_filter", analysis_filter)
107 |         self.register_buffer("synthesis_filter", synthesis_filter)
108 | 
109 |         # filter for downsampling & upsampling
110 |         updown_filter = torch.zeros((subbands, subbands, subbands)).float()
111 |         for k in range(subbands):
112 |             updown_filter[k, k, 0] = 1.0
113 |         self.register_buffer("updown_filter", updown_filter)
114 |         self.subbands = subbands
115 | 
116 |         # keep padding info
117 |         self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
118 | 
119 |     def analysis(self, x):
120 |         """Analysis with PQMF.
121 | 
122 |         Args:
123 |             x (Tensor): Input tensor (B, 1, T).
124 | 
125 |         Returns:
126 |             Tensor: Output tensor (B, subbands, T // subbands).
127 | 
128 |         """
129 |         x = F.conv1d(self.pad_fn(x), self.analysis_filter)
130 |         return F.conv1d(x, self.updown_filter, stride=self.subbands)
131 | 
132 |     def synthesis(self, x):
133 |         """Synthesis with PQMF.
134 | 
135 |         Args:
136 |             x (Tensor): Input tensor (B, subbands, T // subbands).
137 | 
138 |         Returns:
139 |             Tensor: Output tensor (B, 1, T).
140 | 
141 |         """
142 |         # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands.
143 |         # Not sure this is the correct way, it is better to check again.
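        # Multiplying the filter by `subbands` compensates for the
        # zero-insertion upsampling below: conv_transpose1d with
        # stride=subbands leaves only one nonzero sample in every
        # `subbands` output positions, which lowers the average power.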
144 | # TODO(kan-bayashi): Understand the reconstruction procedure 145 | x = F.conv_transpose1d( 146 | x, self.updown_filter * self.subbands, stride=self.subbands 147 | ) 148 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 149 | -------------------------------------------------------------------------------- /kantts/models/sambert/adaptors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from kantts.models.sambert.fsmn import FsmnEncoderV2 6 | from kantts.models.sambert import Prenet 7 | 8 | 9 | class LengthRegulator(nn.Module): 10 | def __init__(self, r=1): 11 | super(LengthRegulator, self).__init__() 12 | 13 | self.r = r 14 | 15 | def forward(self, inputs, durations, masks=None): 16 | reps = (durations + 0.5).long() 17 | output_lens = reps.sum(dim=1) 18 | max_len = output_lens.max() 19 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[ 20 | :, None, : 21 | ] 22 | range_ = torch.arange(max_len).to(inputs.device)[None, :, None] 23 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_) 24 | mult = mult.float() 25 | out = torch.matmul(mult, inputs) 26 | 27 | if masks is not None: 28 | out = out.masked_fill(masks.unsqueeze(-1), 0.0) 29 | 30 | seq_len = out.size(1) 31 | padding = self.r - int(seq_len) % self.r 32 | if padding < self.r: 33 | out = F.pad(out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0) 34 | out = out.transpose(1, 2) 35 | 36 | return out, output_lens 37 | 38 | 39 | class VarRnnARPredictor(nn.Module): 40 | def __init__(self, cond_units, prenet_units, rnn_units): 41 | super(VarRnnARPredictor, self).__init__() 42 | 43 | self.prenet = Prenet(1, prenet_units) 44 | self.lstm = nn.LSTM( 45 | prenet_units[-1] + cond_units, 46 | rnn_units, 47 | num_layers=2, 48 | batch_first=True, 49 | bidirectional=False, 50 | ) 51 | self.fc = nn.Linear(rnn_units, 1) 52 | 53 | def forward(self, inputs, cond, h=None, masks=None): 54 | x = torch.cat([self.prenet(inputs), cond], dim=-1) 55 | # The input can also be a packed variable length sequence, 56 | # here we just omit it for simplicity due to the mask and uni-directional lstm. 
57 | x, h_new = self.lstm(x, h) 58 | 59 | x = self.fc(x).squeeze(-1) 60 | x = F.relu(x) 61 | 62 | if masks is not None: 63 | x = x.masked_fill(masks, 0.0) 64 | 65 | return x, h_new 66 | 67 | def infer(self, cond, masks=None): 68 | batch_size, length = cond.size(0), cond.size(1) 69 | 70 | output = [] 71 | x = torch.zeros((batch_size, 1)).to(cond.device) 72 | h = None 73 | 74 | for i in range(length): 75 | x, h = self.forward(x.unsqueeze(1), cond[:, i : i + 1, :], h=h) 76 | output.append(x) 77 | 78 | output = torch.cat(output, dim=-1) 79 | 80 | if masks is not None: 81 | output = output.masked_fill(masks, 0.0) 82 | 83 | return output 84 | 85 | 86 | class VarFsmnRnnNARPredictor(nn.Module): 87 | def __init__( 88 | self, 89 | in_dim, 90 | filter_size, 91 | fsmn_num_layers, 92 | num_memory_units, 93 | ffn_inner_dim, 94 | dropout, 95 | shift, 96 | lstm_units, 97 | ): 98 | super(VarFsmnRnnNARPredictor, self).__init__() 99 | 100 | self.fsmn = FsmnEncoderV2( 101 | filter_size, 102 | fsmn_num_layers, 103 | in_dim, 104 | num_memory_units, 105 | ffn_inner_dim, 106 | dropout, 107 | shift, 108 | ) 109 | self.blstm = nn.LSTM( 110 | num_memory_units, 111 | lstm_units, 112 | num_layers=1, 113 | batch_first=True, 114 | bidirectional=True, 115 | ) 116 | self.fc = nn.Linear(2 * lstm_units, 1) 117 | 118 | def forward(self, inputs, masks=None): 119 | input_lengths = None 120 | if masks is not None: 121 | input_lengths = torch.sum((~masks).float(), dim=1).long() 122 | 123 | x = self.fsmn(inputs, masks) 124 | 125 | if input_lengths is not None: 126 | x = nn.utils.rnn.pack_padded_sequence( 127 | x, input_lengths.tolist(), batch_first=True, enforce_sorted=False 128 | ) 129 | x, _ = self.blstm(x) 130 | x, _ = nn.utils.rnn.pad_packed_sequence( 131 | x, batch_first=True, total_length=inputs.size(1) 132 | ) 133 | else: 134 | x, _ = self.blstm(x) 135 | 136 | x = self.fc(x).squeeze(-1) 137 | 138 | if masks is not None: 139 | x = x.masked_fill(masks, 0.0) 140 | 141 | return x 142 | -------------------------------------------------------------------------------- /kantts/models/sambert/alignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba as nb 3 | 4 | 5 | @nb.jit(nopython=True) 6 | def mas(attn_map, width=1): 7 | # assumes mel x text 8 | opt = np.zeros_like(attn_map) 9 | attn_map = np.log(attn_map) 10 | attn_map[0, 1:] = -np.inf 11 | log_p = np.zeros_like(attn_map) 12 | log_p[0, :] = attn_map[0, :] 13 | prev_ind = np.zeros_like(attn_map, dtype=np.int64) 14 | for i in range(1, attn_map.shape[0]): 15 | for j in range(attn_map.shape[1]): # for each text dim 16 | prev_j = np.arange(max(0, j - width), j + 1) 17 | prev_log = np.array([log_p[i - 1, prev_idx] for prev_idx in prev_j]) 18 | 19 | ind = np.argmax(prev_log) 20 | log_p[i, j] = attn_map[i, j] + prev_log[ind] 21 | prev_ind[i, j] = prev_j[ind] 22 | 23 | # now backtrack 24 | curr_text_idx = attn_map.shape[1] - 1 25 | for i in range(attn_map.shape[0] - 1, -1, -1): 26 | opt[i, curr_text_idx] = 1 27 | curr_text_idx = prev_ind[i, curr_text_idx] 28 | opt[0, curr_text_idx] = 1 29 | return opt 30 | 31 | 32 | @nb.jit(nopython=True) 33 | def mas_width1(attn_map): 34 | """mas with hardcoded width=1""" 35 | # assumes mel x text 36 | opt = np.zeros_like(attn_map) 37 | attn_map = np.log(attn_map) 38 | attn_map[0, 1:] = -np.inf 39 | log_p = np.zeros_like(attn_map) 40 | log_p[0, :] = attn_map[0, :] 41 | prev_ind = np.zeros_like(attn_map, dtype=np.int64) 42 | for i in range(1, attn_map.shape[0]): 43 | for j in 
range(attn_map.shape[1]): # for each text dim 44 | prev_log = log_p[i - 1, j] 45 | prev_j = j 46 | 47 | if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]: 48 | prev_log = log_p[i - 1, j - 1] 49 | prev_j = j - 1 50 | 51 | log_p[i, j] = attn_map[i, j] + prev_log 52 | prev_ind[i, j] = prev_j 53 | 54 | # now backtrack 55 | curr_text_idx = attn_map.shape[1] - 1 56 | for i in range(attn_map.shape[0] - 1, -1, -1): 57 | opt[i, curr_text_idx] = 1 58 | curr_text_idx = prev_ind[i, curr_text_idx] 59 | opt[0, curr_text_idx] = 1 60 | return opt 61 | 62 | 63 | @nb.jit(nopython=True, parallel=True) 64 | def b_mas(b_attn_map, in_lens, out_lens, width=1): 65 | assert width == 1 66 | attn_out = np.zeros_like(b_attn_map) 67 | 68 | for b in nb.prange(b_attn_map.shape[0]): 69 | out = mas_width1(b_attn_map[b, 0, : out_lens[b], : in_lens[b]]) 70 | attn_out[b, 0, : out_lens[b], : in_lens[b]] = out 71 | return attn_out 72 | -------------------------------------------------------------------------------- /kantts/models/sambert/attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class ConvNorm(torch.nn.Module): 7 | def __init__( 8 | self, 9 | in_channels, 10 | out_channels, 11 | kernel_size=1, 12 | stride=1, 13 | padding=None, 14 | dilation=1, 15 | bias=True, 16 | w_init_gain="linear", 17 | ): 18 | super(ConvNorm, self).__init__() 19 | if padding is None: 20 | assert kernel_size % 2 == 1 21 | padding = int(dilation * (kernel_size - 1) / 2) 22 | 23 | self.conv = torch.nn.Conv1d( 24 | in_channels, 25 | out_channels, 26 | kernel_size=kernel_size, 27 | stride=stride, 28 | padding=padding, 29 | dilation=dilation, 30 | bias=bias, 31 | ) 32 | 33 | torch.nn.init.xavier_uniform_( 34 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain) 35 | ) 36 | 37 | def forward(self, signal): 38 | conv_signal = self.conv(signal) 39 | return conv_signal 40 | 41 | 42 | class ConvAttention(torch.nn.Module): 43 | def __init__( 44 | self, 45 | n_mel_channels=80, 46 | n_text_channels=512, 47 | n_att_channels=80, 48 | temperature=1.0, 49 | use_query_proj=True, 50 | ): 51 | super(ConvAttention, self).__init__() 52 | self.temperature = temperature 53 | self.att_scaling_factor = np.sqrt(n_att_channels) 54 | self.softmax = torch.nn.Softmax(dim=3) 55 | self.log_softmax = torch.nn.LogSoftmax(dim=3) 56 | self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1) 57 | self.use_query_proj = bool(use_query_proj) 58 | 59 | self.key_proj = nn.Sequential( 60 | ConvNorm( 61 | n_text_channels, 62 | n_text_channels * 2, 63 | kernel_size=3, 64 | bias=True, 65 | w_init_gain="relu", 66 | ), 67 | torch.nn.ReLU(), 68 | ConvNorm(n_text_channels * 2, n_att_channels, kernel_size=1, bias=True), 69 | ) 70 | 71 | self.query_proj = nn.Sequential( 72 | ConvNorm( 73 | n_mel_channels, 74 | n_mel_channels * 2, 75 | kernel_size=3, 76 | bias=True, 77 | w_init_gain="relu", 78 | ), 79 | torch.nn.ReLU(), 80 | ConvNorm(n_mel_channels * 2, n_mel_channels, kernel_size=1, bias=True), 81 | torch.nn.ReLU(), 82 | ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True), 83 | ) 84 | 85 | def forward(self, queries, keys, mask=None, attn_prior=None): 86 | """Attention mechanism for flowtron parallel 87 | Unlike in Flowtron, we have no restrictions such as causality etc, 88 | since we only need this during training. 
89 | 90 | Args: 91 | queries (torch.tensor): B x C x T1 tensor 92 | (probably going to be mel data) 93 | keys (torch.tensor): B x C2 x T2 tensor (text data) 94 | mask (torch.tensor): uint8 binary mask for variable length entries 95 | (should be in the T2 domain) 96 | Output: 97 | attn (torch.tensor): B x 1 x T1 x T2 attention mask. 98 | Final dim T2 should sum to 1 99 | """ 100 | keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 101 | 102 | # Beware can only do this since query_dim = attn_dim = n_mel_channels 103 | if self.use_query_proj: 104 | queries_enc = self.query_proj(queries) 105 | else: 106 | queries_enc = queries 107 | 108 | # different ways of computing attn, 109 | # one is isotopic gaussians (per phoneme) 110 | # Simplistic Gaussian Isotopic Attention 111 | 112 | # B x n_attn_dims x T1 x T2 113 | attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 114 | # compute log likelihood from a gaussian 115 | attn = -0.0005 * attn.sum(1, keepdim=True) 116 | if attn_prior is not None: 117 | attn = self.log_softmax(attn) + torch.log(attn_prior[:, None] + 1e-8) 118 | 119 | attn_logprob = attn.clone() 120 | 121 | if mask is not None: 122 | attn.data.masked_fill_(mask.unsqueeze(1).unsqueeze(1), -float("inf")) 123 | 124 | attn = self.softmax(attn) # Softmax along T2 125 | return attn, attn_logprob 126 | -------------------------------------------------------------------------------- /kantts/models/sambert/fsmn.py: -------------------------------------------------------------------------------- 1 | """ 2 | FSMN Pytorch Version 3 | """ 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class FeedForwardNet(nn.Module): 9 | """ A two-feed-forward-layer module """ 10 | 11 | def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1): 12 | super().__init__() 13 | 14 | # Use Conv1D 15 | # position-wise 16 | self.w_1 = nn.Conv1d( 17 | d_in, 18 | d_hid, 19 | kernel_size=kernel_size[0], 20 | padding=(kernel_size[0] - 1) // 2, 21 | ) 22 | # position-wise 23 | self.w_2 = nn.Conv1d( 24 | d_hid, 25 | d_out, 26 | kernel_size=kernel_size[1], 27 | padding=(kernel_size[1] - 1) // 2, 28 | bias=False, 29 | ) 30 | 31 | self.dropout = nn.Dropout(dropout) 32 | 33 | def forward(self, x): 34 | output = x.transpose(1, 2) 35 | output = F.relu(self.w_1(output)) 36 | output = self.dropout(output) 37 | output = self.w_2(output) 38 | output = output.transpose(1, 2) 39 | 40 | return output 41 | 42 | 43 | class MemoryBlockV2(nn.Module): 44 | def __init__(self, d, filter_size, shift, dropout=0.0): 45 | super(MemoryBlockV2, self).__init__() 46 | 47 | left_padding = int(round((filter_size - 1) / 2)) 48 | right_padding = int((filter_size - 1) / 2) 49 | if shift > 0: 50 | left_padding += shift 51 | right_padding -= shift 52 | 53 | self.lp, self.rp = left_padding, right_padding 54 | 55 | self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False) 56 | self.dropout = nn.Dropout(dropout) 57 | 58 | def forward(self, input, mask=None): 59 | if mask is not None: 60 | input = input.masked_fill(mask.unsqueeze(-1), 0) 61 | 62 | x = F.pad(input, (0, 0, self.lp, self.rp, 0, 0), mode="constant", value=0.0) 63 | output = ( 64 | self.conv_dw(x.contiguous().transpose(1, 2)).contiguous().transpose(1, 2) 65 | ) 66 | output += input 67 | output = self.dropout(output) 68 | 69 | if mask is not None: 70 | output = output.masked_fill(mask.unsqueeze(-1), 0) 71 | 72 | return output 73 | 74 | 75 | class FsmnEncoderV2(nn.Module): 76 | def __init__( 77 | self, 78 | filter_size, 79 | 
fsmn_num_layers, 80 | input_dim, 81 | num_memory_units, 82 | ffn_inner_dim, 83 | dropout=0.0, 84 | shift=0, 85 | ): 86 | super(FsmnEncoderV2, self).__init__() 87 | 88 | self.filter_size = filter_size 89 | self.fsmn_num_layers = fsmn_num_layers 90 | self.num_memory_units = num_memory_units 91 | self.ffn_inner_dim = ffn_inner_dim 92 | self.dropout = dropout 93 | self.shift = shift 94 | if not isinstance(shift, list): 95 | self.shift = [shift for _ in range(self.fsmn_num_layers)] 96 | 97 | self.ffn_lst = nn.ModuleList() 98 | self.ffn_lst.append( 99 | FeedForwardNet(input_dim, ffn_inner_dim, num_memory_units, dropout=dropout) 100 | ) 101 | for i in range(1, fsmn_num_layers): 102 | self.ffn_lst.append( 103 | FeedForwardNet( 104 | num_memory_units, ffn_inner_dim, num_memory_units, dropout=dropout 105 | ) 106 | ) 107 | 108 | self.memory_block_lst = nn.ModuleList() 109 | for i in range(fsmn_num_layers): 110 | self.memory_block_lst.append( 111 | MemoryBlockV2(num_memory_units, filter_size, self.shift[i], dropout) 112 | ) 113 | 114 | def forward(self, input, mask=None): 115 | x = F.dropout(input, self.dropout, self.training) 116 | for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst): 117 | context = ffn(x) 118 | memory = memory_block(context, mask) 119 | memory = F.dropout(memory, self.dropout, self.training) 120 | if memory.size(-1) == x.size(-1): 121 | memory += x 122 | x = memory 123 | 124 | return x 125 | -------------------------------------------------------------------------------- /kantts/models/sambert/positions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class SinusoidalPositionEncoder(nn.Module): 9 | def __init__(self, max_len, depth): 10 | super(SinusoidalPositionEncoder, self).__init__() 11 | 12 | self.max_len = max_len 13 | self.depth = depth 14 | self.position_enc = nn.Parameter( 15 | self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0), 16 | requires_grad=False, 17 | ) 18 | 19 | def forward(self, input): 20 | bz_in, len_in, _ = input.size() 21 | if len_in > self.max_len: 22 | self.max_len = len_in 23 | self.position_enc.data = ( 24 | self.get_sinusoid_encoding_table(self.max_len, self.depth) 25 | .unsqueeze(0) 26 | .to(input.device) 27 | ) 28 | 29 | output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1) 30 | 31 | return output 32 | 33 | @staticmethod 34 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 35 | """ Sinusoid position encoding table """ 36 | 37 | def cal_angle(position, hid_idx): 38 | return position / np.power(10000, hid_idx / float(d_hid / 2 - 1)) 39 | 40 | def get_posi_angle_vec(position): 41 | return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)] 42 | 43 | scaled_time_table = np.array( 44 | [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)] 45 | ) 46 | 47 | sinusoid_table = np.zeros((n_position, d_hid)) 48 | sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table) 49 | sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table) 50 | 51 | if padding_idx is not None: 52 | # zero vector for padding dimension 53 | sinusoid_table[padding_idx] = 0.0 54 | 55 | return torch.FloatTensor(sinusoid_table) 56 | 57 | 58 | class DurSinusoidalPositionEncoder(nn.Module): 59 | def __init__(self, depth, outputs_per_step): 60 | super(DurSinusoidalPositionEncoder, self).__init__() 61 | 62 | self.depth = depth 63 | self.outputs_per_step = 
outputs_per_step 64 | 65 | inv_timescales = [ 66 | np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth) 67 | ] 68 | self.inv_timescales = nn.Parameter( 69 | torch.FloatTensor(inv_timescales), requires_grad=False 70 | ) 71 | 72 | def forward(self, durations, masks=None): 73 | reps = (durations + 0.5).long() 74 | output_lens = reps.sum(dim=1) 75 | max_len = output_lens.max() 76 | reps_cumsum = torch.cumsum(F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[ 77 | :, None, : 78 | ] 79 | range_ = torch.arange(max_len).to(durations.device)[None, :, None] 80 | mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_) 81 | mult = mult.float() 82 | offsets = torch.matmul(mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)).squeeze(-1) 83 | dur_pos = range_[:, :, 0] - offsets + 1 84 | 85 | if masks is not None: 86 | assert masks.size(1) == dur_pos.size(1) 87 | dur_pos = dur_pos.masked_fill(masks, 0.0) 88 | 89 | seq_len = dur_pos.size(1) 90 | padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step 91 | if padding < self.outputs_per_step: 92 | dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0) 93 | 94 | position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :] 95 | position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2]) 96 | position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2]) 97 | 98 | return position_embedding 99 | -------------------------------------------------------------------------------- /kantts/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from distutils.version import LooseVersion 3 | 4 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7") 5 | 6 | 7 | def init_weights(m, mean=0.0, std=0.01): 8 | classname = m.__class__.__name__ 9 | if classname.find("Conv") != -1: 10 | m.weight.data.normal_(mean, std) 11 | 12 | 13 | def get_mask_from_lengths(lengths, max_len=None): 14 | batch_size = lengths.shape[0] 15 | if max_len is None: 16 | max_len = torch.max(lengths).item() 17 | 18 | ids = ( 19 | torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) 20 | ) 21 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 22 | 23 | return mask 24 | -------------------------------------------------------------------------------- /kantts/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/audio_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/audio_processor/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/audio_processor/core/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/fp_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
logging 3 | import random 4 | 5 | 6 | def is_fp_line(line): 7 | fp_category_list = ["FP", "I", "N", "Q"] 8 | elements = line.strip().split(" ") 9 | res = True 10 | for ele in elements: 11 | if ele not in fp_category_list: 12 | res = False 13 | break 14 | return res 15 | 16 | 17 | class FpProcessor: 18 | def __init__(self): 19 | # TODO: Add more audio processing methods. 20 | self.res = [] 21 | 22 | def is_fp_line(line): 23 | fp_category_list = ["FP", "I", "N", "Q"] 24 | elements = line.strip().split(" ") 25 | res = True 26 | for ele in elements: 27 | if ele not in fp_category_list: 28 | res = False 29 | break 30 | return res 31 | 32 | # TODO: adjust idx judgment rule 33 | def addfp(self, voice_output_dir, prosody, raw_metafile_lines): 34 | 35 | fp_category_list = ["FP", "I", "N"] 36 | 37 | f = open(prosody) 38 | prosody_lines = f.readlines() 39 | f.close() 40 | 41 | idx = "" 42 | fp = "" 43 | fp_label_dict = {} 44 | i = 0 45 | while i < len(prosody_lines): 46 | if len(prosody_lines[i].strip().split("\t")) == 2: 47 | idx = prosody_lines[i].strip().split("\t")[0] 48 | i += 1 49 | else: 50 | fp_enable = is_fp_line(prosody_lines[i]) 51 | if fp_enable: 52 | fp = prosody_lines[i].strip().split("\t")[0].split(" ") 53 | for label in fp: 54 | if label not in fp_category_list: 55 | logging.warning("fp label not in fp_category_list") 56 | break 57 | i += 4 58 | else: 59 | fp = [ 60 | "N" 61 | for _ in range( 62 | len( 63 | prosody_lines[i] 64 | .strip() 65 | .split("\t")[0] 66 | .replace("/ ", "") 67 | .replace(". ", "") 68 | .split(" ") 69 | ) 70 | ) 71 | ] 72 | i += 1 73 | fp_label_dict[idx] = fp 74 | 75 | fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt") 76 | f_out = open(fpadd_metafile, "w") 77 | for line in raw_metafile_lines: 78 | tokens = line.strip().split("\t") 79 | if len(tokens) == 2: 80 | uttname = tokens[0] 81 | symbol_sequences = tokens[1].split(" ") 82 | 83 | error_flag = False 84 | idx = 0 85 | out_str = uttname + "\t" 86 | 87 | for this_symbol_sequence in symbol_sequences: 88 | emotion = this_symbol_sequence.split("$")[4] 89 | this_symbol_sequence = this_symbol_sequence.replace( 90 | emotion, "emotion_neutral" 91 | ) 92 | 93 | if idx < len(fp_label_dict[uttname]): 94 | if fp_label_dict[uttname][idx] == "FP": 95 | if "none" not in this_symbol_sequence: 96 | this_symbol_sequence = this_symbol_sequence.replace( 97 | "emotion_neutral", "emotion_disgust" 98 | ) 99 | syllable_label = this_symbol_sequence.split("$")[2] 100 | if syllable_label == "s_both" or syllable_label == "s_end": 101 | idx += 1 102 | elif idx > len(fp_label_dict[uttname]): 103 | logging.warning(uttname + " not match") 104 | error_flag = True 105 | out_str = out_str + this_symbol_sequence + " " 106 | 107 | # if idx != len(fp_label_dict[uttname]): 108 | # logging.warning( 109 | # "{} length mismatch, length: {} ".format( 110 | # idx, len(fp_label_dict[uttname]) 111 | # ) 112 | # ) 113 | 114 | if not error_flag: 115 | f_out.write(out_str.strip() + "\n") 116 | f_out.close() 117 | return fpadd_metafile 118 | 119 | def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines): 120 | 121 | f = open(fpadd_metafile) 122 | fpadd_metafile_lines = f.readlines() 123 | f.close() 124 | 125 | fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt") 126 | f_out = open(fprm_metafile, "w") 127 | for i in range(len(raw_metafile_lines)): 128 | tokens = raw_metafile_lines[i].strip().split("\t") 129 | symbol_sequences = tokens[1].split(" ") 130 | fpadd_tokens = 
fpadd_metafile_lines[i].strip().split("\t") 131 | fpadd_symbol_sequences = fpadd_tokens[1].split(" ") 132 | 133 | error_flag = False 134 | out_str = tokens[0] + "\t" 135 | idx = 0 136 | length = len(symbol_sequences) 137 | while idx < length: 138 | if "$emotion_disgust" in fpadd_symbol_sequences[idx]: 139 | if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]: 140 | idx = idx + 2 141 | else: 142 | idx = idx + 1 143 | continue 144 | out_str = out_str + symbol_sequences[idx] + " " 145 | idx = idx + 1 146 | 147 | if not error_flag: 148 | f_out.write(out_str.strip() + "\n") 149 | f_out.close() 150 | 151 | def process(self, voice_output_dir, prosody, raw_metafile): 152 | 153 | with open(raw_metafile, "r") as f: 154 | lines = f.readlines() 155 | random.shuffle(lines) 156 | 157 | fpadd_metafile = self.addfp(voice_output_dir, prosody, lines) 158 | self.removefp(voice_output_dir, fpadd_metafile, lines) 159 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/PosSet.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 1 5 | a 6 | todo 7 | 8 | 9 | 2 10 | b 11 | todo 12 | 13 | 14 | 3 15 | c 16 | todo 17 | 18 | 19 | 4 20 | d 21 | todo 22 | 23 | 24 | 5 25 | e 26 | todo 27 | 28 | 29 | 6 30 | f 31 | todo 32 | 33 | 34 | 7 35 | g 36 | todo 37 | 38 | 39 | 8 40 | gb 41 | todo 42 | 43 | 44 | 45 | 46 | 9 47 | h 48 | todo 49 | 50 | 51 | 10 52 | i 53 | todo 54 | 55 | 56 | 11 57 | j 58 | todo 59 | 60 | 61 | 12 62 | k 63 | todo 64 | 65 | 66 | 13 67 | l 68 | todo 69 | 70 | 71 | 14 72 | m 73 | todo 74 | 75 | 76 | 15 77 | n 78 | todo 79 | 80 | 81 | 16 82 | nz 83 | todo 84 | 85 | 86 | 87 | 88 | 17 89 | o 90 | todo 91 | 92 | 93 | 18 94 | p 95 | todo 96 | 97 | 98 | 19 99 | q 100 | todo 101 | 102 | 103 | 20 104 | r 105 | todo 106 | 107 | 108 | 21 109 | s 110 | todo 111 | 112 | 113 | 22 114 | t 115 | todo 116 | 117 | 118 | 23 119 | u 120 | todo 121 | 122 | 123 | 24 124 | v 125 | todo 126 | 127 | 128 | 25 129 | w 130 | todo 131 | 132 | 133 | 26 134 | x 135 | todo 136 | 137 | 138 | 27 139 | y 140 | todo 141 | 142 | 143 | 28 144 | z 145 | todo 146 | 147 | 148 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/PinYin/tonelist.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 3 | 4 4 | 2 5 | 3 6 | 5 7 | 0 8 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/PosSet.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 1 5 | a 6 | todo 7 | 8 | 9 | 2 10 | b 11 | todo 12 | 13 | 14 | 3 15 | c 16 | todo 17 | 18 | 19 | 4 20 | d 21 | todo 22 | 23 | 24 | 5 25 | e 26 | todo 27 | 28 | 29 | 6 30 | f 31 | todo 32 | 33 | 34 | 7 35 | g 36 | todo 37 | 38 | 39 | 8 40 | gb 41 | todo 42 | 43 | 44 | 45 | 46 | 9 47 | h 48 | todo 49 | 50 | 51 | 10 52 | i 53 | todo 54 | 55 | 56 | 11 57 | j 58 | todo 59 | 60 | 61 | 12 62 | k 63 | todo 
64 | 65 | 66 | 13 67 | l 68 | todo 69 | 70 | 71 | 14 72 | m 73 | todo 74 | 75 | 76 | 15 77 | n 78 | todo 79 | 80 | 81 | 16 82 | nz 83 | todo 84 | 85 | 86 | 87 | 88 | 17 89 | o 90 | todo 91 | 92 | 93 | 18 94 | p 95 | todo 96 | 97 | 98 | 19 99 | q 100 | todo 101 | 102 | 103 | 20 104 | r 105 | todo 106 | 107 | 108 | 21 109 | s 110 | todo 111 | 112 | 113 | 22 114 | t 115 | todo 116 | 117 | 118 | 23 119 | u 120 | todo 121 | 122 | 123 | 24 124 | v 125 | todo 126 | 127 | 128 | 25 129 | w 130 | todo 131 | 132 | 133 | 26 134 | x 135 | todo 136 | 137 | 138 | 27 139 | y 140 | todo 141 | 142 | 143 | 28 144 | z 145 | todo 146 | 147 | 148 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/Sichuan/tonelist.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 3 | 4 4 | 2 5 | 3 6 | 5 7 | 0 8 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/WuuShanghai/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/WuuShanghai/tonelist.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 0 3 | 3 4 | 4 5 | 2 6 | 5 7 | 1 8 | 7 9 | 8 10 | 11 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/ZhHK/En2ChPhoneMap.txt: -------------------------------------------------------------------------------- 1 | wu w 2 | yi y 3 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/ZhHK/tonelist.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 0 3 | 3 4 | 4 5 | 2 6 | 5 7 | 7 8 | 1 9 | 8 10 | 9 11 | 12 | -------------------------------------------------------------------------------- /kantts/preprocess/languages/__init__.py: -------------------------------------------------------------------------------- 1 | languages = { 2 | "PinYin": { 3 | "phoneset_path": "PhoneSet.xml", 4 | "posset_path": "PosSet.xml", 5 | "f2t_map_path": "En2ChPhoneMap.txt", 6 | "s2p_map_path": "py2phoneMap.txt", 7 | "tonelist_path": "tonelist.txt", 8 | }, 9 | "ZhHK": { 10 | "phoneset_path": "PhoneSet.xml", 11 | "posset_path": "PosSet.xml", 12 | "f2t_map_path": "En2ChPhoneMap.txt", 13 | "s2p_map_path": "py2phoneMap.txt", 14 | "tonelist_path": "tonelist.txt", 15 | }, 16 | "WuuShanghai": { 17 | "phoneset_path": "PhoneSet.xml", 18 | "posset_path": "PosSet.xml", 19 | "f2t_map_path": "En2ChPhoneMap.txt", 20 | "s2p_map_path": "py2phoneMap.txt", 21 | "tonelist_path": "tonelist.txt", 22 | }, 23 | "Sichuan": { 24 | "phoneset_path": "PhoneSet.xml", 25 | "posset_path": "PosSet.xml", 26 | "f2t_map_path": "En2ChPhoneMap.txt", 27 | "s2p_map_path": "py2phoneMap.txt", 28 | "tonelist_path": "tonelist.txt", 29 | }, 30 | } 31 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Phone.py: 
-------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | from .core_types import PhoneCVType, PhoneIFType, PhoneUVType, PhoneAPType, PhoneAMType 3 | 4 | 5 | class Phone(XmlObj): 6 | def __init__(self): 7 | self.m_id = None 8 | self.m_name = None 9 | self.m_cv_type = PhoneCVType.NULL 10 | self.m_if_type = PhoneIFType.NULL 11 | self.m_uv_type = PhoneUVType.NULL 12 | self.m_ap_type = PhoneAPType.NULL 13 | self.m_am_type = PhoneAMType.NULL 14 | self.m_bnd = False 15 | 16 | def __str__(self): 17 | return self.m_name 18 | 19 | def Save(self): 20 | pass 21 | 22 | def Load(self, phone_node): 23 | ns = "{http://schemas.alibaba-inc.com/tts}" 24 | 25 | id_node = phone_node.find(ns + "id") 26 | self.m_id = int(id_node.text) 27 | 28 | name_node = phone_node.find(ns + "name") 29 | self.m_name = name_node.text 30 | 31 | cv_node = phone_node.find(ns + "cv") 32 | self.m_cv_type = PhoneCVType.parse(cv_node.text) 33 | 34 | if_node = phone_node.find(ns + "if") 35 | self.m_if_type = PhoneIFType.parse(if_node.text) 36 | 37 | uv_node = phone_node.find(ns + "uv") 38 | self.m_uv_type = PhoneUVType.parse(uv_node.text) 39 | 40 | ap_node = phone_node.find(ns + "ap") 41 | self.m_ap_type = PhoneAPType.parse(ap_node.text) 42 | 43 | am_node = phone_node.find(ns + "am") 44 | self.m_am_type = PhoneAMType.parse(am_node.text) 45 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/PhoneSet.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import logging 3 | 4 | from .XmlObj import XmlObj 5 | from .Phone import Phone 6 | 7 | 8 | class PhoneSet(XmlObj): 9 | def __init__(self, phoneset_path): 10 | self.m_phone_list = [] 11 | self.m_id_map = {} 12 | self.m_name_map = {} 13 | self.Load(phoneset_path) 14 | 15 | def Load(self, file_path): 16 | # alibaba tts xml namespace 17 | ns = "{http://schemas.alibaba-inc.com/tts}" 18 | 19 | phoneset_root = ET.parse(file_path).getroot() 20 | for phone_node in phoneset_root.findall(ns + "phone"): 21 | phone = Phone() 22 | phone.Load(phone_node) 23 | self.m_phone_list.append(phone) 24 | if phone.m_id in self.m_id_map: 25 | logging.error("PhoneSet.Load: duplicate id: %d", phone.m_id) 26 | self.m_id_map[phone.m_id] = phone 27 | 28 | if phone.m_name in self.m_name_map: 29 | logging.error("PhoneSet.Load duplicate name name: %s", phone.m_name) 30 | self.m_name_map[phone.m_name] = phone 31 | 32 | def Save(self): 33 | pass 34 | 35 | 36 | # if __name__ == "__main__": 37 | # import os 38 | # import sys 39 | # 40 | # phoneset = PhoneSet() 41 | # phoneset.Load(sys.argv[1]) 42 | # 43 | # for phone in phoneset.m_phone_list: 44 | # print(phone) 45 | # print(phone.m_id) 46 | # print(phone.m_name) 47 | # print(phone.m_cv_type) 48 | # print(phone.m_if_type) 49 | # print(phone.m_uv_type) 50 | # print(phone.m_ap_type) 51 | # print(phone.m_am_type) 52 | # print(phone.m_bnd) 53 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Pos.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | 4 | class Pos(XmlObj): 5 | def __init__(self): 6 | self.m_id = None 7 | self.m_name = None 8 | self.m_desc = None 9 | self.m_level = 1 10 | self.m_parent = None 11 | self.m_sub_pos_list = [] 12 | 13 | def __str__(self): 14 | return self.m_name 15 | 16 | def Save(self): 17 | pass 18 | 19 | def 
Load(self, pos_node): 20 | ns = "{http://schemas.alibaba-inc.com/tts}" 21 | 22 | id_node = pos_node.find(ns + "id") 23 | self.m_id = int(id_node.text) 24 | 25 | name_node = pos_node.find(ns + "name") 26 | self.m_name = name_node.text 27 | 28 | desc_node = pos_node.find(ns + "desc") 29 | self.m_desc = desc_node.text 30 | 31 | sub_node = pos_node.find(ns + "sub") 32 | if sub_node is not None: 33 | for sub_pos_node in sub_node.findall(ns + "pos"): 34 | sub_pos = Pos() 35 | sub_pos.Load(sub_pos_node) 36 | sub_pos.m_parent = self 37 | sub_pos.m_level = self.m_level + 1 38 | self.m_sub_pos_list.append(sub_pos) 39 | 40 | return 41 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/PosSet.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import logging 3 | 4 | from .XmlObj import XmlObj 5 | from .Pos import Pos 6 | 7 | 8 | class PosSet(XmlObj): 9 | def __init__(self, posset_path): 10 | self.m_pos_list = [] 11 | self.m_id_map = {} 12 | self.m_name_map = {} 13 | self.Load(posset_path) 14 | 15 | def Load(self, file_path): 16 | # alibaba tts xml namespace 17 | ns = "{http://schemas.alibaba-inc.com/tts}" 18 | 19 | posset_root = ET.parse(file_path).getroot() 20 | for pos_node in posset_root.findall(ns + "pos"): 21 | pos = Pos() 22 | pos.Load(pos_node) 23 | self.m_pos_list.append(pos) 24 | if pos.m_id in self.m_id_map: 25 | logging.error("PosSet.Load: duplicate id: %d", pos.m_id) 26 | self.m_id_map[pos.m_id] = pos 27 | 28 | if pos.m_name in self.m_name_map: 29 | logging.error("PosSet.Load duplicate name name: %s", pos.m_name) 30 | self.m_name_map[pos.m_name] = pos 31 | 32 | if len(pos.m_sub_pos_list) > 0: 33 | for sub_pos in pos.m_sub_pos_list: 34 | self.m_pos_list.append(sub_pos) 35 | if sub_pos.m_id in self.m_id_map: 36 | logging.error("PosSet.Load: duplicate id: %d", sub_pos.m_id) 37 | self.m_id_map[sub_pos.m_id] = sub_pos 38 | 39 | if sub_pos.m_name in self.m_name_map: 40 | logging.error( 41 | "PosSet.Load duplicate name name: %s", sub_pos.m_name 42 | ) 43 | self.m_name_map[sub_pos.m_name] = sub_pos 44 | 45 | def Save(self): 46 | pass 47 | 48 | 49 | # if __name__ == "__main__": 50 | # import os 51 | # import sys 52 | # 53 | # posset = PosSet() 54 | # posset.Load(sys.argv[1]) 55 | # 56 | # for pos in posset.m_pos_list: 57 | # print(pos) 58 | # print(pos.m_id) 59 | # print(pos.m_name) 60 | # print(pos.m_desc) 61 | # print(pos.m_level) 62 | # print(pos.m_parent) 63 | # if pos.m_sub_pos_list: 64 | # print("sub pos list:") 65 | # for sub_pos in pos.m_sub_pos_list: 66 | # print(sub_pos) 67 | # print(sub_pos.m_id) 68 | # print(sub_pos.m_name) 69 | # print(sub_pos.m_desc) 70 | # print(sub_pos.m_level) 71 | # print(sub_pos.m_parent) 72 | # print("sub pos list end") 73 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Script.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | import xml.etree.ElementTree as ET 4 | from xml.dom import minidom 5 | 6 | 7 | class Script(XmlObj): 8 | def __init__(self, phoneset, posset): 9 | self.m_phoneset = phoneset 10 | self.m_posset = posset 11 | self.m_items = [] 12 | 13 | def Save(self, outputXMLPath): 14 | root = ET.Element("script") 15 | 16 | root.set("uttcount", str(len(self.m_items))) 17 | root.set("xmlns", "http://schemas.alibaba-inc.com/tts") 18 | for item in self.m_items: 19 | 
item.Save(root) 20 | 21 | xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml( 22 | indent=" ", encoding="utf-8" 23 | ) 24 | with open(outputXMLPath, "wb") as f: 25 | f.write(xmlstr) 26 | 27 | def SaveMetafile(self): 28 | meta_lines = [] 29 | 30 | for item in self.m_items: 31 | meta_lines.append(item.SaveMetafile()) 32 | 33 | return meta_lines 34 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptItem.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | 5 | 6 | class ScriptItem(XmlObj): 7 | def __init__(self, phoneset, posset): 8 | if phoneset is None or posset is None: 9 | raise Exception("ScriptItem.__init__: phoneset or posset is None") 10 | self.m_phoneset = phoneset 11 | self.m_posset = posset 12 | 13 | self.m_id = None 14 | self.m_text = "" 15 | self.m_scriptSentence_list = [] 16 | self.m_status = None 17 | 18 | def Load(self): 19 | pass 20 | 21 | def Save(self, parent_node): 22 | utterance_node = ET.SubElement(parent_node, "utterance") 23 | utterance_node.set("id", self.m_id) 24 | 25 | text_node = ET.SubElement(utterance_node, "text") 26 | text_node.text = self.m_text 27 | 28 | for sentence in self.m_scriptSentence_list: 29 | sentence.Save(utterance_node) 30 | 31 | def SaveMetafile(self): 32 | meta_line = self.m_id + "\t" 33 | 34 | for sentence in self.m_scriptSentence_list: 35 | meta_line += sentence.SaveMetafile() 36 | 37 | return meta_line 38 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptSentence.py: -------------------------------------------------------------------------------- 1 | from .XmlObj import XmlObj 2 | 3 | import xml.etree.ElementTree as ET 4 | 5 | 6 | # TODO(jin): Not referenced, temporarily commented 7 | class WrittenSentence(XmlObj): 8 | def __init__(self, posset): 9 | self.m_written_word_list = [] 10 | self.m_written_mark_list = [] 11 | self.m_posset = posset 12 | self.m_align_list = [] 13 | self.m_alignCursor = 0 14 | self.m_accompanyIndex = 0 15 | self.m_sequence = "" 16 | self.m_text = "" 17 | 18 | def AddHost(self, writtenWord): 19 | self.m_written_word_list.append(writtenWord) 20 | self.m_align_list.append(self.m_alignCursor) 21 | 22 | def LoadHost(self): 23 | pass 24 | 25 | def SaveHost(self): 26 | pass 27 | 28 | def AddAccompany(self, writtenMark): 29 | self.m_written_mark_list.append(writtenMark) 30 | self.m_alignCursor += 1 31 | self.m_accompanyIndex += 1 32 | 33 | def SaveAccompany(self): 34 | pass 35 | 36 | def LoadAccompany(self): 37 | pass 38 | 39 | # Get the mark span corresponding to specific spoken word 40 | def GetAccompanySpan(self, host_index): 41 | if host_index == -1: 42 | return (0, self.m_align_list[0]) 43 | 44 | accompany_begin = self.m_align_list[host_index] 45 | accompany_end = ( 46 | self.m_align_list[host_index + 1] 47 | if host_index + 1 < len(self.m_written_word_list) 48 | else len(self.m_written_mark_list) 49 | ) 50 | 51 | return (accompany_begin, accompany_end) 52 | 53 | # TODO: iterable 54 | def GetElements(self): 55 | accompany_begin, accompany_end = self.GetAccompanySpan(-1) 56 | res_lst = [ 57 | self.m_written_mark_list[i] for i in range(accompany_begin, accompany_end) 58 | ] 59 | 60 | for j in range(len(self.m_written_word_list)): 61 | accompany_begin, accompany_end = self.GetAccompanySpan(j) 62 | res_lst.extend([self.m_written_word_list[j]]) 63 | 
res_lst.extend( 64 | [ 65 | self.m_written_mark_list[i] 66 | for i in range(accompany_begin, accompany_end) 67 | ] 68 | ) 69 | 70 | return res_lst 71 | 72 | def BuildSequence(self): 73 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()]) 74 | 75 | def BuildText(self): 76 | self.m_text = "".join([str(ele) for ele in self.GetElements()]) 77 | 78 | 79 | class SpokenSentence(XmlObj): 80 | def __init__(self, phoneset): 81 | self.m_spoken_word_list = [] 82 | self.m_spoken_mark_list = [] 83 | self.m_phoneset = phoneset 84 | self.m_align_list = [] 85 | self.m_alignCursor = 0 86 | self.m_accompanyIndex = 0 87 | self.m_sequence = "" 88 | self.m_text = "" 89 | 90 | def __len__(self): 91 | return len(self.m_spoken_word_list) 92 | 93 | def AddHost(self, spokenWord): 94 | self.m_spoken_word_list.append(spokenWord) 95 | self.m_align_list.append(self.m_alignCursor) 96 | 97 | def SaveHost(self): 98 | pass 99 | 100 | def LoadHost(self): 101 | pass 102 | 103 | def AddAccompany(self, spokenMark): 104 | self.m_spoken_mark_list.append(spokenMark) 105 | self.m_alignCursor += 1 106 | self.m_accompanyIndex += 1 107 | 108 | def SaveAccompany(self): 109 | pass 110 | 111 | # Get the mark span corresponding to specific spoken word 112 | def GetAccompanySpan(self, host_index): 113 | if host_index == -1: 114 | return (0, self.m_align_list[0]) 115 | 116 | accompany_begin = self.m_align_list[host_index] 117 | accompany_end = ( 118 | self.m_align_list[host_index + 1] 119 | if host_index + 1 < len(self.m_spoken_word_list) 120 | else len(self.m_spoken_mark_list) 121 | ) 122 | 123 | return (accompany_begin, accompany_end) 124 | 125 | # TODO: iterable 126 | def GetElements(self): 127 | accompany_begin, accompany_end = self.GetAccompanySpan(-1) 128 | res_lst = [ 129 | self.m_spoken_mark_list[i] for i in range(accompany_begin, accompany_end) 130 | ] 131 | 132 | for j in range(len(self.m_spoken_word_list)): 133 | accompany_begin, accompany_end = self.GetAccompanySpan(j) 134 | res_lst.extend([self.m_spoken_word_list[j]]) 135 | res_lst.extend( 136 | [ 137 | self.m_spoken_mark_list[i] 138 | for i in range(accompany_begin, accompany_end) 139 | ] 140 | ) 141 | 142 | return res_lst 143 | 144 | def LoadAccompany(self): 145 | pass 146 | 147 | def BuildSequence(self): 148 | self.m_sequence = " ".join([str(ele) for ele in self.GetElements()]) 149 | 150 | def BuildText(self): 151 | self.m_text = "".join([str(ele) for ele in self.GetElements()]) 152 | 153 | def Save(self, parent_node): 154 | spoken_node = ET.SubElement(parent_node, "spoken") 155 | spoken_node.set("wordcount", str(len(self.m_spoken_word_list))) 156 | 157 | text_node = ET.SubElement(spoken_node, "text") 158 | text_node.text = self.m_sequence 159 | 160 | # TODO: spoken mark might be used 161 | for word in self.m_spoken_word_list: 162 | word.Save(spoken_node) 163 | 164 | def SaveMetafile(self): 165 | meta_line_list = [word.SaveMetafile() for word in self.m_spoken_word_list] 166 | 167 | return " ".join(meta_line_list) 168 | 169 | 170 | class ScriptSentence(XmlObj): 171 | def __init__(self, phoneset, posset): 172 | self.m_phoneset = phoneset 173 | self.m_posset = posset 174 | self.m_writtenSentence = WrittenSentence(posset) 175 | self.m_spokenSentence = SpokenSentence(phoneset) 176 | self.m_text = "" 177 | 178 | def Save(self, parent_node): 179 | if len(self.m_spokenSentence) > 0: 180 | self.m_spokenSentence.Save(parent_node) 181 | 182 | def SaveMetafile(self): 183 | if len(self.m_spokenSentence) > 0: 184 | return self.m_spokenSentence.SaveMetafile() 185 | else: 
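            # Annotation (not in the original source): a sentence with no spoken
            # words contributes an empty string here, so ScriptItem.SaveMetafile
            # emits only the utterance id followed by a tab for such items.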
186 | return "" 187 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/ScriptWord.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | from .core_types import Language 5 | from .Syllable import SyllableList 6 | 7 | 8 | # TODO(Jin): Not referenced, temporarily commented 9 | class WrittenWord(XmlObj): 10 | def __init__(self): 11 | self.m_name = None 12 | self.m_POS = None 13 | 14 | def __str__(self): 15 | return self.m_name 16 | 17 | def Load(self): 18 | pass 19 | 20 | def Save(self): 21 | pass 22 | 23 | 24 | class WrittenMark(XmlObj): 25 | def __init__(self): 26 | self.m_punctuation = None 27 | 28 | def __str__(self): 29 | return self.m_punctuation 30 | 31 | def Load(self): 32 | pass 33 | 34 | def Save(self): 35 | pass 36 | 37 | 38 | class SpokenWord(XmlObj): 39 | def __init__(self): 40 | self.m_name = None 41 | self.m_language = None 42 | self.m_syllable_list = [] 43 | self.m_breakText = "1" 44 | self.m_POS = "0" 45 | 46 | def __str__(self): 47 | return self.m_name 48 | 49 | def Load(self): 50 | pass 51 | 52 | def Save(self, parent_node): 53 | 54 | word_node = ET.SubElement(parent_node, "word") 55 | 56 | name_node = ET.SubElement(word_node, "name") 57 | name_node.text = self.m_name 58 | 59 | if ( 60 | len(self.m_syllable_list) > 0 61 | and self.m_syllable_list[0].m_language != Language.Neutral 62 | ): 63 | language_node = ET.SubElement(word_node, "lang") 64 | language_node.text = self.m_syllable_list[0].m_language.name 65 | 66 | SyllableList(self.m_syllable_list).Save(word_node) 67 | 68 | break_node = ET.SubElement(word_node, "break") 69 | break_node.text = self.m_breakText 70 | 71 | POS_node = ET.SubElement(word_node, "POS") 72 | POS_node.text = self.m_POS 73 | 74 | return 75 | 76 | def SaveMetafile(self): 77 | word_phone_cnt = sum( 78 | [syllable.PhoneCount() for syllable in self.m_syllable_list] 79 | ) 80 | word_syllable_cnt = len(self.m_syllable_list) 81 | single_syllable_word = word_syllable_cnt == 1 82 | meta_line_list = [] 83 | 84 | for idx, syll in enumerate(self.m_syllable_list): 85 | if word_phone_cnt == 1: 86 | word_pos = "word_both" 87 | elif idx == 0: 88 | word_pos = "word_begin" 89 | elif idx == len(self.m_syllable_list) - 1: 90 | word_pos = "word_end" 91 | else: 92 | word_pos = "word_middle" 93 | meta_line_list.append( 94 | syll.SaveMetafile(word_pos, single_syllable_word=single_syllable_word) 95 | ) 96 | 97 | if self.m_breakText != "0" and self.m_breakText is not None: 98 | meta_line_list.append( 99 | "{{#{}$tone_none$s_none$word_none}}".format(self.m_breakText) 100 | ) 101 | 102 | return " ".join(meta_line_list) 103 | 104 | 105 | class SpokenMark(XmlObj): 106 | def __init__(self): 107 | self.m_breakLevel = None 108 | 109 | def BreakLevel2Text(self): 110 | return "#" + str(self.m_breakLevel.value) 111 | 112 | def __str__(self): 113 | return self.BreakLevel2Text() 114 | 115 | def Load(self): 116 | pass 117 | 118 | def Save(self): 119 | pass 120 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/Syllable.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | from .XmlObj import XmlObj 4 | 5 | 6 | class Syllable(XmlObj): 7 | def __init__(self): 8 | self.m_phone_list = [] 9 | self.m_tone = None 10 | self.m_language = None 11 | self.m_breaklevel = None 
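        # Annotation (not in the original source): SaveMetafile below serializes
        # each phone as "{name$tone<T>$<syllable_pos>$<word_pos>}"; e.g. a
        # single-phone first syllable "a" with tone 1 in a multi-syllable word is
        # rendered as "{a$tone1$s_both$word_begin}".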
12 | 13 | def PronunciationText(self): 14 | return " ".join([str(phone) for phone in self.m_phone_list]) 15 | 16 | def PhoneCount(self): 17 | return len(self.m_phone_list) 18 | 19 | def ToneText(self): 20 | return str(self.m_tone.value) 21 | 22 | def Save(self): 23 | pass 24 | 25 | def Load(self): 26 | pass 27 | 28 | def GetPhoneMeta( 29 | self, phone_name, word_pos, syll_pos, tone_text, single_syllable_word=False 30 | ): 31 | # Special case: word with single syllable, the last phone's word_pos should be "word_end" 32 | if word_pos == "word_begin" and syll_pos == "s_end" and single_syllable_word: 33 | word_pos = "word_end" 34 | elif word_pos == "word_begin" and syll_pos not in [ 35 | "s_begin", 36 | "s_both", 37 | ]: # FIXME: keep accord with Engine logic 38 | word_pos = "word_middle" 39 | elif word_pos == "word_end" and syll_pos not in ["s_end", "s_both"]: 40 | word_pos = "word_middle" 41 | else: 42 | pass 43 | 44 | return "{{{}$tone{}${}${}}}".format(phone_name, tone_text, syll_pos, word_pos) 45 | 46 | def SaveMetafile(self, word_pos, single_syllable_word=False): 47 | syllable_phone_cnt = len(self.m_phone_list) 48 | 49 | meta_line_list = [] 50 | 51 | for idx, phone in enumerate(self.m_phone_list): 52 | if syllable_phone_cnt == 1: 53 | syll_pos = "s_both" 54 | elif idx == 0: 55 | syll_pos = "s_begin" 56 | elif idx == len(self.m_phone_list) - 1: 57 | syll_pos = "s_end" 58 | else: 59 | syll_pos = "s_middle" 60 | meta_line_list.append( 61 | self.GetPhoneMeta( 62 | phone, 63 | word_pos, 64 | syll_pos, 65 | self.ToneText(), 66 | single_syllable_word=single_syllable_word, 67 | ) 68 | ) 69 | 70 | return " ".join(meta_line_list) 71 | 72 | 73 | class SyllableList(XmlObj): 74 | def __init__(self, syllables): 75 | self.m_syllable_list = syllables 76 | 77 | def __len__(self): 78 | return len(self.m_syllable_list) 79 | 80 | def __index__(self, index): 81 | return self.m_syllable_list[index] 82 | 83 | def PronunciationText(self): 84 | return " - ".join( 85 | [syllable.PronunciationText() for syllable in self.m_syllable_list] 86 | ) 87 | 88 | def ToneText(self): 89 | return "".join([syllable.ToneText() for syllable in self.m_syllable_list]) 90 | 91 | def Save(self, parent_node): 92 | syllable_node = ET.SubElement(parent_node, "syllable") 93 | syllable_node.set("syllcount", str(len(self.m_syllable_list))) 94 | 95 | phone_node = ET.SubElement(syllable_node, "phone") 96 | phone_node.text = self.PronunciationText() 97 | 98 | tone_node = ET.SubElement(syllable_node, "tone") 99 | tone_node.text = self.ToneText() 100 | 101 | return 102 | 103 | def Load(self): 104 | pass 105 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/XmlObj.py: -------------------------------------------------------------------------------- 1 | class XmlObj: 2 | def __init__(self): 3 | pass 4 | 5 | def Load(self): 6 | pass 7 | 8 | def Save(self): 9 | pass 10 | 11 | def LoadData(self): 12 | pass 13 | 14 | def SaveData(self): 15 | pass 16 | -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/script_convertor/core/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/script_convertor/core/utils.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | import codecs 4 | 5 | WordPattern = r"((?P\w+)(\(\w+\))?)" 6 | BreakPattern = r"(?P(\*?#(?P[0-4])))" 7 | MarkPattern = r"(?P[、,。!?:“”《》·])" 8 | POSPattern = r"(?P(\*?\|(?P[1-9])))" 9 | PhraseTonePattern = r"(?P(\*?%([L|H])))" 10 | 11 | NgBreakPattern = r"^ng(?P\d)" 12 | 13 | 14 | RegexWord = re.compile(WordPattern + r"\s*") 15 | RegexBreak = re.compile(BreakPattern + r"\s*") 16 | RegexID = re.compile(r"^(?P.*?)\s") 17 | RegexSentence = re.compile( 18 | r"({}|{}|{}|{}|{})\s*".format( 19 | WordPattern, BreakPattern, MarkPattern, POSPattern, PhraseTonePattern 20 | ) 21 | ) 22 | RegexForeignLang = re.compile(r"[A-Z@]") 23 | RegexSpace = re.compile(r"^\s*") 24 | RegexNeutralTone = re.compile(r"[1-5]5") 25 | 26 | 27 | def do_character_normalization(line): 28 | return unicodedata.normalize("NFKC", line) 29 | 30 | 31 | def do_prosody_text_normalization(line): 32 | tokens = line.split("\t") 33 | text = tokens[1] 34 | # Remove punctuations 35 | text = text.replace(u"。", " ") 36 | text = text.replace(u"、", " ") 37 | text = text.replace(u"“", " ") 38 | text = text.replace(u"”", " ") 39 | text = text.replace(u"‘", " ") 40 | text = text.replace(u"’", " ") 41 | text = text.replace(u"|", " ") 42 | text = text.replace(u"《", " ") 43 | text = text.replace(u"》", " ") 44 | text = text.replace(u"【", " ") 45 | text = text.replace(u"】", " ") 46 | text = text.replace(u"—", " ") 47 | text = text.replace(u"―", " ") 48 | text = text.replace(".", " ") 49 | text = text.replace("!", " ") 50 | text = text.replace("?", " ") 51 | text = text.replace("(", " ") 52 | text = text.replace(")", " ") 53 | text = text.replace("[", " ") 54 | text = text.replace("]", " ") 55 | text = text.replace("{", " ") 56 | text = text.replace("}", " ") 57 | text = text.replace("~", " ") 58 | text = text.replace(":", " ") 59 | text = text.replace(";", " ") 60 | text = text.replace("+", " ") 61 | text = text.replace(",", " ") 62 | # text = text.replace('·', ' ') 63 | text = text.replace('"', " ") 64 | text = text.replace( 65 | "-", "" 66 | ) # don't replace by space because compond word like two-year-old 67 | text = text.replace( 68 | "'", "" 69 | ) # don't replace by space because English word like that's 70 | 71 | # Replace break 72 | text = text.replace("/", "#2") 73 | text = text.replace("%", "#3") 74 | # Remove useless spaces surround #2 #3 #4 75 | text = re.sub(r"(#\d)[ ]+", r"\1", text) 76 | text = re.sub(r"[ ]+(#\d)", r"\1", text) 77 | # Replace space by #1 78 | text = re.sub("[ ]+", "#1", text) 79 | 80 | # Remove break at the end of the text 81 | text = re.sub(r"#\d$", "", text) 82 | 83 | # Add #1 between target language and foreign language 84 | text = re.sub(r"([a-zA-Z])([^a-zA-Z\d\#\s\'\%\/\-])", r"\1#1\2", text) 85 | text = re.sub(r"([^a-zA-Z\d\#\s\'\%\/\-])([a-zA-Z])", r"\1#1\2", text) 86 | 87 | return tokens[0] + "\t" + text 88 | 89 | 90 | def is_fp_line(line): 91 | fp_category_list = ["FP", "I", "N", "Q"] 92 | elements = line.strip().split(" ") 93 | res = True 94 | for ele in elements: 95 | if ele not in fp_category_list: 96 | res = False 97 | break 98 | return res 99 | 100 | 101 | def format_prosody(src_prosody): 102 | formatted_lines = [] 103 | with codecs.open(src_prosody, "r", "utf-8") as f: 104 | lines = f.readlines() 105 | 106 | idx = 0 107 | while idx < len(lines): 108 | line = do_character_normalization(lines[idx]) 109 | 110 | if len(line.strip().split("\t")) == 2: 111 | line = 
do_prosody_text_normalization(line) 112 | else: 113 | fp_enable = is_fp_line(line) 114 | if fp_enable: 115 | idx += 3 116 | continue 117 | formatted_lines.append(line) 118 | idx += 1 119 | # with codecs.open(tgt_prosody, 'w', 'utf-8') as f: 120 | # f.writelines(formatted_lines) 121 | return formatted_lines 122 | -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/D_TDNN.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | from .layers import (DenseLayer, DenseTDNNBlock, StatsPool, TDNNLayer, SEDenseTDNNBlock, 8 | TransitLayer) 9 | 10 | class BasicBlock(nn.Module): 11 | expansion = 1 12 | 13 | def __init__(self, in_planes, planes, stride=1): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = nn.Conv2d(in_planes, 16 | planes, 17 | kernel_size=3, 18 | stride=(stride, 1), 19 | padding=1, 20 | bias=False) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | self.conv2 = nn.Conv2d(planes, 23 | planes, 24 | kernel_size=3, 25 | stride=1, 26 | padding=1, 27 | bias=False) 28 | self.bn2 = nn.BatchNorm2d(planes) 29 | 30 | self.shortcut = nn.Sequential() 31 | if stride != 1 or in_planes != self.expansion * planes: 32 | self.shortcut = nn.Sequential( 33 | nn.Conv2d(in_planes, 34 | self.expansion * planes, 35 | kernel_size=1, 36 | stride=(stride, 1), 37 | bias=False), 38 | nn.BatchNorm2d(self.expansion * planes)) 39 | 40 | def forward(self, x): 41 | out = F.relu(self.bn1(self.conv1(x))) 42 | out = self.bn2(self.conv2(out)) 43 | out += self.shortcut(x) 44 | out = F.relu(out) 45 | return out 46 | 47 | class CNN_Head(nn.Module): 48 | def __init__(self, 49 | block=BasicBlock, 50 | num_blocks=[2, 2], 51 | m_channels=32, 52 | feat_dim=80): 53 | super(CNN_Head, self).__init__() 54 | self.in_planes = m_channels 55 | self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) 56 | self.bn1 = nn.BatchNorm2d(m_channels) 57 | 58 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 59 | self.layer2 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 60 | 61 | self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False) 62 | self.bn2 = nn.BatchNorm2d(m_channels) 63 | self.out_channels = m_channels * (feat_dim // 8) 64 | 65 | def _make_layer(self, block, planes, num_blocks, stride): 66 | strides = [stride] + [1] * (num_blocks - 1) 67 | layers = [] 68 | for stride in strides: 69 | layers.append(block(self.in_planes, planes, stride)) 70 | self.in_planes = planes * block.expansion 71 | return nn.Sequential(*layers) 72 | 73 | def forward(self, x): 74 | x = x.unsqueeze_(1) 75 | out = F.relu(self.bn1(self.conv1(x))) 76 | out = self.layer1(out) 77 | out = self.layer2(out) 78 | out = F.relu(self.bn2(self.conv2(out))) 79 | 80 | out = out.reshape(out.shape[0], out.shape[1]*out.shape[2], out.shape[3]) 81 | return out 82 | 83 | class DTDNN(nn.Module): 84 | def __init__(self, 85 | feat_dim=80, 86 | embedding_size=192, 87 | growth_rate=32, 88 | bn_size=4, 89 | init_channels=128, 90 | config_str='batchnorm-relu', 91 | memory_efficient=True): 92 | super(DTDNN, self).__init__() 93 | 94 | self.head = CNN_Head() 95 | feat_dim = self.head.out_channels 96 | 97 | self.xvector = nn.Sequential( 98 | OrderedDict([ 99 | ('tdnn', 100 | TDNNLayer(feat_dim, 101 | init_channels, 102 | 5, 103 | stride=2, 104 | dilation=1, 105 | 
padding=-1, 106 | config_str=config_str)), 107 | ])) 108 | channels = init_channels 109 | for i, (num_layers, kernel_size, 110 | dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 3))): 111 | block = SEDenseTDNNBlock(num_layers=num_layers, 112 | in_channels=channels, 113 | out_channels=growth_rate, 114 | bn_channels=bn_size * growth_rate, 115 | kernel_size=kernel_size, 116 | dilation=dilation, 117 | config_str=config_str, 118 | memory_efficient=memory_efficient) 119 | self.xvector.add_module('block%d' % (i + 1), block) 120 | channels = channels + num_layers * growth_rate 121 | self.xvector.add_module( 122 | 'transit%d' % (i + 1), 123 | TransitLayer(channels, 124 | channels // 2, 125 | bias=False, 126 | config_str=config_str)) 127 | channels //= 2 128 | 129 | self.bn = nn.BatchNorm1d(channels) 130 | self.relu = nn.ReLU(inplace=True) 131 | 132 | self.xvector.add_module('stats', StatsPool()) 133 | self.xvector.add_module( 134 | 'dense', 135 | DenseLayer(channels * 2, embedding_size, config_str='batchnorm_')) 136 | 137 | for m in self.modules(): 138 | if isinstance(m, (nn.Conv1d, nn.Linear)): 139 | nn.init.kaiming_normal_(m.weight.data) 140 | if m.bias is not None: 141 | nn.init.zeros_(m.bias) 142 | 143 | def forward(self, x): 144 | x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) 145 | x = self.head(x) 146 | x = self.xvector.tdnn(x) 147 | 148 | x = self.xvector.block1(x) 149 | x = self.xvector.transit1(x) 150 | 151 | x = self.xvector.block2(x) 152 | x = self.xvector.transit2(x) 153 | 154 | x = self.xvector.block3(x) 155 | x = self.xvector.transit3(x) 156 | x = self.relu(self.bn(x)) 157 | 158 | x = self.xvector.stats(x) 159 | x = self.xvector.dense(x) 160 | return x 161 | 162 | -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/preprocess/se_processor/__init__.py -------------------------------------------------------------------------------- /kantts/preprocess/se_processor/se_processor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import numpy as np 4 | import os 5 | import torchaudio.compliance.kaldi as Kaldi 6 | from .D_TDNN import DTDNN 7 | import logging 8 | import argparse 9 | from glob import glob 10 | 11 | 12 | logging.basicConfig( 13 | format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s", 14 | datefmt="%Y-%m-%d:%H:%M:%S", 15 | level=logging.DEBUG, 16 | ) 17 | 18 | class SpeakerEmbeddingProcessor: 19 | def __init__(self, sample_rate=16000): 20 | self.sample_rate = sample_rate 21 | self.min_wav_length = self.sample_rate * 30 * 10 / 1000 22 | 23 | self.pcm_dict = {} 24 | self.mfcc_dict = {} 25 | self.se_list = [] 26 | 27 | def process(self, src_voice_dir, se_model): 28 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extractor started") 29 | 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | model = DTDNN() 32 | try: 33 | if os.path.basename(se_model) == "se.model": 34 | model.load_state_dict(torch.load(se_model, map_location=device)) 35 | else: 36 | raise Exception("[SpeakerEmbeddingProcessor] se model loading error!!!") 37 | except Exception as e: 38 | logging.info(e) 39 | if os.path.basename(se_model) == 'se.onnx': 40 | logging.info("[SpeakerEmbeddingProcessor] please update your se 
model to ensure that the version is greater than or equal to 1.0.5") 41 | raise SystemExit(1) # fix: "sys" is never imported in this module, so sys.exit() raised a NameError 42 | model.eval() 43 | model.to(device) 44 | 45 | wav_dir = os.path.join(src_voice_dir, "wav") 46 | se_dir = os.path.join(src_voice_dir, "se") 47 | se_average_file = os.path.join(se_dir, "se.npy") 48 | 49 | os.makedirs(se_dir, exist_ok=True) 50 | 51 | wav_files = glob(os.path.join(wav_dir, '*.wav')) 52 | 53 | 54 | for wav_file in wav_files: 55 | basename = os.path.splitext(os.path.basename(wav_file))[0] 56 | se_file = os.path.join(se_dir, basename + '.npy') 57 | 58 | wav, fs = torchaudio.load(wav_file) 59 | assert wav.shape[0] == 1 60 | assert fs == 16000 61 | 62 | if wav.shape[1] < self.min_wav_length: 63 | continue 64 | 65 | fbank_feat = Kaldi.fbank(wav, num_mel_bins=80) 66 | 67 | feat = fbank_feat - fbank_feat.mean(dim=0, keepdim=True) 68 | feat = feat.unsqueeze(0).to(device) 69 | 70 | speaker_embedding = model(feat) 71 | speaker_embedding = speaker_embedding.squeeze().cpu().detach().numpy() 72 | speaker_embedding = np.expand_dims(speaker_embedding, axis=0) 73 | 74 | 75 | np.save(se_file, speaker_embedding) 76 | self.se_list.append(speaker_embedding) 77 | self.se_average = np.expand_dims( 78 | np.mean( 79 | np.concatenate(self.se_list, axis=0), 80 | axis=0 81 | ), 82 | axis=0 83 | ) 84 | np.save(se_average_file, self.se_average) 85 | 86 | logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extracted successfully!") 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser(description="Speaker Embedding Processor") 91 | parser.add_argument("--src_voice_dir", type=str, required=True) 92 | parser.add_argument('--se_model', required=True) 93 | args = parser.parse_args() 94 | 95 | sep = SpeakerEmbeddingProcessor() 96 | sep.process(args.src_voice_dir, args.se_model) # fix: the parser defines --se_model; "args.se_onnx" did not exist -------------------------------------------------------------------------------- /kantts/preprocess/text_process.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import argparse 5 | import yaml 6 | import time 7 | import zipfile 8 | 9 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # NOQA: E402 10 | sys.path.insert(0, os.path.dirname(ROOT_PATH)) # NOQA: E402 11 | 12 | try: 13 | from kantts.datasets.dataset import BERT_Text_Dataset 14 | from kantts.utils.log import logging_to_file, get_git_revision_hash 15 | from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols 16 | except ImportError: 17 | raise ImportError("Please install kantts.") 18 | 19 | logging.basicConfig( 20 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 21 | datefmt="%Y-%m-%d:%H:%M:%S", 22 | level=logging.INFO, 23 | ) 24 | 25 | 26 | def gen_metafile( 27 | output_dir, 28 | split_ratio=0.98, 29 | ): 30 | raw_metafile = os.path.join(output_dir, "raw_metafile.txt") 31 | bert_train_meta = os.path.join(output_dir, "bert_train.lst") 32 | bert_valid_meta = os.path.join(output_dir, "bert_valid.lst") 33 | if not os.path.exists( 34 | bert_train_meta) or not os.path.exists(bert_valid_meta): 35 | BERT_Text_Dataset.gen_metafile(raw_metafile, output_dir, split_ratio) 36 | logging.info("BERT Text metafile generated.") 37 | 38 | # TODO: Zh-CN as default 39 | def process_mit_style_data( 40 | text_file, 41 | resources_zip_file, 42 | output_dir, 43 | ): 44 | os.makedirs(output_dir, exist_ok=True) 45 | logging_to_file(os.path.join(output_dir, "data_process_stdout.log")) 46 | 47 | resource_root_dir =
os.path.dirname(resources_zip_file) 48 | resource_dir = os.path.join(resource_root_dir, "resource") 49 | 50 | if not os.path.exists(resource_dir): 51 | logging.info("Extracting resources...") 52 | with zipfile.ZipFile(resources_zip_file, "r") as zip_ref: 53 | zip_ref.extractall(resource_root_dir) 54 | 55 | with open(text_file, "r") as text_data: 56 | texts = text_data.readlines() 57 | 58 | logging.info("Converting text to symbols...") 59 | symbols_lst = text_to_symbols(texts, resource_dir, "F7") 60 | symbols_file = os.path.join(output_dir, "raw_metafile.txt") 61 | with open(symbols_file, "w") as symbol_data: 62 | for symbol in symbols_lst: 63 | symbol_data.write(symbol) 64 | 65 | logging.info("Processing done.") 66 | 67 | # Generate BERT Text metafile 68 | # TODO: train/valid ratio setting 69 | gen_metafile(output_dir) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser(description="Dataset preprocessor") 74 | parser.add_argument("--text_file", type=str, required=True) 75 | parser.add_argument("--resources_zip_file", type=str, required=True) 76 | parser.add_argument("--output_dir", type=str, required=True) 77 | 78 | args = parser.parse_args() 79 | 80 | process_mit_style_data( 81 | args.text_file, 82 | args.resources_zip_file, 83 | args.output_dir, 84 | ) 85 | 86 | -------------------------------------------------------------------------------- /kantts/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/train/__init__.py -------------------------------------------------------------------------------- /kantts/train/scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import * # NOQA 2 | from torch.optim.lr_scheduler import _LRScheduler # NOQA 3 | 4 | """Noam Scheduler.""" 5 | 6 | 7 | class FindLR(_LRScheduler): 8 | """ 9 | inspired by fast.ai @https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html 10 | """ 11 | 12 | def __init__(self, optimizer, max_steps, max_lr=10): 13 | self.max_steps = max_steps 14 | self.max_lr = max_lr 15 | super().__init__(optimizer) 16 | 17 | def get_lr(self): 18 | return [ 19 | base_lr 20 | * ((self.max_lr / base_lr) ** (self.last_epoch / (self.max_steps - 1))) 21 | for base_lr in self.base_lrs 22 | ] 23 | 24 | 25 | class NoamLR(_LRScheduler): 26 | """ 27 | Implements the Noam Learning rate schedule. This corresponds to increasing the learning rate 28 | linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally 29 | to the inverse square root of the step number, scaled by the inverse square root of the 30 | dimensionality of the model. Time will tell if this is just madness or it's actually important. 31 | Parameters 32 | ---------- 33 | warmup_steps: ``int``, required. 34 | The number of steps to linearly increase the learning rate. 
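    Note (added annotation, matching get_lr below): for step t, clamped to >= 1,
        lr(t) = base_lr * warmup_steps**0.5 * min(t**-0.5, t * warmup_steps**-1.5)
    so lr rises linearly to base_lr at t = warmup_steps, then decays as t**-0.5.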
--------------------------------------------------------------------------------
/kantts/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/train/__init__.py
--------------------------------------------------------------------------------
/kantts/train/scheduler.py:
--------------------------------------------------------------------------------
1 | from torch.optim.lr_scheduler import *  # NOQA
2 | from torch.optim.lr_scheduler import _LRScheduler  # NOQA
3 | 
4 | """Noam Scheduler."""
5 | 
6 | 
7 | class FindLR(_LRScheduler):
8 |     """
9 |     inspired by fast.ai @https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html
10 |     """
11 | 
12 |     def __init__(self, optimizer, max_steps, max_lr=10):
13 |         self.max_steps = max_steps
14 |         self.max_lr = max_lr
15 |         super().__init__(optimizer)
16 | 
17 |     def get_lr(self):
18 |         return [
19 |             base_lr
20 |             * ((self.max_lr / base_lr) ** (self.last_epoch / (self.max_steps - 1)))
21 |             for base_lr in self.base_lrs
22 |         ]
23 | 
24 | 
25 | class NoamLR(_LRScheduler):
26 |     """
27 |     Implements the Noam learning rate schedule. This corresponds to increasing the learning rate
28 |     linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally
29 |     to the inverse square root of the step number, scaled by the inverse square root of the
30 |     dimensionality of the model. Time will tell if this is just madness or it's actually important.
31 |     Parameters
32 |     ----------
33 |     warmup_steps: ``int``, required.
34 |         The number of steps to linearly increase the learning rate.
35 |     """
36 | 
37 |     def __init__(self, optimizer, warmup_steps):
38 |         self.warmup_steps = warmup_steps
39 |         super().__init__(optimizer)
40 | 
41 |     def get_lr(self):
42 |         last_epoch = max(1, self.last_epoch)
43 |         scale = self.warmup_steps ** 0.5 * min(
44 |             last_epoch ** (-0.5), last_epoch * self.warmup_steps ** (-1.5)
45 |         )
46 |         return [base_lr * scale for base_lr in self.base_lrs]
47 | 
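A short worked example of the NoamLR schedule above, with illustrative hyperparameters: the scale factor at step t is `warmup_steps**0.5 * min(t**-0.5, t * warmup_steps**-1.5)`, which rises linearly, equals exactly 1.0 at `t == warmup_steps`, and decays as `t**-0.5` afterwards.

```python
# Minimal sketch of driving NoamLR; model and hyperparameters are illustrative.
import torch
from kantts.train.scheduler import NoamLR

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = NoamLR(optimizer, warmup_steps=4000)

for step in range(1, 8001):
    optimizer.step()   # actual training step elided
    scheduler.step()
    # scheduler.get_last_lr()[0] peaks at 1e-3 exactly at step 4000
```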
--------------------------------------------------------------------------------
/kantts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/kantts/utils/__init__.py
--------------------------------------------------------------------------------
/kantts/utils/audio_torch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import librosa
3 | from distutils.version import LooseVersion
4 | 
5 | is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")
6 | 
7 | 
8 | def stft(x, fft_size, hop_size, win_length, window):
9 |     """Perform STFT and convert to magnitude spectrogram.
10 | 
11 |     Args:
12 |         x (Tensor): Input signal tensor (B, T).
13 |         fft_size (int): FFT size.
14 |         hop_size (int): Hop size.
15 |         win_length (int): Window length.
16 |         window (str): Window function type.
17 | 
18 |     Returns:
19 |         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 | 
21 |     """
22 |     if is_pytorch_17plus:
23 |         x_stft = torch.stft(
24 |             x, fft_size, hop_size, win_length, window, return_complex=False
25 |         )
26 |     else:
27 |         x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
28 |     real = x_stft[..., 0]
29 |     imag = x_stft[..., 1]
30 | 
31 |     return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
32 | 
33 | 
34 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
35 |     return 20 * torch.log10(torch.clamp(x, min=clip_val) * C)
36 | 
37 | 
38 | def dynamic_range_decompression_torch(x, C=1):
39 |     return torch.pow(10.0, x * 0.05) / C
40 | 
41 | 
42 | def spectral_normalize_torch(
43 |     magnitudes,
44 |     min_level_db=-100.0,
45 |     ref_level_db=20.0,
46 |     norm_abs_value=4.0,
47 |     symmetric=True,
48 | ):
49 |     output = dynamic_range_compression_torch(magnitudes) - ref_level_db
50 | 
51 |     if symmetric:
52 |         return torch.clamp(
53 |             2 * norm_abs_value * ((output - min_level_db) / (-min_level_db))
54 |             - norm_abs_value,
55 |             min=-norm_abs_value,
56 |             max=norm_abs_value,
57 |         )
58 |     else:
59 |         return torch.clamp(
60 |             norm_abs_value * ((output - min_level_db) / (-min_level_db)),
61 |             min=0.0,
62 |             max=norm_abs_value,
63 |         )
64 | 
65 | 
66 | def spectral_de_normalize_torch(
67 |     magnitudes,
68 |     min_level_db=-100.0,
69 |     ref_level_db=20.0,
70 |     norm_abs_value=4.0,
71 |     symmetric=True,
72 | ):
73 |     if symmetric:
74 |         magnitudes = torch.clamp(magnitudes, min=-norm_abs_value, max=norm_abs_value)
75 |         magnitudes = (magnitudes + norm_abs_value) * (-min_level_db) / (
76 |             2 * norm_abs_value
77 |         ) + min_level_db
78 |     else:
79 |         magnitudes = torch.clamp(magnitudes, min=0.0, max=norm_abs_value)
80 |         magnitudes = (magnitudes) * (-min_level_db) / (norm_abs_value) + min_level_db
81 | 
82 |     output = dynamic_range_decompression_torch(magnitudes + ref_level_db)
83 |     return output
84 | 
85 | 
86 | class MelSpectrogram(torch.nn.Module):
87 |     """Calculate Mel-spectrogram."""
88 | 
89 |     def __init__(
90 |         self,
91 |         fs=22050,
92 |         fft_size=1024,
93 |         hop_size=256,
94 |         win_length=None,
95 |         window="hann",
96 |         num_mels=80,
97 |         fmin=80,
98 |         fmax=7600,
99 |         center=True,
100 |         normalized=False,
101 |         onesided=True,
102 |         eps=1e-10,
103 |         log_base=10.0,
104 |         pad_mode="constant",
105 |     ):
106 |         """Initialize MelSpectrogram module."""
107 |         super().__init__()
108 |         self.fft_size = fft_size
109 |         if win_length is None:
110 |             self.win_length = fft_size
111 |         else:
112 |             self.win_length = win_length
113 |         self.hop_size = hop_size
114 |         self.center = center
115 |         self.normalized = normalized
116 |         self.onesided = onesided
117 |         if window is not None and not hasattr(torch, f"{window}_window"):
118 |             raise ValueError(f"{window} window is not implemented")
119 |         self.window = window
120 |         self.eps = eps
121 |         self.pad_mode = pad_mode
122 | 
123 |         fmin = 0 if fmin is None else fmin
124 |         fmax = fs / 2 if fmax is None else fmax
125 |         melmat = librosa.filters.mel(
126 |             sr=fs,
127 |             n_fft=fft_size,
128 |             n_mels=num_mels,
129 |             fmin=fmin,
130 |             fmax=fmax,
131 |         )
132 |         self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
133 |         self.stft_params = {
134 |             "n_fft": self.fft_size,
135 |             "win_length": self.win_length,
136 |             "hop_length": self.hop_size,
137 |             "center": self.center,
138 |             "normalized": self.normalized,
139 |             "onesided": self.onesided,
140 |             "pad_mode": self.pad_mode,
141 |         }
142 |         if is_pytorch_17plus:
143 |             self.stft_params["return_complex"] = False
144 | 
145 |         self.log_base = log_base
146 |         if self.log_base is None:
147 |             self.log = torch.log
148 |         elif self.log_base == 2.0:
149 |             self.log = torch.log2
150 |         elif self.log_base == 10.0:
151 |             self.log = torch.log10
152 |         else:
153 |             raise ValueError(f"log_base: {log_base} is not supported.")
154 | 
155 |     def forward(self, x):
156 |         """Calculate Mel-spectrogram.
157 | 
158 |         Args:
159 |             x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
160 | 
161 |         Returns:
162 |             Tensor: Mel-spectrogram (B, #mels, #frames).
163 | 
164 |         """
165 |         if x.dim() == 3:
166 |             # (B, C, T) -> (B*C, T)
167 |             x = x.reshape(-1, x.size(2))
168 | 
169 |         if self.window is not None:
170 |             window_func = getattr(torch, f"{self.window}_window")
171 |             window = window_func(self.win_length, dtype=x.dtype, device=x.device)
172 |         else:
173 |             window = None
174 | 
175 |         x_stft = torch.stft(x, window=window, **self.stft_params)
176 |         # (B, #freqs, #frames, 2) -> (B, #frames, #freqs, 2)
177 |         x_stft = x_stft.transpose(1, 2)
178 |         x_power = x_stft[..., 0] ** 2 + x_stft[..., 1] ** 2
179 |         x_amp = torch.sqrt(torch.clamp(x_power, min=self.eps))
180 | 
181 |         x_mel = torch.matmul(x_amp, self.melmat)
182 |         x_mel = torch.clamp(x_mel, min=self.eps)
183 |         x_mel = spectral_normalize_torch(x_mel)
184 | 
185 |         # return self.log(x_mel).transpose(1, 2)
186 |         return x_mel.transpose(1, 2)
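A hedged usage sketch for the MelSpectrogram module above. The 24 kHz settings are illustrative, and the non-complex STFT path assumes the torch version this repo pins; input is (B, T) or (B, 1, T), output is (B, num_mels, #frames).

```python
# Illustrative parameters; run under the torch/librosa versions this repo expects.
import torch
from kantts.utils.audio_torch import MelSpectrogram

mel_fn = MelSpectrogram(fs=24000, fft_size=2048, hop_size=300, num_mels=80)
wav = torch.randn(1, 24000)   # one second of placeholder audio
mel = mel_fn(wav)             # -> (1, 80, #frames), ~81 frames at hop_size=300
print(mel.shape)
```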
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/__init__.py:
--------------------------------------------------------------------------------
1 | import ttsfrd
2 | 
3 | ENG_LANG_MAPPING = {
4 |     "PinYin": "zh-cn",
5 |     "English": "en-us",
6 |     "British": "en-gb",
7 |     "ZhHK": "hk_cantonese",
8 |     "Sichuan": "sichuan",
9 |     "Japanese": "japanese",
10 |     "WuuShangHai": "shanghai",
11 |     "Indonesian": "indonesian",
12 |     "Malay": "malay",
13 |     "Filipino": "filipino",
14 |     "Vietnamese": "vietnamese",
15 |     "Korean": "korean",
16 |     "Russian": "russian",
17 | }
18 | 
19 | 
20 | def text_to_mit_symbols(texts, resources_dir, speaker, lang="PinYin"):
21 |     fe = ttsfrd.TtsFrontendEngine()
22 |     fe.initialize(resources_dir)
23 |     fe.set_lang_type(ENG_LANG_MAPPING[lang])
24 | 
25 |     symbols_lst = []
26 |     for idx, text in enumerate(texts):
27 |         text = text.strip()
28 |         res = fe.gen_tacotron_symbols(text)
29 |         res = res.replace("F7", speaker)
30 |         sentences = res.split("\n")
31 |         for sentence in sentences:
32 |             arr = sentence.split("\t")
33 |             # skip the empty line
34 |             if len(arr) != 2:
35 |                 continue
36 |             sub_index, symbols = arr
37 |             symbol_str = "{}_{}\t{}\n".format(idx, sub_index, symbols)
38 |             symbols_lst.append(symbol_str)
39 | 
40 |     return symbols_lst
41 | 
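A hedged sketch of calling the frontend above. It requires the proprietary ttsfrd wheel and an extracted resource directory (path hypothetical); per the code, each returned entry has the form `"<idx>_<sub_index>\t<symbols>\n"`, with the built-in "F7" speaker tag replaced by the speaker you pass in.

```python
# Assumes the ttsfrd package is installed and resources are extracted;
# the resource path is a hypothetical placeholder.
from kantts.utils.ling_unit import text_to_mit_symbols

symbols = text_to_mit_symbols(["你好，世界。"], "resources/resource", "F7", lang="PinYin")
for line in symbols:
    print(line, end="")   # e.g. "0_0\t<symbol sequence>"
```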
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/cleaners.py:
--------------------------------------------------------------------------------
1 | """
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 | 
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 |     1. "english_cleaners" for English text
7 |     2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 |        the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 |     3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 |        the symbols in symbols.py to match your data).
11 | """
12 | 
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 | 
17 | 
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r"\s+")
20 | 
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [
23 |     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
24 |     for x in [
25 |         ("mrs", "misess"),
26 |         ("mr", "mister"),
27 |         ("dr", "doctor"),
28 |         ("st", "saint"),
29 |         ("co", "company"),
30 |         ("jr", "junior"),
31 |         ("maj", "major"),
32 |         ("gen", "general"),
33 |         ("drs", "doctors"),
34 |         ("rev", "reverend"),
35 |         ("lt", "lieutenant"),
36 |         ("hon", "honorable"),
37 |         ("sgt", "sergeant"),
38 |         ("capt", "captain"),
39 |         ("esq", "esquire"),
40 |         ("ltd", "limited"),
41 |         ("col", "colonel"),
42 |         ("ft", "fort"),
43 |     ]
44 | ]
45 | 
46 | 
47 | def expand_abbreviations(text):
48 |     for regex, replacement in _abbreviations:
49 |         text = re.sub(regex, replacement, text)
50 |     return text
51 | 
52 | 
53 | def expand_numbers(text):
54 |     return normalize_numbers(text)
55 | 
56 | 
57 | def lowercase(text):
58 |     return text.lower()
59 | 
60 | 
61 | def collapse_whitespace(text):
62 |     return re.sub(_whitespace_re, " ", text)
63 | 
64 | 
65 | def convert_to_ascii(text):
66 |     return unidecode(text)
67 | 
68 | 
69 | def basic_cleaners(text):
70 |     """Basic pipeline that lowercases and collapses whitespace without transliteration."""
71 |     text = lowercase(text)
72 |     text = collapse_whitespace(text)
73 |     return text
74 | 
75 | 
76 | def transliteration_cleaners(text):
77 |     """Pipeline for non-English text that transliterates to ASCII."""
78 |     text = convert_to_ascii(text)
79 |     text = lowercase(text)
80 |     text = collapse_whitespace(text)
81 |     return text
82 | 
83 | 
84 | def english_cleaners(text):
85 |     """Pipeline for English text, including number and abbreviation expansion."""
86 |     text = convert_to_ascii(text)
87 |     text = lowercase(text)
88 |     text = expand_numbers(text)
89 |     text = expand_abbreviations(text)
90 |     text = collapse_whitespace(text)
91 |     return text
92 | 
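An illustrative run of the English pipeline defined above (transliterate, lowercase, expand numbers, expand abbreviations, collapse whitespace):

```python
from kantts.utils.ling_unit.cleaners import english_cleaners

print(english_cleaners("Dr. Smith paid $2.50 on Jan 1st, 2022!"))
# -> "doctor smith paid two dollars, fifty cents on jan first, twenty twenty-two!"
```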
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/emotion_types.py:
--------------------------------------------------------------------------------
1 | emotion_types = [
2 |     "emotion_none",
3 |     "emotion_neutral",
4 |     "emotion_angry",
5 |     "emotion_disgust",
6 |     "emotion_fear",
7 |     "emotion_happy",
8 |     "emotion_sad",
9 |     "emotion_surprise",
10 |     "emotion_calm",
11 |     "emotion_gentle",
12 |     "emotion_relax",
13 |     "emotion_lyrical",
14 |     "emotion_serious",
15 |     "emotion_disgruntled",
16 |     "emotion_satisfied",
17 |     "emotion_disappointed",
18 |     "emotion_excited",
19 |     "emotion_anxiety",
20 |     "emotion_jealousy",
21 |     "emotion_hate",
22 |     "emotion_pity",
23 |     "emotion_pleasure",
24 |     "emotion_arousal",
25 |     "emotion_dominance",
26 |     "emotion_placeholder1",
27 |     "emotion_placeholder2",
28 |     "emotion_placeholder3",
29 |     "emotion_placeholder4",
30 |     "emotion_placeholder5",
31 |     "emotion_placeholder6",
32 |     "emotion_placeholder7",
33 |     "emotion_placeholder8",
34 |     "emotion_placeholder9",
35 | ]
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/lang_symbols.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | from kantts.preprocess.languages import languages
3 | import logging
4 | import os
5 | 
6 | syllable_flags = [
7 |     "s_begin",
8 |     "s_end",
9 |     "s_none",
10 |     "s_both",
11 |     "s_middle",
12 | ]
13 | 
14 | word_segments = [
15 |     "word_begin",
16 |     "word_end",
17 |     "word_middle",
18 |     "word_both",
19 |     "word_none",
20 | ]
21 | 
22 | LANGUAGES_DIR = os.path.join(
23 |     os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
24 |     "preprocess",
25 |     "languages",
26 | )
27 | 
28 | 
29 | def parse_phoneset(phoneset_file):
30 |     """Parse a phoneset file and return a list of symbols.
31 |     Args:
32 |         phoneset_file (str): Path to the phoneset file.
33 | 
34 |     Returns:
35 |         list: A list of phones.
36 |     """
37 |     ns = "{http://schemas.alibaba-inc.com/tts}"
38 | 
39 |     phone_lst = []
40 |     phoneset_root = ET.parse(phoneset_file).getroot()
41 |     for phone_node in phoneset_root.findall(ns + "phone"):
42 |         phone_lst.append(phone_node.find(ns + "name").text)
43 | 
44 |     for i in range(1, 5):
45 |         phone_lst.append("#{}".format(i))
46 | 
47 |     return phone_lst
48 | 
49 | 
50 | def parse_tonelist(tonelist_file):
51 |     """Parse a tonelist file and return a list of tones.
52 |     Args:
53 |         tonelist_file (str): Path to the tonelist file.
54 | 
55 |     Returns:
56 |         list: A list of tones.
57 |     """
58 |     tone_lst = []
59 |     with open(tonelist_file, "r") as f:
60 |         lines = f.readlines()
61 |     for line in lines:
62 |         tone = line.strip()
63 |         if tone != "":
64 |             tone_lst.append("tone{}".format(tone))
65 |         else:
66 |             tone_lst.append("tone_none")
67 | 
68 |     return tone_lst
69 | 
70 | 
71 | def get_language_symbols(language):
72 |     """Get symbols of a language.
73 |     Args:
74 |         language (str): Language name.
75 |     """
76 |     language_dict = languages.get(language, None)
77 |     if language_dict is None:
78 |         logging.error("Language %s not supported. Using PinYin as default", language)
79 |         language_dict = languages["PinYin"]
80 |         language = "PinYin"
81 | 
82 |     language_dir = os.path.join(LANGUAGES_DIR, language)
83 |     phoneset_file = os.path.join(language_dir, language_dict["phoneset_path"])
84 |     tonelist_file = os.path.join(language_dir, language_dict["tonelist_path"])
85 |     phones = parse_phoneset(phoneset_file)
86 |     tones = parse_tonelist(tonelist_file)
87 | 
88 |     return phones, tones, syllable_flags, word_segments
89 | 
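A small sketch of loading a language's symbol inventory through the helpers above; PinYin ships with the repo under kantts/preprocess/languages/, and unknown languages fall back to PinYin.

```python
from kantts.utils.ling_unit.lang_symbols import get_language_symbols

phones, tones, flags, segs = get_language_symbols("PinYin")
print(len(phones), tones[:2], flags[0], segs[0])
```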
--------------------------------------------------------------------------------
/kantts/utils/ling_unit/numbers.py:
--------------------------------------------------------------------------------
1 | import inflect
2 | import re
3 | 
4 | 
5 | _inflect = inflect.engine()
6 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
7 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
8 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
9 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
10 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
11 | _number_re = re.compile(r"[0-9]+")
12 | 
13 | 
14 | def _remove_commas(m):
15 |     return m.group(1).replace(",", "")
16 | 
17 | 
18 | def _expand_decimal_point(m):
19 |     return m.group(1).replace(".", " point ")
20 | 
21 | 
22 | def _expand_dollars(m):
23 |     match = m.group(1)
24 |     parts = match.split(".")
25 |     if len(parts) > 2:
26 |         return match + " dollars"  # Unexpected format
27 |     dollars = int(parts[0]) if parts[0] else 0
28 |     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29 |     if dollars and cents:
30 |         dollar_unit = "dollar" if dollars == 1 else "dollars"
31 |         cent_unit = "cent" if cents == 1 else "cents"
32 |         return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
33 |     elif dollars:
34 |         dollar_unit = "dollar" if dollars == 1 else "dollars"
35 |         return "%s %s" % (dollars, dollar_unit)
36 |     elif cents:
37 |         cent_unit = "cent" if cents == 1 else "cents"
38 |         return "%s %s" % (cents, cent_unit)
39 |     else:
40 |         return "zero dollars"
41 | 
42 | 
43 | def _expand_ordinal(m):
44 |     return _inflect.number_to_words(m.group(0))
45 | 
46 | 
47 | def _expand_number(m):
48 |     num = int(m.group(0))
49 |     if num > 1000 and num < 3000:
50 |         if num == 2000:
51 |             return "two thousand"
52 |         elif num > 2000 and num < 2010:
53 |             return "two thousand " + _inflect.number_to_words(num % 100)
54 |         elif num % 100 == 0:
55 |             return _inflect.number_to_words(num // 100) + " hundred"
56 |         else:
57 |             return _inflect.number_to_words(
58 |                 num, andword="", zero="oh", group=2
59 |             ).replace(", ", " ")
60 |     else:
61 |         return _inflect.number_to_words(num, andword="")
62 | 
63 | 
64 | def normalize_numbers(text):
65 |     text = re.sub(_comma_number_re, _remove_commas, text)
66 |     text = re.sub(_pounds_re, r"\1 pounds", text)
67 |     text = re.sub(_dollars_re, _expand_dollars, text)
68 |     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 |     text = re.sub(_ordinal_re, _expand_ordinal, text)
70 |     text = re.sub(_number_re, _expand_number, text)
71 |     return text
72 | 
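Illustrative inputs and outputs for normalize_numbers as defined above (currency, ordinal, and cardinal expansion):

```python
from kantts.utils.ling_unit.numbers import normalize_numbers

print(normalize_numbers("I owe $15.49, due on the 3rd."))
# -> "I owe fifteen dollars, forty-nine cents, due on the third."
print(normalize_numbers("£100"))
# -> "one hundred pounds"
```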
--------------------------------------------------------------------------------
/kantts/utils/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import subprocess
3 | 
4 | 
5 | def logging_to_file(log_file):
6 |     logger = logging.getLogger()
7 |     handler = logging.FileHandler(log_file)
8 |     formatter = logging.Formatter(
9 |         "%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
10 |         datefmt="%Y-%m-%d:%H:%M:%S",
11 |     )
12 |     handler.setFormatter(formatter)
13 |     logger.addHandler(handler)
14 |     logger.setLevel(logging.INFO)
15 | 
16 | 
17 | def get_git_revision_short_hash():
18 |     return (
19 |         subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
20 |         .decode("ascii")
21 |         .strip()
22 |     )
23 | 
24 | 
25 | def get_git_revision_hash():
26 |     return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()
27 | 
--------------------------------------------------------------------------------
/kantts/utils/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | 
3 | matplotlib.use("Agg")  # NOQA: E402
4 | try:
5 |     import matplotlib.pyplot as plt
6 | except ImportError:
7 |     raise ImportError("Please install matplotlib.")
8 | 
9 | 
10 | def plot_spectrogram(spectrogram):
11 |     fig, ax = plt.subplots(figsize=(12, 8))
12 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
13 |     plt.colorbar(im, ax=ax)
14 | 
15 |     fig.canvas.draw()
16 |     plt.close()
17 | 
18 |     return fig
19 | 
20 | 
21 | def plot_alignment(alignment, info=None):
22 |     fig, ax = plt.subplots()
23 |     im = ax.imshow(alignment, aspect="auto", origin="lower", interpolation="none")
24 |     fig.colorbar(im, ax=ax)
25 |     xlabel = "Input timestep"
26 |     if info is not None:
27 |         xlabel += "\t" + info
28 |     plt.xlabel(xlabel)
29 |     plt.ylabel("Output timestep")
30 |     fig.canvas.draw()
31 |     plt.close()
32 | 
33 |     return fig
34 | 
--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # How to run notebook examples?
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | autopep8
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | 
3 | version = "0.0.1"
4 | 
5 | with open("README.md", "r", encoding="utf-8") as readme_file:
6 |     README = readme_file.read()
7 | 
8 | setup(
9 |     name="kantts",
10 |     version=version,
11 |     url="https://github.com/AlibabaResearch/KAN-TTS",
12 |     author="Jin",
13 |     description="Alibaba DAMO Speech-Lab Text to Speech deep learning toolchain",
14 |     long_description=README,
15 |     long_description_content_type="text/markdown",
16 |     license="MIT",
17 |     # cython
18 |     # include_dirs=numpy.get_include(),
19 |     # ext_modules=find_cython_extensions(),
20 |     # package
21 |     include_package_data=True,
22 |     packages=find_packages(include=["kantts*"]),
23 |     project_urls={
24 |         "Documentation": "https://github.com/AlibabaResearch/KAN-TTS/wiki",
25 |         "Tracker": "",
26 |         "Repository": "https://github.com/AlibabaResearch/KAN-TTS",
27 |         "Discussions": "",
28 |     },
29 |     python_requires=">=3.7.0, <3.9",
30 |     classifiers=[
31 |         "Programming Language :: Python",
32 |         "Programming Language :: Python :: 3",
33 |         "Programming Language :: Python :: 3.7",
34 |         "Programming Language :: Python :: 3.8",
35 |         "Development Status :: 3 - Alpha",
36 |         "Intended Audience :: Science/Research",
37 |         "Intended Audience :: Developers",
38 |         "Operating System :: POSIX :: Linux",
39 |         "License :: OSI Approved :: MIT License",
40 |         "Topic :: Software Development",
41 |         "Topic :: Software Development :: Libraries :: Python Modules",
42 |         "Topic :: Multimedia :: Sound/Audio :: Speech",
43 |         "Topic :: Multimedia :: Sound/Audio",
44 |         "Topic :: Multimedia",
45 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
46 |     ],
47 |     zip_safe=False,
48 | )
49 | 
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modelscope/KAN-TTS/66202d052189a3dcca5f75c718c47a7a5cc68116/test/__init__.py
--------------------------------------------------------------------------------