├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── TRAINING.md ├── fix_pt.py ├── openphonemizer └── __init__.py ├── setup.py ├── train.py └── training └── config.yml /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | 163 | 164 | en.txt 165 | out.tsv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2024 mrfakename, NeuralVox, OpenPhonemizer Contributors 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted (subject to the limitations in the disclaimer 8 | below) provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY 22 | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 23 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 25 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 26 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 29 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 30 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | POSSIBILITY OF SUCH DAMAGE. 
33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenPhonemizer 2 | 3 | **Code / [Audio Samples](https://neuralvox.github.io/OpenPhonemizer/) / [Models](https://huggingface.co/openphonemizer/ckpt) / [Live Demo](https://huggingface.co/spaces/openphonemizer/PhonemizerHub) / [Dataset](https://huggingface.co/datasets/mrfakename/ipa-phonemes-word-pairs)** 4 | 5 | A permissively licensed, open sourced, local IPA Phonemizer (G2P) powered by deep learning. This Phonemizer attempts to replicate the `espeak` Phonemizer while remaining permissively-licensed. 6 | 7 | OpenPhonemizer is designed to be a drop-in replacement for espeak's phonemizer. This means you can use OpenPhonemizer in your software even if your software is not GPL licensed. 8 | 9 | OpenPhonemizer is heavily based on the amazing [DeepPhonemizer](https://github.com/as-ideas/DeepPhonemizer). The main changes are the model checkpoints, which more closely resemble `espeak`'s phonemizer. 10 | 11 | Optional GPL-licensed portions are available [here](https://github.com/NeuralVox/OpenPhonemizer-GPL). 12 | 13 | ## Features 14 | 15 | * Permissively licensed & open source 16 | * Fast & efficient 17 | * Works well with TTS models that depend on phonemizer or espeak 18 | * Automatic GPU acceleration (CUDA/MPS) if available 19 | 20 | ## Project 21 | 22 | * Project status: Alpha 23 | * Supported languages: English (more coming soon! What languages do you want? Let me know!) 
24 | 25 | ## Installation 26 | 27 | Easily install OpenPhonemizer: 28 | 29 | ```bash 30 | pip install -U openphonemizer 31 | ``` 32 | 33 | Or, install the latest version from Git: 34 | 35 | ```bash 36 | pip install -U "openphonemizer @ git+https://github.com/NeuralVox/OpenPhonemizer" 37 | ``` 38 | 39 | ## Usage 40 | 41 | ### OpenPhonemizer 42 | 43 | ```python 44 | from openphonemizer import OpenPhonemizer 45 | phonemizer = OpenPhonemizer() 46 | # Or specify a custom checkpoint path: OpenPhonemizer('model.pt') 47 | phonemizer('test') 48 | phonemizer('hello this is a test') 49 | ``` 50 | 51 | Please note that by default, OpenPhonemizer loads a built-in dictionary of words/phonemes. Because storage is quite inefficient, the model is ~100MB larger and uses more memory, however it is _much_ faster. If you're low on VRAM, you can either run the model exclusively on CPU (`disable_gpu=True`) or load a model without a dictionary. 52 | 53 | **Load without dictionary:** 54 | 55 | ```python 56 | from cached_path import cached_path 57 | from openphonemizer import OpenPhonemizer 58 | phonemizer = OpenPhonemizer(str(cached_path('hf://openphonemizer/ckpt/best_model_no_optim.pt'))) # add disable_gpu=True to run on CPU only 59 | phonemizer('test') 60 | phonemizer('hello this is a test') 61 | ``` 62 | 63 | **Use autoregressive model:** 64 | 65 | > [!CAUTION] 66 | > OpenPhonemizer had a **bug** in the training script that caused significantly degraded performance. The autoregressive model has not yet been fixed. For now, please use the forward model. 67 | 68 | NEW: An autoregressive model is now available. The autoregressive model is more accurate but slightly slower. 
To use the autoregressive model: 69 | 70 | ```python 71 | OpenPhonemizer(str(cached_path('hf://openphonemizer/autoreg-ckpt/best_model.pt'))) 72 | ``` 73 | 74 | ## Evaluation 75 | 76 | We introduce PhonemizerBench, a benchmark to evaluate the similarity of alternate Phonemizers to `espeak` (this benchmark measures against `espeak`, assuming its score is 100). 77 | 78 | | Phonemizer | Score (Run 1) | Score (Run 2) | Score (Run 3) | Average | 79 | | ----------------------------- | ------------- | ------------- | ------------- | --------- | 80 | | Gruut | 75.08 | 75.54 | 73.72 | 74.78 | 81 | | DeepPhonemizer | 85.24 | 85.03 | 84.64 | 84.97 | 82 | | G2P_EN | 86.16 | 86.28 | 85.74 | 86.06 | 83 | | OpenPhonemizer | 93.64 | 93.54 | 93.38 | 93.52 | 84 | | OpenPhonemizer Autoregressive | **93.74** | **93.59** | **93.67** | **93.67** | 85 | 86 | ## Todo 87 | 88 | - [x] Train autoregressive model 89 | - [x] Allow disabling GPU usage 90 | - [ ] Multilingual support (any requests?) 91 | 92 | ## License 93 | 94 | OpenPhonemizer is open source software. You may use it under the BSD-3-Clause Clear license found in the LICENSE file. 95 | 96 | Please note that OpenPhonemizer depends on software under different licenses; it is your responsibility when redistributing or modifying OpenPhonemizer to comply with these licenses (notably LGPL). 97 | 98 | *By contributing to this repository, you grant the author the permission to change the license in the future at their sole discretion or offer different licenses to other individuals.* 99 | 100 | **NOTE:** Model weights may be licensed under different licenses. Please make sure to check all model weights for licenses. 101 | 102 | ## Credits 103 | 104 | OpenPhonemizer is essentially a wrapper (using different pre-trained models) around the amazing [Deep Phonemizer](https://github.com/as-ideas/DeepPhonemizer) package created by [Christian Schäfer](https://github.com/cschaefer26). 
105 | 106 | OpenPhonemizer uses [num2words](https://github.com/savoirfairelinux/num2words) to read out large numbers and [cached_path](https://github.com/allenai/cached_path) from Allen AI for caching models. 107 | 108 | OpenPhonemizer models were trained by [mrfakename](https://twitter.com/realmrfakename). 109 | -------------------------------------------------------------------------------- /TRAINING.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | (Some code borrowed from DeepPhonemizer) 4 | 5 | Assuming you're using Jupyter: 6 | 7 | ``` 8 | !pip install deep-phonemizer 9 | ``` 10 | 11 | ```python 12 | !wget https://huggingface.co/datasets/mrfakename/ipa-phonemes-word-pairs/raw/main/out.tsv 13 | with open('out.tsv', 'r', encoding='utf-8') as f: 14 | lines = f.readlines() 15 | 16 | lines = [l.replace(' ', '').replace('\n', '') for l in lines] 17 | splits = [l.split('\t') for l in lines] 18 | train_data = [('en_us', s[0], s[1]) for s in splits if len(s)==2] 19 | for d in train_data[:10000:1000]: 20 | print(d) 21 | ``` 22 | 23 | ```python 24 | from dp.utils.io import read_config, save_config 25 | import dp 26 | import os 27 | 28 | config_file = 'config.yml' 29 | config = read_config(config_file) 30 | config['training']['epochs'] = 10 31 | config['training']['warmup_steps'] = 100 32 | config['training']['generate_steps'] = 500 33 | config['training']['validate_steps'] = 500 34 | save_config(config, 'config.yaml') 35 | for k, v in config.items(): 36 | print(f'{k} {v}') 37 | ``` 38 | 39 | ``` 40 | %load_ext tensorboard 41 | %tensorboard --logdir /content/checkpoints 42 | ``` 43 | 44 | ```python 45 | from dp.preprocess import preprocess 46 | from dp.train import train 47 | preprocess(config_file='config.yaml', train_data=train_data) 48 | train(rank=0, num_gpus=1, config_file='config.yaml') 49 | ``` 50 | 51 | ```python 52 | from dp.phonemizer import Phonemizer 53 | 54 | phonemizer = 
"""One-off patch script: override a single entry in a checkpoint's phoneme dictionary.

Loads the checkpoint at ``MODEL_PATH``, sets the ``en_us`` dictionary entry for
the word ``'a'`` to ``'ɐ'``, prints the resulting dictionary and writes the
checkpoint back to the same path.
"""
import torch
from cached_path import cached_path

# Checkpoint that is both read and overwritten by this script.
MODEL_PATH = 'model_step_20k_fixed.pt'
# Word/phoneme pair dataset (tab-separated), fetched through cached_path.
DATASET_URL = 'https://huggingface.co/datasets/mrfakename/ipa-phonemes-word-pairs/raw/main/out.tsv'

x = torch.load(MODEL_PATH)

# The dataset contains IPA characters, so decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(str(cached_path(DATASET_URL)), encoding='utf-8') as f:
    lines = [l.replace(' ', '').replace('\n', '') for l in f.readlines()]
splits = [l.split('\t') for l in lines]

# Bulk import of the dataset into the dictionary is intentionally disabled;
# `splits` is only consumed by this commented-out loop.
# for z in splits:
#     x['phoneme_dict']['en_us'][z[0]] = z[1]

x['phoneme_dict']['en_us']['a'] = 'ɐ'
print(x['phoneme_dict']['en_us'])

# Write back to the same checkpoint path that was loaded above.
torch.save(x, MODEL_PATH)
14 | 15 | # * Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | 19 | # * Neither the name of the copyright holder nor the names of its 20 | # contributors may be used to endorse or promote products derived from this 21 | # software without specific prior written permission. 22 | 23 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY 24 | # THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 25 | # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 27 | # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 31 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 32 | # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | # POSSIBILITY OF SUCH DAMAGE. 
class OpenPhonemizer:
    """Callable wrapper around a DeepPhonemizer ``Phonemizer`` checkpoint.

    Runs of digits in the input are spelled out with ``num2words`` before the
    text is handed to the wrapped phonemizer with ``lang='en_us'``.
    """

    # Checkpoint resolved through ``cached_path`` when the caller passes none.
    _DEFAULT_CHECKPOINT = 'hf://openphonemizer/ckpt/best_model.pt'

    def __init__(self, model_checkpoint=None, disable_gpu=False):
        """Load the phonemizer checkpoint onto the selected device.

        Args:
            model_checkpoint: Path to a checkpoint file. When falsy, the
                default checkpoint is fetched via ``cached_path``.
            disable_gpu: When true, always load the model on ``'cpu'``.
        """
        device = self._select_device(disable_gpu)
        if not model_checkpoint:
            model_checkpoint = str(cached_path(self._DEFAULT_CHECKPOINT))
        self.phonemizer = Phonemizer.from_checkpoint(model_checkpoint, device=device)
        # Matches each maximal run of ASCII/Unicode decimal digits.
        self.pattern = re.compile(r'\d+')

    @staticmethod
    def _select_device(disable_gpu=False):
        """Return the torch device name to use: 'cpu', 'cuda' or 'mps'.

        Priority is unchanged from the original inline logic: ``disable_gpu``
        wins, then MPS, then CUDA, then CPU.
        """
        if disable_gpu:
            return 'cpu'
        # Guard the attribute lookup: torch builds without the MPS backend do
        # not expose ``torch.backends.mps`` at all.
        mps = getattr(torch.backends, 'mps', None)
        if mps is not None and mps.is_available():
            return 'mps'
        if torch.cuda.is_available():
            return 'cuda'
        return 'cpu'

    def _num_process(self, text):
        """Replace every run of digits in ``text`` with its spelled-out form.

        Each match is substituted in a single left-to-right pass, so a short
        number can no longer corrupt a longer one that contains it
        (e.g. ``'1 and 11'`` used to become ``'one and oneone'``).
        """
        return self.pattern.sub(lambda m: num2words(int(m.group())), text)

    def __call__(self, text, stress=True):
        """Phonemize ``text`` and return the phonemizer's output.

        Args:
            text: Input text.
            stress: When false, the primary stress mark ``'ˈ'`` is removed
                from the output.

        Returns:
            The output of the wrapped phonemizer, optionally without ``'ˈ'``.
        """
        # Normalize periods so each one is preceded by exactly one inserted
        # space (' .' -> '.' first, then '.' -> ' .').
        spaced = text.replace(' .', '.').replace('.', ' .')
        out = self.phonemizer(self._num_process(spaced), lang='en_us')
        if not stress:
            # NOTE(review): only the primary stress mark is stripped; the
            # secondary mark 'ˌ' is left in place — confirm this is intended.
            out = out.replace('ˈ', '')
        return out
14 | 15 | # * Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | 19 | # * Neither the name of the copyright holder nor the names of its 20 | # contributors may be used to endorse or promote products derived from this 21 | # software without specific prior written permission. 22 | 23 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY 24 | # THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 25 | # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 27 | # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 31 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 32 | # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | # POSSIBILITY OF SUCH DAMAGE. 
from pathlib import Path

from setuptools import setup, find_packages

# Read the README next to this file, explicitly as UTF-8: it contains non-ASCII
# characters, and a bare open() would use the locale encoding and leave the
# file handle unclosed.
long_description = (Path(__file__).resolve().parent / "README.md").read_text(encoding="utf-8")

setup(
    name="openphonemizer",
    version="0.1.2",
    packages=find_packages(),
    author="mrfakename",
    author_email="me@mrfake.name",
    description="Permissively licensed, open sourced, local IPA Phonemizer (G2P) powered by deep learning.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/NeuralVox/OpenPhonemizer",
    license="BSD-3-Clause-Clear",
    classifiers=[
        "License :: OSI Approved :: BSD License",
    ],
    install_requires=[
        "deep-phonemizer",
        "cached-path",
        "num2words",
    ],
)
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 25 | # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 27 | # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 29 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 30 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 31 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 32 | # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | # POSSIBILITY OF SUCH DAMAGE. 35 | -------------------------------------------------------------------------------- /training/config.yml: -------------------------------------------------------------------------------- 1 | paths: 2 | checkpoint_dir: checkpoints 3 | data_dir: datasets 4 | 5 | preprocessing: 6 | languages: ["en_us"] 7 | text_symbols: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 8 | phoneme_symbols: 9 | [ 10 | "a", 11 | "b", 12 | "d", 13 | "e", 14 | "f", 15 | "g", 16 | "h", 17 | "i", 18 | "j", 19 | "k", 20 | "l", 21 | "m", 22 | "n", 23 | "o", 24 | "p", 25 | "r", 26 | "s", 27 | "t", 28 | "u", 29 | "v", 30 | "w", 31 | "x", 32 | "y", 33 | "z", 34 | "æ", 35 | "ç", 36 | "ð", 37 | "ø", 38 | "ŋ", 39 | "œ", 40 | "ɐ", 41 | "ɑ", 42 | "ɔ", 43 | "ə", 44 | "ɛ", 45 | "ɜ", 46 | "ɝ", 47 | "ɹ", 48 | "ɚ", 49 | "ɡ", 50 | "ɪ", 51 | "ʁ", 52 | "ʃ", 53 | "ʊ", 54 | "ʌ", 55 | "ʏ", 56 | "ʒ", 57 | "ʔ", 58 | "ˈ", 59 | "ˌ", 60 | "ː", 61 | "̃", 62 | "̍", 63 | "̥", 64 | "̩", 65 | "̯", 66 | "͡", 67 | "θ", 68 | "'", 69 | "ɾ", 70 | "ᵻ" 71 | ] 72 | char_repeats: 3 73 | lowercase: true 74 | n_val: 5000 75 | model: 76 | type: "transformer" 77 | d_model: 512 78 | d_fft: 1024 79 | layers: 6 80 | 
dropout: 0.1 81 | heads: 4 82 | 83 | training: 84 | learning_rate: 0.0001 85 | warmup_steps: 10000 86 | scheduler_plateau_factor: 0.5 87 | scheduler_plateau_patience: 10 88 | batch_size: 64 89 | batch_size_val: 64 90 | epochs: 15 91 | generate_steps: 92 | 10000 93 | validate_steps: 94 | 10000 95 | checkpoint_steps: 10000 96 | n_generate_samples: 10 97 | store_phoneme_dict_in_model: 98 | true 99 | ddp_backend: "nccl" 100 | ddp_host: "localhost" 101 | ddp_port: "12355" 102 | --------------------------------------------------------------------------------