├── .gitignore ├── LICENSE ├── README.md ├── data └── 20k.txt ├── download_tts_models.py ├── generate_clips.py ├── models ├── vits │ ├── LICENSE │ ├── README.md │ ├── attentions.py │ ├── commons.py │ ├── configs │ │ ├── ljs_base.json │ │ ├── ljs_nosdp.json │ │ └── vctk_base.json │ ├── data_utils.py │ ├── filelists │ │ ├── ljs_audio_text_test_filelist.txt │ │ ├── ljs_audio_text_test_filelist.txt.cleaned │ │ ├── ljs_audio_text_train_filelist.txt │ │ ├── ljs_audio_text_train_filelist.txt.cleaned │ │ ├── ljs_audio_text_val_filelist.txt │ │ ├── ljs_audio_text_val_filelist.txt.cleaned │ │ ├── vctk_audio_sid_text_test_filelist.txt │ │ ├── vctk_audio_sid_text_test_filelist.txt.cleaned │ │ ├── vctk_audio_sid_text_train_filelist.txt │ │ ├── vctk_audio_sid_text_train_filelist.txt.cleaned │ │ ├── vctk_audio_sid_text_val_filelist.txt │ │ └── vctk_audio_sid_text_val_filelist.txt.cleaned │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── modules.py │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ ├── preprocess.py │ ├── pretrained_models │ │ └── .gitkeep │ ├── requirements.txt │ ├── resources │ │ ├── fig_1a.png │ │ ├── fig_1b.png │ │ └── training.png │ ├── text │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── cleaners.py │ │ └── symbols.py │ ├── train.py │ ├── train_ms.py │ ├── transforms.py │ └── utils.py └── waveglow │ ├── Dockerfile │ ├── MANIFEST.in │ ├── README.md │ ├── TextToSpeechModel │ ├── __init__.py │ ├── artifacts │ │ ├── __init__.py │ │ ├── cmudict_dictionary │ │ ├── heteronyms │ │ └── libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt │ ├── audio_processing.py │ ├── bentoml.yml │ ├── data.py │ ├── flowtron.py │ ├── glow.py │ ├── text │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── acronyms.py │ │ ├── cleaners.py │ │ ├── cmudict.py │ │ ├── cmudict_dictionary │ │ ├── datestime.py │ │ ├── heteronyms │ │ ├── numbers.py │ │ └── symbols.py │ ├── text_to_speech.py │ └── waveglow_artifact.py │ ├── bentoml-init.sh │ ├── bentoml.yml │ ├── docker-entrypoint.sh │ ├── environment.yml │ ├── python_version │ ├── requirements.txt │ └── setup.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Large ML models 132 | *.pt 133 | *.pth 134 | *.onnx -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Readme 2 | 3 | This repository contains text-to-speech (TTS) models and utilities designed to produce synthetic training datasets for other speech-related models (e.g., [openWakeWord](https://github.com/dscripka/openWakeWord)). 4 | 5 | It includes two open-source TTS models that I have found useful for generating synthetic speech. Specifically: 6 | 7 | - [Nvidia Waveglow](https://github.com/NVIDIA/waveglow) 8 | - [VITS](https://github.com/jaywalnut310/vits) 9 | 10 | Note that the code in this repository varies greatly in quality and structure, as it was derived from multiple sources. It is primarily meant for research and experimentation, and you are encouraged to make changes and updates before relying on this code for production purposes. Also, these models are trained only on English TTS datasets (VCTK and LibriTTS), and will not produce accurate speech for other languages. 11 | 12 | # Installation 13 | 14 | First, clone this repository: 15 | 16 | ```bash 17 | git clone https://github.com/dscripka/synthetic_speech_dataset_generation 18 | ``` 19 | 20 | Then install the requirements into your virtual environment of choice: 21 | 22 | ```bash 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | If installing in an environment with GPUs available, you will need to update `requirements.txt` to include versions of Torch compatible with your GPU configuration. Note that while it is possible to generate data on CPUs only, the WAVEGLOW model will be very slow (e.g., 5-10 seconds per generation). The VITS model is somewhat faster on CPU (~1-3 seconds per generation), but for the large amounts of data that are often needed to train robust models, a GPU is *strongly* recommended. 27 | 28 | The TTS models themselves are not stored in this repository and need to be downloaded separately. There is an included script that will download the files and place them in the appropriate location within the repository. 29 | 30 | ```bash 31 | python download_tts_models.py 32 | ``` 33 | 34 | To test that everything is working correctly after these steps, run the following command and listen to the output in the `generated_clips` directory that is created: 35 | 36 | ```bash 37 | python generate_clips.py --model VITS --text "This is some test speech" --N 1 --output_dir generated_clips 38 | ``` 39 | 40 | # Usage 41 | 42 | The primary way to generate synthetic speech is via the CLI in `generate_clips.py`. To see all of the possible arguments, use `python generate_clips.py --help`. 43 | 44 | As a quick example of usage, the following command will generate 5000 clips of the phrase "turn on the office lights" using the Nvidia Waveglow model (on a GPU) trained on the LibriTTS dataset. Additionally, the `--max_per_speaker` argument will limit the number of generations for each of the ~2300 LibriTTS training voices to 1; after that limit is reached, a random voice is created by [spherical interpolation](https://en.wikipedia.org/wiki/Slerp) of random speaker embeddings (see the sketch below for the interpolation idea). 45 | 46 | ``` 47 | python generate_clips.py \ 48 | --model WAVEGLOW \ 49 | --enable_gpu \ 50 | --text "turn on the office lights" \ 51 | --N 5000 \ 52 | --max_per_speaker 1 \ 53 | --output_dir /path/to/output/directory 54 | ```
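For intuition, here is a minimal sketch of the spherical interpolation ("slerp") idea used to blend two speaker embeddings into a new synthetic voice. The `slerp` helper and the 256-dimensional embedding size are illustrative assumptions, not the actual implementation in `generate_clips.py`:

```python
import numpy as np

def slerp(t, v0, v1):
    # Spherical linear interpolation between two embedding vectors (illustrative)
    v0_n = v0 / np.linalg.norm(v0)
    v1_n = v1 / np.linalg.norm(v1)
    theta = np.arccos(np.clip(np.dot(v0_n, v1_n), -1.0, 1.0))  # angle between voices
    if np.isclose(theta, 0.0):
        return (1 - t) * v0 + t * v1  # nearly parallel: plain lerp is fine
    return (np.sin((1 - t) * theta) * v0 + np.sin(t * theta) * v1) / np.sin(theta)

# Blend two random speaker embeddings into a new "voice"
# (the 256-dim embedding size is an assumption for illustration)
rng = np.random.default_rng(0)
emb_a, emb_b = rng.standard_normal(256), rng.standard_normal(256)
new_voice = slerp(0.5, emb_a, emb_b)
```

Intermediate values of `t` trace the arc between the two voices on the embedding hypersphere, which tends to keep interpolated embeddings closer to the region the model saw during training than plain linear interpolation would.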
55 | 56 | # License 57 | 58 | The `generate_clips.py` code in this repository is licensed under Apache 2.0. The included TTS models (and the associated code from the source repos) have their own licenses, and you are strongly encouraged to review the original repositories to determine whether the license is appropriate for a given use-case. 59 | 60 | - [Nvidia Waveglow](https://github.com/NVIDIA/waveglow) 61 | - [VITS](https://github.com/jaywalnut310/vits) -------------------------------------------------------------------------------- /download_tts_models.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 David Scripka. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Imports 16 | import functools 17 | import pathlib 18 | import shutil 19 | import requests 20 | import os 21 | from tqdm.auto import tqdm 22 | 23 | # Helper function to download files (from https://stackoverflow.com/a/63831344) 24 | def download(url, filename): 25 | r = requests.get(url, stream=True, allow_redirects=True) 26 | if r.status_code != 200: 27 | r.raise_for_status() # Raises HTTPError for 4xx/5xx status codes 28 | raise RuntimeError(f"Request to {url} returned status code {r.status_code}") # Covers any other non-200 status 29 | file_size = int(r.headers.get('Content-Length', 0)) 30 | 31 | path = pathlib.Path(filename).expanduser().resolve() 32 | path.parent.mkdir(parents=True, exist_ok=True) 33 | 34 | desc = "(Unknown total file size)" if file_size == 0 else "" 35 | r.raw.read = functools.partial(r.raw.read, decode_content=True) # Decompress if needed 36 | with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw: 37 | with path.open("wb") as f: 38 | shutil.copyfileobj(r_raw, f) 39 | 40 | return path 41 | 42 | # Download files 43 | print("Downloading TTS models...\n") 44 | vits_model = "https://f002.backblazeb2.com/file/openwakeword-resources/tts_models/pretrained_vctk.pth" 45 | waveglow_model = "https://f002.backblazeb2.com/file/openwakeword-resources/tts_models/waveglow_256channels_universal_v5.pt" 46 | flowtron_libritts_model = "https://f002.backblazeb2.com/file/openwakeword-resources/tts_models/flowtron_libritts2p3k.pt" 47 | 48 | download(vits_model, vits_model.split("/")[-1]) 49 | download(waveglow_model, waveglow_model.split("/")[-1]) 50 | download(flowtron_libritts_model, flowtron_libritts_model.split("/")[-1]) 51 | 52 | # Move model files to correct locations 53 | print("\nMoving model files.....") 54 | shutil.move(vits_model.split("/")[-1], os.path.join("models", "vits", "pretrained_models", vits_model.split("/")[-1])) 55 | shutil.move(waveglow_model.split("/")[-1], os.path.join("models", "waveglow", "TextToSpeechModel", "artifacts", waveglow_model.split("/")[-1])) 56 | shutil.move(flowtron_libritts_model.split("/")[-1], os.path.join("models", "waveglow", "TextToSpeechModel", "artifacts", flowtron_libritts_model.split("/")[-1])) 57 | print("Done!") -------------------------------------------------------------------------------- /models/vits/LICENSE:
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jaehyeon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/vits/README.md: -------------------------------------------------------------------------------- 1 | # VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech 2 | 3 | ### Jaehyeon Kim, Jungil Kong, and Juhee Son 4 | 5 | In our recent [paper](https://arxiv.org/abs/2106.06103), we propose VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. 6 | 7 | Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on LJ Speech, a single-speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth. 8 | 9 | Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio samples. 10 | 11 | We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing). 12 | 13 | **Update note**: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing). 14 |
15 | <table> 16 | <tr> 17 | <th>VITS at training</th> 18 | <th>VITS at inference</th> 19 | </tr> 20 | <tr> 21 | <td><img src="resources/fig_1a.png" alt="VITS at training"></td> 22 | <td><img src="resources/fig_1b.png" alt="VITS at inference"></td> 23 | </tr> 24 | </table>
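As a quick orientation before the setup steps below, here is a minimal single-speaker inference sketch in the spirit of the repository's `inference.ipynb`. The checkpoint path and the synthesis parameters are assumptions, and the multi-speaker VCTK checkpoint additionally requires a `sid` speaker-id tensor when calling `infer`:

```python
import torch
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Build the synthesizer from a config and load a checkpoint (path is an assumption)
hps = utils.get_hparams_from_file("configs/ljs_base.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
net_g.eval()
utils.load_checkpoint("pretrained_models/pretrained_ljs.pth", net_g, None)

# Text -> symbol ids, with blanks interspersed as in training (add_blank: true)
seq = text_to_sequence("This is a test.", hps.data.text_cleaners)
if hps.data.add_blank:
    seq = commons.intersperse(seq, 0)
x = torch.LongTensor(seq).unsqueeze(0)
x_lengths = torch.LongTensor([x.size(1)])

with torch.no_grad():
    audio = net_g.infer(x, x_lengths, noise_scale=0.667,
                        noise_scale_w=0.8, length_scale=1.0)[0][0, 0].numpy()
# `audio` is a float waveform at hps.data.sampling_rate (22050 Hz)
```

Note that the `english_cleaners2` text cleaner used by these configs needs espeak/phonemizer installed, per the pre-requisites below.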
25 | 26 | 27 | ## Pre-requisites 28 | 0. Python >= 3.6 29 | 0. Clone this repository 30 | 0. Install python requirements. Please refer to [requirements.txt](requirements.txt) 31 | 1. You may need to install espeak first: `apt-get install espeak` 32 | 0. Download datasets 33 | 1. Download and extract the LJ Speech dataset, then rename or create a link to the dataset folder: `ln -s /path/to/LJSpeech-1.1/wavs DUMMY1` 34 | 1. For the multi-speaker setting, download and extract the VCTK dataset, and downsample wav files to 22050 Hz. Then rename or create a link to the dataset folder: `ln -s /path/to/VCTK-Corpus/downsampled_wavs DUMMY2` 35 | 0. Build Monotonic Alignment Search and run preprocessing if you use your own datasets. 36 | ```sh 37 | # Cython-version Monotonic Alignment Search 38 | cd monotonic_align 39 | python setup.py build_ext --inplace 40 | 41 | # Preprocessing (g2p) for your own datasets. Preprocessed phonemes for LJ Speech and VCTK have already been provided. 42 | # python preprocess.py --text_index 1 --filelists filelists/ljs_audio_text_train_filelist.txt filelists/ljs_audio_text_val_filelist.txt filelists/ljs_audio_text_test_filelist.txt 43 | # python preprocess.py --text_index 2 --filelists filelists/vctk_audio_sid_text_train_filelist.txt filelists/vctk_audio_sid_text_val_filelist.txt filelists/vctk_audio_sid_text_test_filelist.txt 44 | ``` 45 | 46 | 47 | ## Training Example 48 | ```sh 49 | # LJ Speech 50 | python train.py -c configs/ljs_base.json -m ljs_base 51 | 52 | # VCTK 53 | python train_ms.py -c configs/vctk_base.json -m vctk_base 54 | ``` 55 | 56 | 57 | ## Inference Example 58 | See [inference.ipynb](inference.ipynb) 59 | -------------------------------------------------------------------------------- /models/vits/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | from modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 15 | super().__init__() 16 | self.hidden_channels = hidden_channels 17 | self.filter_channels = filter_channels 18 | self.n_heads = n_heads 19 | self.n_layers = n_layers 20 | self.kernel_size = kernel_size 21 | self.p_dropout = p_dropout 22 | self.window_size = window_size 23 | 24 | self.drop = nn.Dropout(p_dropout) 25 | self.attn_layers = nn.ModuleList() 26 | self.norm_layers_1 = nn.ModuleList() 27 | self.ffn_layers = nn.ModuleList() 28 | self.norm_layers_2 = nn.ModuleList() 29 | for i in range(self.n_layers): 30 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 31 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 32 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 33 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 34 | 35 | def forward(self, x, x_mask): 36 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 37 | x = x * x_mask 38 | for i in range(self.n_layers): 39 | y = self.attn_layers[i](x, x, attn_mask) 40 | y = self.drop(y) 41 | x = self.norm_layers_1[i](x + y) 42 | 43 | y = self.ffn_layers[i](x, x_mask) 44 | y = self.drop(y) 45 | x = self.norm_layers_2[i](x + y) 46 | x = x * x_mask 47 | return x 48 | 49 | 50 | class
Decoder(nn.Module): 51 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 52 | super().__init__() 53 | self.hidden_channels = hidden_channels 54 | self.filter_channels = filter_channels 55 | self.n_heads = n_heads 56 | self.n_layers = n_layers 57 | self.kernel_size = kernel_size 58 | self.p_dropout = p_dropout 59 | self.proximal_bias = proximal_bias 60 | self.proximal_init = proximal_init 61 | 62 | self.drop = nn.Dropout(p_dropout) 63 | self.self_attn_layers = nn.ModuleList() 64 | self.norm_layers_0 = nn.ModuleList() 65 | self.encdec_attn_layers = nn.ModuleList() 66 | self.norm_layers_1 = nn.ModuleList() 67 | self.ffn_layers = nn.ModuleList() 68 | self.norm_layers_2 = nn.ModuleList() 69 | for i in range(self.n_layers): 70 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 71 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 72 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 73 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 74 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 75 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 76 | 77 | def forward(self, x, x_mask, h, h_mask): 78 | """ 79 | x: decoder input 80 | h: encoder output 81 | """ 82 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 83 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 84 | x = x * x_mask 85 | for i in range(self.n_layers): 86 | y = self.self_attn_layers[i](x, x, self_attn_mask) 87 | y = self.drop(y) 88 | x = self.norm_layers_0[i](x + y) 89 | 90 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 91 | y = self.drop(y) 92 | x = self.norm_layers_1[i](x + y) 93 | 94 | y = self.ffn_layers[i](x, x_mask) 95 | y = self.drop(y) 96 | x = self.norm_layers_2[i](x + y) 97 | x = x * x_mask 98 | return x 99 | 100 | 101 | class MultiHeadAttention(nn.Module): 102 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 103 | super().__init__() 104 | assert channels % n_heads == 0 105 | 106 | self.channels = channels 107 | self.out_channels = out_channels 108 | self.n_heads = n_heads 109 | self.p_dropout = p_dropout 110 | self.window_size = window_size 111 | self.heads_share = heads_share 112 | self.block_length = block_length 113 | self.proximal_bias = proximal_bias 114 | self.proximal_init = proximal_init 115 | self.attn = None 116 | 117 | self.k_channels = channels // n_heads 118 | self.conv_q = nn.Conv1d(channels, channels, 1) 119 | self.conv_k = nn.Conv1d(channels, channels, 1) 120 | self.conv_v = nn.Conv1d(channels, channels, 1) 121 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 122 | self.drop = nn.Dropout(p_dropout) 123 | 124 | if window_size is not None: 125 | n_heads_rel = 1 if heads_share else n_heads 126 | rel_stddev = self.k_channels**-0.5 127 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 128 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 129 | 130 | nn.init.xavier_uniform_(self.conv_q.weight) 131 | 
nn.init.xavier_uniform_(self.conv_k.weight) 132 | nn.init.xavier_uniform_(self.conv_v.weight) 133 | if proximal_init: 134 | with torch.no_grad(): 135 | self.conv_k.weight.copy_(self.conv_q.weight) 136 | self.conv_k.bias.copy_(self.conv_q.bias) 137 | 138 | def forward(self, x, c, attn_mask=None): 139 | q = self.conv_q(x) 140 | k = self.conv_k(c) 141 | v = self.conv_v(c) 142 | 143 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 144 | 145 | x = self.conv_o(x) 146 | return x 147 | 148 | def attention(self, query, key, value, mask=None): 149 | # reshape [b, d, t] -> [b, n_h, t, d_k] 150 | b, d, t_s, t_t = (*key.size(), query.size(2)) 151 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 152 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 153 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 154 | 155 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 156 | if self.window_size is not None: 157 | assert t_s == t_t, "Relative attention is only available for self-attention." 158 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 159 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 160 | scores_local = self._relative_position_to_absolute_position(rel_logits) 161 | scores = scores + scores_local 162 | if self.proximal_bias: 163 | assert t_s == t_t, "Proximal bias is only available for self-attention." 164 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 165 | if mask is not None: 166 | scores = scores.masked_fill(mask == 0, -1e4) 167 | if self.block_length is not None: 168 | assert t_s == t_t, "Local attention is only available for self-attention." 169 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 170 | scores = scores.masked_fill(block_mask == 0, -1e4) 171 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 172 | p_attn = self.drop(p_attn) 173 | output = torch.matmul(p_attn, value) 174 | if self.window_size is not None: 175 | relative_weights = self._absolute_position_to_relative_position(p_attn) 176 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 177 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 178 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 179 | return output, p_attn 180 | 181 | def _matmul_with_relative_values(self, x, y): 182 | """ 183 | x: [b, h, l, m] 184 | y: [h or 1, m, d] 185 | ret: [b, h, l, d] 186 | """ 187 | ret = torch.matmul(x, y.unsqueeze(0)) 188 | return ret 189 | 190 | def _matmul_with_relative_keys(self, x, y): 191 | """ 192 | x: [b, h, l, d] 193 | y: [h or 1, m, d] 194 | ret: [b, h, l, m] 195 | """ 196 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 197 | return ret 198 | 199 | def _get_relative_embeddings(self, relative_embeddings, length): 200 | max_relative_position = 2 * self.window_size + 1 201 | # Pad first before slice to avoid using cond ops. 
202 | pad_length = max(length - (self.window_size + 1), 0) 203 | slice_start_position = max((self.window_size + 1) - length, 0) 204 | slice_end_position = slice_start_position + 2 * length - 1 205 | if pad_length > 0: 206 | padded_relative_embeddings = F.pad( 207 | relative_embeddings, 208 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 209 | else: 210 | padded_relative_embeddings = relative_embeddings 211 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 212 | return used_relative_embeddings 213 | 214 | def _relative_position_to_absolute_position(self, x): 215 | """ 216 | x: [b, h, l, 2*l-1] 217 | ret: [b, h, l, l] 218 | """ 219 | batch, heads, length, _ = x.size() 220 | # Concat columns of pad to shift from relative to absolute indexing. 221 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 222 | 223 | # Concat extra elements so as to add up to shape (len+1, 2*len-1). 224 | x_flat = x.view([batch, heads, length * 2 * length]) 225 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 226 | 227 | # Reshape and slice out the padded elements. 228 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 229 | return x_final 230 | 231 | def _absolute_position_to_relative_position(self, x): 232 | """ 233 | x: [b, h, l, l] 234 | ret: [b, h, l, 2*l-1] 235 | """ 236 | batch, heads, length, _ = x.size() 237 | # Pad along the last (column) dimension. 238 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 239 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 240 | # Add zeros at the beginning that will skew the elements after reshape. 241 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 242 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 243 | return x_final 244 | 245 | def _attention_bias_proximal(self, length): 246 | """Bias for self-attention to encourage attention to close positions. 247 | Args: 248 | length: an integer scalar.
249 | Returns: 250 | a Tensor with shape [1, 1, length, length] 251 | """ 252 | r = torch.arange(length, dtype=torch.float32) 253 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 254 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 255 | 256 | 257 | class FFN(nn.Module): 258 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 259 | super().__init__() 260 | self.in_channels = in_channels 261 | self.out_channels = out_channels 262 | self.filter_channels = filter_channels 263 | self.kernel_size = kernel_size 264 | self.p_dropout = p_dropout 265 | self.activation = activation 266 | self.causal = causal 267 | 268 | if causal: 269 | self.padding = self._causal_padding 270 | else: 271 | self.padding = self._same_padding 272 | 273 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 274 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 275 | self.drop = nn.Dropout(p_dropout) 276 | 277 | def forward(self, x, x_mask): 278 | x = self.conv_1(self.padding(x * x_mask)) 279 | if self.activation == "gelu": 280 | x = x * torch.sigmoid(1.702 * x) 281 | else: 282 | x = torch.relu(x) 283 | x = self.drop(x) 284 | x = self.conv_2(self.padding(x * x_mask)) 285 | return x * x_mask 286 | 287 | def _causal_padding(self, x): 288 | if self.kernel_size == 1: 289 | return x 290 | pad_l = self.kernel_size - 1 291 | pad_r = 0 292 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 293 | x = F.pad(x, commons.convert_pad_shape(padding)) 294 | return x 295 | 296 | def _same_padding(self, x): 297 | if self.kernel_size == 1: 298 | return x 299 | pad_l = (self.kernel_size - 1) // 2 300 | pad_r = self.kernel_size // 2 301 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 302 | x = F.pad(x, commons.convert_pad_shape(padding)) 303 | return x 304 | -------------------------------------------------------------------------------- /models/vits/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size*dilation - dilation)/2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d( 68 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 69 | position = torch.arange(length, dtype=torch.float) 70 | num_timescales = channels // 2 71 | log_timescale_increment = ( 72 | math.log(float(max_timescale) / float(min_timescale)) / 73 | (num_timescales - 1)) 74 | inv_timescales = min_timescale * torch.exp( 75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = 
cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2,3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1. / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /models/vits/configs/ljs_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 21 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 22 | "text_cleaners":["english_cleaners2"], 23 | "max_wav_value": 32768.0, 24 | "sampling_rate": 22050, 25 | "filter_length": 1024, 26 | "hop_length": 256, 27 | "win_length": 1024, 28 | "n_mel_channels": 80, 29 | "mel_fmin": 0.0, 30 | "mel_fmax": null, 31 | "add_blank": true, 32 | "n_speakers": 0, 33 | "cleaned_text": true 34 | }, 35 | "model": { 36 | "inter_channels": 192, 37 | "hidden_channels": 192, 38 | "filter_channels": 768, 39 | "n_heads": 2, 40 | "n_layers": 6, 41 | "kernel_size": 3, 42 | "p_dropout": 0.1, 43 | "resblock": "1", 44 | "resblock_kernel_sizes": [3,7,11], 45 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 46 | "upsample_rates": [8,8,2,2], 47 | "upsample_initial_channel": 512, 48 | "upsample_kernel_sizes": [16,16,4,4], 49 | "n_layers_q": 3, 50 | "use_spectral_norm": false 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /models/vits/configs/ljs_nosdp.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 21 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 22 | "text_cleaners":["english_cleaners2"], 23 | "max_wav_value": 32768.0, 24 | "sampling_rate": 22050, 25 | "filter_length": 1024, 26 | "hop_length": 256, 27 | "win_length": 1024, 28 | "n_mel_channels": 80, 29 | "mel_fmin": 0.0, 30 | "mel_fmax": null, 31 | "add_blank": true, 32 | "n_speakers": 
0, 33 | "cleaned_text": true 34 | }, 35 | "model": { 36 | "inter_channels": 192, 37 | "hidden_channels": 192, 38 | "filter_channels": 768, 39 | "n_heads": 2, 40 | "n_layers": 6, 41 | "kernel_size": 3, 42 | "p_dropout": 0.1, 43 | "resblock": "1", 44 | "resblock_kernel_sizes": [3,7,11], 45 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 46 | "upsample_rates": [8,8,2,2], 47 | "upsample_initial_channel": 512, 48 | "upsample_kernel_sizes": [16,16,4,4], 49 | "n_layers_q": 3, 50 | "use_spectral_norm": false, 51 | "use_sdp": false 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /models/vits/configs/vctk_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": true, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0 18 | }, 19 | "data": { 20 | "training_files":"filelists/vctk_audio_sid_text_train_filelist.txt.cleaned", 21 | "validation_files":"filelists/vctk_audio_sid_text_val_filelist.txt.cleaned", 22 | "text_cleaners":["english_cleaners2"], 23 | "max_wav_value": 32768.0, 24 | "sampling_rate": 22050, 25 | "filter_length": 1024, 26 | "hop_length": 256, 27 | "win_length": 1024, 28 | "n_mel_channels": 80, 29 | "mel_fmin": 0.0, 30 | "mel_fmax": null, 31 | "add_blank": true, 32 | "n_speakers": 109, 33 | "cleaned_text": true 34 | }, 35 | "model": { 36 | "inter_channels": 192, 37 | "hidden_channels": 192, 38 | "filter_channels": 768, 39 | "n_heads": 2, 40 | "n_layers": 6, 41 | "kernel_size": 3, 42 | "p_dropout": 0.1, 43 | "resblock": "1", 44 | "resblock_kernel_sizes": [3,7,11], 45 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 46 | "upsample_rates": [8,8,2,2], 47 | "upsample_initial_channel": 512, 48 | "upsample_kernel_sizes": [16,16,4,4], 49 | "n_layers_q": 3, 50 | "use_spectral_norm": false, 51 | "gin_channels": 256 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /models/vits/filelists/ljs_audio_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | DUMMY1/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | DUMMY1/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | DUMMY1/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | DUMMY1/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | DUMMY1/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others. 6 | DUMMY1/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | DUMMY1/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | DUMMY1/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. 
He denied having any involvement with Soviet intelligence agencies 9 | DUMMY1/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | DUMMY1/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade. 11 | DUMMY1/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | DUMMY1/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | DUMMY1/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | DUMMY1/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | DUMMY1/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | DUMMY1/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work. 17 | DUMMY1/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | DUMMY1/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | DUMMY1/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | DUMMY1/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound, 21 | DUMMY1/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window. 22 | DUMMY1/LJ026-0068.wav|Energy enters the plant, to a small extent, 23 | DUMMY1/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | DUMMY1/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | DUMMY1/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that 26 | DUMMY1/LJ012-0161.wav|he was reported to have fallen away to a shadow. 27 | DUMMY1/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | DUMMY1/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | DUMMY1/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | DUMMY1/LJ024-0083.wav|This plan of mine is no attack on the Court; 31 | DUMMY1/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | DUMMY1/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup. 33 | DUMMY1/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | DUMMY1/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | DUMMY1/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | DUMMY1/LJ009-0076.wav|We come to the sermon. 
37 | DUMMY1/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | DUMMY1/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | DUMMY1/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | DUMMY1/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | DUMMY1/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | DUMMY1/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | DUMMY1/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | DUMMY1/LJ012-0235.wav|While they were in a state of insensibility the murder was committed. 45 | DUMMY1/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | DUMMY1/LJ014-0030.wav|These were damnatory facts which well supported the prosecution. 47 | DUMMY1/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | DUMMY1/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 49 | DUMMY1/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | DUMMY1/LJ028-0275.wav|At last, in the twentieth month, 51 | DUMMY1/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | DUMMY1/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | DUMMY1/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County, 54 | DUMMY1/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | DUMMY1/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | DUMMY1/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | DUMMY1/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | DUMMY1/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | DUMMY1/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace, 60 | DUMMY1/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | DUMMY1/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true, 62 | DUMMY1/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | DUMMY1/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | DUMMY1/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 
65 | DUMMY1/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | DUMMY1/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | DUMMY1/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | DUMMY1/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | DUMMY1/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | DUMMY1/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 71 | DUMMY1/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | DUMMY1/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | DUMMY1/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present, 74 | DUMMY1/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | DUMMY1/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | DUMMY1/LJ008-0294.wav|nearly indefinitely deferred. 77 | DUMMY1/LJ047-0148.wav|On October twenty-five, 78 | DUMMY1/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner. 79 | DUMMY1/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old. 80 | DUMMY1/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | DUMMY1/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | DUMMY1/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | DUMMY1/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | DUMMY1/LJ012-0250.wav|On the seventh July, eighteen thirty-seven, 85 | DUMMY1/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job. 86 | DUMMY1/LJ016-0138.wav|at a distance from the prison. 87 | DUMMY1/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | DUMMY1/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | DUMMY1/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | DUMMY1/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | DUMMY1/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | DUMMY1/LJ038-0199.wav|eleven. If I am alive and taken prisoner, 93 | DUMMY1/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 
94 | DUMMY1/LJ033-0047.wav|I noticed when I went out that the light was on, end quote, 95 | DUMMY1/LJ040-0027.wav|He was never satisfied with anything. 96 | DUMMY1/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly. 97 | DUMMY1/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | DUMMY1/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | DUMMY1/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | DUMMY1/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 101 | -------------------------------------------------------------------------------- /models/vits/filelists/ljs_audio_text_val_filelist.txt.cleaned: -------------------------------------------------------------------------------- 1 | DUMMY1/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹɪɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wˌʌt ðeɪ hˈɪɹ ænd wˌʌt ðeɪ ɹˈiːd. 2 | DUMMY1/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː, ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt, tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ, ænd ˈɔːl ðə fˈɜːnɪtʃɚ, ˈaɪ wʊd biː mˈæd æz hˈɛl, tˈuː. 3 | DUMMY1/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹɪpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪnkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn eɪtˈiːn θˈɜːɾifˈaɪv. 4 | DUMMY1/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹɪspˈɛkt: 5 | DUMMY1/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹɪspˈɛkt wʌz tə θɹˈoʊ ðə ɹɪspˌɑːnsəbˈɪlɪɾi ˌɑːn ˈʌðɚz. 6 | DUMMY1/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛlɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌnkənvˈɪktᵻd pɹˈɪzənɚ, ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt, ænd stˈɪl ʌnkəntˈæmᵻnˌeɪɾᵻd, 7 | DUMMY1/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔːstˈeɪʃənɚz. hɪz ɐɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz. 8 | DUMMY1/LJ047-0044.wav|ˈɑːswəld wʌz, haʊˈɛvɚ, wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz. hiː dɪnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz 9 | DUMMY1/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ tʃˈɑːɹlz dʒˈeɪ. kˈæɹɪkˌoʊ, ɐ ɹˈɛzɪdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi. 10 | DUMMY1/LJ048-0194.wav|dˈʊɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛntitˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd. 11 | DUMMY1/LJ049-0026.wav|ˌɑːn əkˈeɪʒən ðə sˈiːkɹət sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt. 12 | DUMMY1/LJ004-0152.wav|ɑːlðˈoʊ æt mˈɪstɚ bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən, ðə fˈɜːst stˈɛp tʊwˈɔːɹdz ɹɪfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˌɛvəntˈiːn sˈɛvəntifˈoːɹ. 13 | DUMMY1/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni, ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsəsɚɹi tuː "mˌeɪk ɐn ɛɡzˈæmpəl." 14 | DUMMY1/LJ043-0002.wav|ðə wˈɔːɹən kəmˈɪʃən ɹɪpˈoːɹt. baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɑːnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi. tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: 15 | DUMMY1/LJ009-0114.wav|mˈɪstɚ wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dɪskɹˈaɪbɪŋ ɐnˈʌðɚ ɹɪlˈɪdʒəs sˈɜːvɪs, wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪətli biː ɪnsˈɜːɾᵻd hˈɪɹ. 16 | DUMMY1/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk. 
17 | DUMMY1/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd. ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzɪz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə 18 | DUMMY1/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp, hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪlɪˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən 19 | DUMMY1/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl, kwˈoʊt, wiː hɐd ɐ mˈoʊɾɚkˌeɪd wɛɹɹˈɛvɚ wiː wˈɛnt, ˈɛnd kwˈoʊt. 20 | DUMMY1/LJ031-0070.wav|dˈɑːktɚ klˈɑːɹk, hˌuː mˈoʊst klˈoʊsli ɑːbzˈɜːvd ðə hˈɛd wˈuːnd, 21 | DUMMY1/LJ034-0198.wav|jˈuːɪnz, hˌuː wʌz ɑːnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstɪfˌaɪd ðæt hiː kʊd nˌɑːt dɪskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ. 22 | DUMMY1/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt, tʊ ɐ smˈɔːl ɛkstˈɛnt, 23 | DUMMY1/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɑːnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsəsɚɹi. 24 | DUMMY1/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd 25 | DUMMY1/LJ005-0014.wav|spˈiːkɪŋ ˌɑːn ɐ dɪbˈeɪt ˌɑːn pɹˈɪzən mˈæɾɚz, hiː dᵻklˈɛɹd ðˈæt 26 | DUMMY1/LJ012-0161.wav|hiː wʌz ɹɪpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ. 27 | DUMMY1/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹɪpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹɪfˈɜːd tuː 28 | DUMMY1/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛdwˈiːl wʌz ɪn jˈuːs, ðɛɹ sˈɛljʊlɚ kɹˈæŋks, ɔːɹ hˈɑːɹdlˈeɪbɚ məʃˈiːnz. 29 | DUMMY1/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɑːn. 30 | DUMMY1/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɑːnðə kˈoːɹt; 31 | DUMMY1/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz, nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz. ˈaɪ hæv hɐd ɪnˈʌf. 32 | DUMMY1/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp. 33 | DUMMY1/LJ046-0058.wav|dˈʊɹɪŋ hɪz pɹˈɛzɪdənsi, fɹˈæŋklɪn dˈiː. ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹəd dʒˈɜːnɪz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹəd fˈɪfti θˈaʊzənd mˈaɪlz. 34 | DUMMY1/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ, ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv. 35 | DUMMY1/LJ002-0043.wav|lˈɑːŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾisˈɪks fˈiːt, sˈɪks twˈɛntiθɹˈiː fˈiːt, ænd ðɪ ˈeɪtθ eɪtˈiːn, 36 | DUMMY1/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən. 37 | DUMMY1/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hɐd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹɪpɹˈiːv, ænd wɪðˌɪn ɐ fjˈuː ˈaɪʊɹz ʌv ˌɛksɪkjˈuːʃən. 38 | DUMMY1/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹət sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹɪlˈiːsd ɔːɹ ɛskˈeɪps. 39 | DUMMY1/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ, ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt. 40 | DUMMY1/LJ042-0096.wav| ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt 41 | DUMMY1/LJ049-0050.wav|hˈɪl hɐd bˈoʊθ fˈiːt ɑːnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mɪsˈɛs kˈɛnədi. 42 | DUMMY1/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt, nˈuːɡeɪt ɹɪsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntɪz, 43 | DUMMY1/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs, ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsən ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd. 44 | DUMMY1/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd. 
45 | DUMMY1/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kənklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɑːnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld. 46 | DUMMY1/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən. 47 | DUMMY1/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt, ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ɪlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm? 48 | DUMMY1/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪt ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz. 49 | DUMMY1/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪsd ænd ɹɪpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɪz ʌvðə sˈɪɾi ʌv lˈʌndən, 50 | DUMMY1/LJ028-0275.wav|æt lˈæst, ɪnðə twˈɛntiəθ mˈʌnθ, 51 | DUMMY1/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋplˈeɪs wɪð ɐ tɹˈæpdˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd. 52 | DUMMY1/LJ011-0096.wav|hiː mˈæɹɪd ɐ lˈeɪdi ˈɑːlsoʊ bɪlˈɑːŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz, hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃən, wˈɪtʃ, ænd hɪz ˈoʊn mˈʌni, hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm, 53 | DUMMY1/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː. kɹˈeɪɡ, ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti, 54 | DUMMY1/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz, ɡɹˈeɪt lˈɔɪɚz, ɡˈʌvɚnɚz ʌv pɹˈɪzənz, ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː. 55 | DUMMY1/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst, ɐ səspˈɪʃəs sˈɜːkəmstˌæns, æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ. 56 | DUMMY1/LJ027-0141.wav|ɪz klˈoʊsli ɹɪpɹədˈuːst ɪnðə lˈaɪfhˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ. ˈɔːɹ, ɪn ˈʌðɚ wˈɜːdz, 57 | DUMMY1/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi, ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz. 58 | DUMMY1/LJ031-0202.wav|mɪsˈɛs kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hɐd sˈɜːvd ɪnðə nˈeɪvi. 59 | DUMMY1/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊptfɔːɹ pˈiəɹɪəd ʌv pˈiːs, 60 | DUMMY1/LJ016-0288.wav|"mˈʌlɚ, mˈʌlɚ, hiːz ðə mˈæn," tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz, wˌɪtʃ wʌz ɹɪsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz. 61 | DUMMY1/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ, wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdɪli dɪstˈɪŋɡwɪʃ ðə fˈɑːls fɹʌmðə tɹˈuː, 62 | DUMMY1/LJ018-0081.wav|hɪz dɪfˈɛns bˌiːɪŋ ðæt hiː hɐd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd, bˌʌt ðˈæt, ɑːnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hɐd ɹˈɔŋd hˌɪm, 63 | DUMMY1/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪnkɹiːs ɪnðə pˈeɪɹoʊlz, ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts 64 | DUMMY1/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp, bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd, ænd ðə mˈæn wʌz kˈæɹɪd bˈæk tə dʒˈeɪl. 65 | DUMMY1/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz, ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz. 66 | DUMMY1/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən, ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl. 67 | DUMMY1/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs, ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts, ðə hˈaʊskiːpɚ ðˈɛɹ. 68 | DUMMY1/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛntitˈuː, naɪntˈiːn sˈɪkstiθɹˈiː, fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈiəɹɪəl fɚðə pˌiːˌɑːɹˈɛs dʒˈɛnɚɹəl fˈaɪlz 69 | DUMMY1/LJ017-0044.wav|ænd ðə dˈiːpəst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm, ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn, ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ. 70 | DUMMY1/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ, ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn, ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ. 
71 | DUMMY1/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɐɹˈɛstᵻd ˌɑːn səspˈɪʃən, ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd; 72 | DUMMY1/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn, bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd, ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sɪvˈɪɹli. 73 | DUMMY1/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹɪhˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ. ɑːlðˈoʊ ɪt ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt, 74 | DUMMY1/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm. 75 | DUMMY1/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹɪkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɑːŋ ɪn səspˈɛns. 76 | DUMMY1/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dɪfˈɜːd. 77 | DUMMY1/LJ047-0148.wav|ˌɑːn ɑːktˈoʊbɚ twˈɛntifˈaɪv, 78 | DUMMY1/LJ008-0111.wav|ðeɪ ˈɛntɚd ˈeɪ "stˈoʊn kˈoʊld ɹˈuːm," ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ. 79 | DUMMY1/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstɪfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld. 80 | DUMMY1/LJ037-0234.wav|mɪsˈɛs mˈɛɹi bɹˈɑːk, ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən, wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl, 81 | DUMMY1/LJ040-0002.wav|tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: bˈækɡɹaʊnd ænd pˈɑːsəbəl mˈoʊɾɪvz, pˈɑːɹt wˌʌn. 82 | DUMMY1/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstɪfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bɪkˈʌmɪŋ ɪnvˈɑːlvd 83 | DUMMY1/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɑːn wˈɑːtʃᵻz, wɜː kˈɛɹfəli ɹɪmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz. 84 | DUMMY1/LJ012-0250.wav|ɑːnðə sˈɛvənθ dʒuːlˈaɪ, eɪtˈiːn θˈɜːɾisˈɛvən, 85 | DUMMY1/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈɛnɚz tə wˈɜːk baɪ ðə dʒˈɑːb. 86 | DUMMY1/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən. 87 | DUMMY1/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ɪsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi. 88 | DUMMY1/LJ031-0134.wav|ˌɑːn wˈʌn əkˈeɪʒən mɪsˈɛs dʒˈɑːnsən, ɐkˈʌmpənɪd baɪ tˈuː sˈiːkɹət sˈɜːvɪs ˈeɪdʒənts, lˈɛft ðə ɹˈuːm tə sˈiː mɪsˈɛs kˈɛnədi ænd mɪsˈɛs kənˈæli. 89 | DUMMY1/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn. 90 | DUMMY1/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd, ˈoʊpənd, ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts. 91 | DUMMY1/LJ034-0160.wav|ˌɑːn bɹˈɛnənz sˈʌbsɪkwənt sˈɜːtən aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl. 92 | DUMMY1/LJ038-0199.wav|ɪlˈɛvən. ɪf ˈaɪ æm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ, 93 | DUMMY1/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈɐd fɔːɹ hˌɪm, ænd ɹɪmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm. 94 | DUMMY1/LJ033-0047.wav|ˈaɪ nˈoʊɾɪsd wɛn ˈaɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɑːn, ˈɛnd kwˈoʊt, 95 | DUMMY1/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ. 96 | DUMMY1/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli. 97 | DUMMY1/LJ003-0111.wav|hiː wʌz ɪn kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː, ˈɛnd kwˈoʊt. ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɪˈɑːsɪɾi. 
98 | DUMMY1/LJ008-0258.wav|lˈɛt mˌiː ɹɪtɹˈeɪs maɪ stˈɛps, ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz, 99 | DUMMY1/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæn kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt, mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs, fˈɔːɹt wˈɜːθ, sˌæn æntˈoʊnɪˌoʊ, ænd hjˈuːstən. 100 | DUMMY1/LJ004-0045.wav|mˈɪstɚ stˈɜːdʒᵻz bˈoːɹn, sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ, sˌɜː dʒˈeɪmz skˈɑːɹlɪt, ænd wˈɪljəm wˈɪlbɚfˌoːɹs. 101 | -------------------------------------------------------------------------------- /models/vits/filelists/vctk_audio_sid_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | DUMMY2/p364/p364_240.wav|88|It had happened to him. 2 | DUMMY2/p280/p280_148.wav|52|It is open season on the Old Firm. 3 | DUMMY2/p231/p231_320.wav|50|However, he is a coach, and he remains a coach at heart. 4 | DUMMY2/p282/p282_129.wav|83|It is not a U-turn. 5 | DUMMY2/p254/p254_015.wav|41|The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain. 6 | DUMMY2/p228/p228_285.wav|57|The songs are just so good. 7 | DUMMY2/p334/p334_307.wav|38|If they don't, they can expect their funding to be cut. 8 | DUMMY2/p287/p287_081.wav|77|I've never seen anything like it. 9 | DUMMY2/p247/p247_083.wav|14|It is a job creation scheme.) 10 | DUMMY2/p264/p264_051.wav|65|We were leading by two goals.) 11 | DUMMY2/p335/p335_058.wav|49|Let's see that increase over the years. 12 | DUMMY2/p236/p236_225.wav|75|There is no quick fix. 13 | DUMMY2/p374/p374_353.wav|11|And that brings us to the point. 14 | DUMMY2/p272/p272_076.wav|69|Sounds like The Sixth Sense? 15 | DUMMY2/p271/p271_152.wav|27|The petition was formally presented at Downing Street yesterday. 16 | DUMMY2/p228/p228_127.wav|57|They've got to account for it. 17 | DUMMY2/p276/p276_223.wav|106|It's been a humbling year. 18 | DUMMY2/p262/p262_248.wav|45|The project has already secured the support of Sir Sean Connery. 19 | DUMMY2/p314/p314_086.wav|51|The team this year is going places. 20 | DUMMY2/p225/p225_038.wav|101|Diving is no part of football. 21 | DUMMY2/p279/p279_088.wav|25|The shareholders will vote to wind up the company on Friday morning. 22 | DUMMY2/p272/p272_018.wav|69|Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain. 23 | DUMMY2/p256/p256_098.wav|90|She told The Herald. 24 | DUMMY2/p261/p261_218.wav|100|All will be revealed in due course. 25 | DUMMY2/p265/p265_063.wav|73|IT shouldn't come as a surprise, but it does. 26 | DUMMY2/p314/p314_042.wav|51|It is all about people being assaulted, abused. 27 | DUMMY2/p241/p241_188.wav|86|I wish I could say something. 28 | DUMMY2/p283/p283_111.wav|95|It's good to have a voice. 29 | DUMMY2/p275/p275_006.wav|40|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. 30 | DUMMY2/p228/p228_092.wav|57|Today I couldn't run on it. 31 | DUMMY2/p295/p295_343.wav|92|The atmosphere is businesslike. 32 | DUMMY2/p228/p228_187.wav|57|They will run a mile. 33 | DUMMY2/p294/p294_317.wav|104|It didn't put me off. 34 | DUMMY2/p231/p231_445.wav|50|It sounded like a bomb. 35 | DUMMY2/p272/p272_086.wav|69|Today she has been released. 36 | DUMMY2/p255/p255_210.wav|31|It was worth a photograph. 37 | DUMMY2/p229/p229_060.wav|67|And a film maker was born. 38 | DUMMY2/p260/p260_232.wav|81|The Home Office would not release any further details about the group. 39 | DUMMY2/p245/p245_025.wav|59|Johnson was pretty low. 
40 | DUMMY2/p333/p333_185.wav|64|This area is perfect for children. 41 | DUMMY2/p244/p244_242.wav|78|He is a man of the people. 42 | DUMMY2/p376/p376_187.wav|71|"It is a terrible loss." 43 | DUMMY2/p239/p239_156.wav|48|It is a good lifestyle. 44 | DUMMY2/p307/p307_037.wav|22|He released a half-dozen solo albums. 45 | DUMMY2/p305/p305_185.wav|54|I am not even thinking about that. 46 | DUMMY2/p272/p272_081.wav|69|It was magic. 47 | DUMMY2/p302/p302_297.wav|30|I'm trying to stay open on that. 48 | DUMMY2/p275/p275_320.wav|40|We are in the end game. 49 | DUMMY2/p239/p239_231.wav|48|Then we will face the Danish champions. 50 | DUMMY2/p268/p268_301.wav|87|It was only later that the condition was diagnosed. 51 | DUMMY2/p336/p336_088.wav|98|They failed to reach agreement yesterday. 52 | DUMMY2/p278/p278_255.wav|10|They made such decisions in London. 53 | DUMMY2/p361/p361_132.wav|79|That got me out. 54 | DUMMY2/p307/p307_146.wav|22|You hope he prevails. 55 | DUMMY2/p244/p244_147.wav|78|They could not ignore the will of parliament, he claimed. 56 | DUMMY2/p294/p294_283.wav|104|This is our unfinished business. 57 | DUMMY2/p283/p283_300.wav|95|I would have the hammer in the crowd. 58 | DUMMY2/p239/p239_079.wav|48|I can understand the frustrations of our fans. 59 | DUMMY2/p264/p264_009.wav|65|There is , according to legend, a boiling pot of gold at one end. ) 60 | DUMMY2/p307/p307_348.wav|22|He did not oppose the divorce. 61 | DUMMY2/p304/p304_308.wav|72|We are the gateway to justice. 62 | DUMMY2/p281/p281_056.wav|36|None has ever been found. 63 | DUMMY2/p267/p267_158.wav|0|We were given a warm and friendly reception. 64 | DUMMY2/p300/p300_169.wav|102|Who do these people think they are? 65 | DUMMY2/p276/p276_177.wav|106|They exist in name alone. 66 | DUMMY2/p228/p228_245.wav|57|It is a policy which has the full support of the minister. 67 | DUMMY2/p300/p300_303.wav|102|I'm wondering what you feel about the youngest. 68 | DUMMY2/p362/p362_247.wav|15|This would give Scotland around eight members. 69 | DUMMY2/p326/p326_031.wav|28|United were in control without always being dominant. 70 | DUMMY2/p361/p361_288.wav|79|I did not think it was very proper. 71 | DUMMY2/p286/p286_145.wav|63|Tiger is not the norm. 72 | DUMMY2/p234/p234_071.wav|3|She did that for the rest of her life. 73 | DUMMY2/p263/p263_296.wav|39|The decision was announced at its annual conference in Dunfermline. 74 | DUMMY2/p323/p323_228.wav|34|She became a heroine of my childhood. 75 | DUMMY2/p280/p280_346.wav|52|It was a bit like having children. 76 | DUMMY2/p333/p333_080.wav|64|But the tragedy did not stop there. 77 | DUMMY2/p226/p226_268.wav|43|That decision is for the British Parliament and people. 78 | DUMMY2/p362/p362_314.wav|15|Is that right? 79 | DUMMY2/p240/p240_047.wav|93|It is so sad. 80 | DUMMY2/p250/p250_207.wav|24|You could feel the heat. 81 | DUMMY2/p273/p273_176.wav|56|Neither side would reveal the details of the offer. 82 | DUMMY2/p316/p316_147.wav|85|And frankly, it's been a while. 83 | DUMMY2/p265/p265_047.wav|73|It is unique. 84 | DUMMY2/p336/p336_353.wav|98|Sometimes you get them, sometimes you don't. 85 | DUMMY2/p230/p230_376.wav|35|This hasn't happened in a vacuum. 86 | DUMMY2/p308/p308_209.wav|107|There is great potential on this river. 87 | DUMMY2/p250/p250_442.wav|24|We have not yet received a letter from the Irish. 88 | DUMMY2/p260/p260_037.wav|81|It's a fact. 89 | DUMMY2/p299/p299_345.wav|58|We're very excited and challenged by the project. 90 | DUMMY2/p269/p269_218.wav|94|A Grampian Police spokesman said. 
91 | DUMMY2/p306/p306_014.wav|12|To the Hebrews it was a token that there would be no more universal floods. 92 | DUMMY2/p271/p271_292.wav|27|It's a record label, not a form of music. 93 | DUMMY2/p247/p247_225.wav|14|I am considered a teenager.) 94 | DUMMY2/p294/p294_094.wav|104|It should be a condition of employment. 95 | DUMMY2/p269/p269_031.wav|94|Is this accurate? 96 | DUMMY2/p275/p275_116.wav|40|It's not fair. 97 | DUMMY2/p265/p265_006.wav|73|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. 98 | DUMMY2/p285/p285_072.wav|2|Mr Irvine said Mr Rafferty was now in good spirits. 99 | DUMMY2/p270/p270_167.wav|8|We did what we had to do. 100 | DUMMY2/p360/p360_397.wav|60|It is a relief. 101 | -------------------------------------------------------------------------------- /models/vits/filelists/vctk_audio_sid_text_val_filelist.txt.cleaned: -------------------------------------------------------------------------------- 1 | DUMMY2/p364/p364_240.wav|88|ɪt hɐd hˈæpənd tə hˌɪm. 2 | DUMMY2/p280/p280_148.wav|52|ɪt ɪz ˈoʊpən sˈiːzən ɑːnðɪ ˈoʊld fˈɜːm. 3 | DUMMY2/p231/p231_320.wav|50|haʊˈɛvɚ, hiː ɪz ɐ kˈoʊtʃ, ænd hiː ɹɪmˈeɪnz ɐ kˈoʊtʃ æt hˈɑːɹt. 4 | DUMMY2/p282/p282_129.wav|83|ɪt ɪz nˌɑːɾə jˈuːtˈɜːn. 5 | DUMMY2/p254/p254_015.wav|41|ðə ɡɹˈiːks jˈuːzd tʊ ɪmˈædʒɪn ðˌɐɾɪt wʌzɐ sˈaɪn fɹʌmðə ɡˈɑːdz tə foːɹtˈɛl wˈɔːɹ ɔːɹ hˈɛvi ɹˈeɪn. 6 | DUMMY2/p228/p228_285.wav|57|ðə sˈɔŋz ɑːɹ dʒˈʌst sˌoʊ ɡˈʊd. 7 | DUMMY2/p334/p334_307.wav|38|ɪf ðeɪ dˈoʊnt, ðeɪ kæn ɛkspˈɛkt ðɛɹ fˈʌndɪŋ təbi kˈʌt. 8 | DUMMY2/p287/p287_081.wav|77|aɪv nˈɛvɚ sˈiːn ˈɛnɪθˌɪŋ lˈaɪk ɪt. 9 | DUMMY2/p247/p247_083.wav|14|ɪt ɪz ɐ dʒˈɑːb kɹiːˈeɪʃən skˈiːm. 10 | DUMMY2/p264/p264_051.wav|65|wiː wɜː lˈiːdɪŋ baɪ tˈuː ɡˈoʊlz. 11 | DUMMY2/p335/p335_058.wav|49|lˈɛts sˈiː ðæt ˈɪnkɹiːs ˌoʊvɚ ðə jˈɪɹz. 12 | DUMMY2/p236/p236_225.wav|75|ðɛɹ ɪz nˈoʊ kwˈɪk fˈɪks. 13 | DUMMY2/p374/p374_353.wav|11|ænd ðæt bɹˈɪŋz ˌʌs tə ðə pˈɔɪnt. 14 | DUMMY2/p272/p272_076.wav|69|sˈaʊndz lˈaɪk ðə sˈɪksθ sˈɛns? 15 | DUMMY2/p271/p271_152.wav|27|ðə pətˈɪʃən wʌz fˈɔːɹməli pɹɪzˈɛntᵻd æt dˈaʊnɪŋ stɹˈiːt jˈɛstɚdˌeɪ. 16 | DUMMY2/p228/p228_127.wav|57|ðeɪv ɡɑːt tʊ ɐkˈaʊnt fɔːɹ ɪt. 17 | DUMMY2/p276/p276_223.wav|106|ɪts bˌɪn ɐ hˈʌmblɪŋ jˈɪɹ. 18 | DUMMY2/p262/p262_248.wav|45|ðə pɹˈɑːdʒɛkt hɐz ɔːlɹˌɛdi sɪkjˈʊɹd ðə səpˈoːɹt ʌv sˌɜː ʃˈɔːn kɑːnɚɹi. 19 | DUMMY2/p314/p314_086.wav|51|ðə tˈiːm ðɪs jˈɪɹ ɪz ɡˌoʊɪŋ plˈeɪsᵻz. 20 | DUMMY2/p225/p225_038.wav|101|dˈaɪvɪŋ ɪz nˈoʊ pˈɑːɹt ʌv fˈʊtbɔːl. 21 | DUMMY2/p279/p279_088.wav|25|ðə ʃˈɛɹhoʊldɚz wɪl vˈoʊt tə wˈaɪnd ˈʌp ðə kˈʌmpəni ˌɑːn fɹˈaɪdeɪ mˈɔːɹnɪŋ. 22 | DUMMY2/p272/p272_018.wav|69|ˈæɹɪstˌɑːɾəl θˈɔːt ðætðə ɹˈeɪnboʊ wʌz kˈɔːzd baɪ ɹɪflˈɛkʃən ʌvðə sˈʌnz ɹˈeɪz baɪ ðə ɹˈeɪn. 23 | DUMMY2/p256/p256_098.wav|90|ʃiː tˈoʊld ðə hˈɛɹəld. 24 | DUMMY2/p261/p261_218.wav|100|ˈɔːl wɪl biː ɹɪvˈiːld ɪn dˈuː kˈoːɹs. 25 | DUMMY2/p265/p265_063.wav|73|ɪt ʃˌʊdənt kˈʌm æz ɐ sɚpɹˈaɪz, bˌʌt ɪt dˈʌz. 26 | DUMMY2/p314/p314_042.wav|51|ɪt ɪz ˈɔːl ɐbˌaʊt pˈiːpəl bˌiːɪŋ ɐsˈɑːltᵻd, ɐbjˈuːsd. 27 | DUMMY2/p241/p241_188.wav|86|ˈaɪ wˈɪʃ ˈaɪ kʊd sˈeɪ sˈʌmθɪŋ. 28 | DUMMY2/p283/p283_111.wav|95|ɪts ɡˈʊd tə hæv ɐ vˈɔɪs. 29 | DUMMY2/p275/p275_006.wav|40|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ. 30 | DUMMY2/p228/p228_092.wav|57|tədˈeɪ ˈaɪ kˌʊdənt ɹˈʌn ˈɑːn ɪt. 31 | DUMMY2/p295/p295_343.wav|92|ðɪ ˈætməsfˌɪɹ ɪz bˈɪznəslˌaɪk. 32 | DUMMY2/p228/p228_187.wav|57|ðeɪ wɪl ɹˈʌn ɐ mˈaɪl. 33 | DUMMY2/p294/p294_317.wav|104|ɪt dˈɪdnt pˌʊt mˌiː ˈɔf. 34 | DUMMY2/p231/p231_445.wav|50|ɪt sˈaʊndᵻd lˈaɪk ɐ bˈɑːm. 
35 | DUMMY2/p272/p272_086.wav|69|tədˈeɪ ʃiː hɐzbɪn ɹɪlˈiːsd. 36 | DUMMY2/p255/p255_210.wav|31|ɪt wʌz wˈɜːθ ɐ fˈoʊɾəɡɹˌæf. 37 | DUMMY2/p229/p229_060.wav|67|ænd ɐ fˈɪlm mˈeɪkɚ wʌz bˈɔːɹn. 38 | DUMMY2/p260/p260_232.wav|81|ðə hˈoʊm ˈɑːfɪs wʊd nˌɑːt ɹɪlˈiːs ˌɛni fˈɜːðɚ diːtˈeɪlz ɐbˌaʊt ðə ɡɹˈuːp. 39 | DUMMY2/p245/p245_025.wav|59|dʒˈɑːnsən wʌz pɹˈɪɾi lˈoʊ. 40 | DUMMY2/p333/p333_185.wav|64|ðɪs ˈɛɹiə ɪz pˈɜːfɛkt fɔːɹ tʃˈɪldɹən. 41 | DUMMY2/p244/p244_242.wav|78|hiː ɪz ɐ mˈæn ʌvðə pˈiːpəl. 42 | DUMMY2/p376/p376_187.wav|71|"ɪt ɪz ɐ tˈɛɹəbəl lˈɔs." 43 | DUMMY2/p239/p239_156.wav|48|ɪt ɪz ɐ ɡˈʊd lˈaɪfstaɪl. 44 | DUMMY2/p307/p307_037.wav|22|hiː ɹɪlˈiːsd ɐ hˈæfdˈʌzən sˈoʊloʊ ˈælbəmz. 45 | DUMMY2/p305/p305_185.wav|54|ˈaɪ æm nˌɑːt ˈiːvən θˈɪŋkɪŋ ɐbˌaʊt ðˈæt. 46 | DUMMY2/p272/p272_081.wav|69|ɪt wʌz mˈædʒɪk. 47 | DUMMY2/p302/p302_297.wav|30|aɪm tɹˈaɪɪŋ tə stˈeɪ ˈoʊpən ˌɑːn ðˈæt. 48 | DUMMY2/p275/p275_320.wav|40|wiː ɑːɹ ɪnðɪ ˈɛnd ɡˈeɪm. 49 | DUMMY2/p239/p239_231.wav|48|ðˈɛn wiː wɪl fˈeɪs ðə dˈeɪnɪʃ tʃˈæmpiənz. 50 | DUMMY2/p268/p268_301.wav|87|ɪt wʌz ˈoʊnli lˈeɪɾɚ ðætðə kəndˈɪʃən wʌz dˌaɪəɡnˈoʊzd. 51 | DUMMY2/p336/p336_088.wav|98|ðeɪ fˈeɪld tə ɹˈiːtʃ ɐɡɹˈiːmənt jˈɛstɚdˌeɪ. 52 | DUMMY2/p278/p278_255.wav|10|ðeɪ mˌeɪd sˈʌtʃ dᵻsˈɪʒənz ɪn lˈʌndən. 53 | DUMMY2/p361/p361_132.wav|79|ðæt ɡɑːt mˌiː ˈaʊt. 54 | DUMMY2/p307/p307_146.wav|22|juː hˈoʊp hiː pɹɪvˈeɪlz. 55 | DUMMY2/p244/p244_147.wav|78|ðeɪ kʊd nˌɑːt ɪɡnˈoːɹ ðə wɪl ʌv pˈɑːɹləmənt, hiː klˈeɪmd. 56 | DUMMY2/p294/p294_283.wav|104|ðɪs ɪz ˌaʊɚɹ ʌnfˈɪnɪʃt bˈɪznəs. 57 | DUMMY2/p283/p283_300.wav|95|ˈaɪ wʊdhɐv ðə hˈæmɚɹ ɪnðə kɹˈaʊd. 58 | DUMMY2/p239/p239_079.wav|48|ˈaɪ kæn ˌʌndɚstˈænd ðə fɹʌstɹˈeɪʃənz ʌv ˌaʊɚ fˈænz. 59 | DUMMY2/p264/p264_009.wav|65|ðɛɹˈɪz , ɐkˈoːɹdɪŋ tə lˈɛdʒənd, ɐ bˈɔɪlɪŋ pˈɑːt ʌv ɡˈoʊld æt wˈʌn ˈɛnd. 60 | DUMMY2/p307/p307_348.wav|22|hiː dɪdnˌɑːt əpˈoʊz ðə dɪvˈoːɹs. 61 | DUMMY2/p304/p304_308.wav|72|wiː ɑːɹ ðə ɡˈeɪtweɪ tə dʒˈʌstɪs. 62 | DUMMY2/p281/p281_056.wav|36|nˈʌn hɐz ˈɛvɚ bˌɪn fˈaʊnd. 63 | DUMMY2/p267/p267_158.wav|0|wiː wɜː ɡˈɪvən ɐ wˈɔːɹm ænd fɹˈɛndli ɹɪsˈɛpʃən. 64 | DUMMY2/p300/p300_169.wav|102|hˌuː dˈuː ðiːz pˈiːpəl θˈɪŋk ðeɪ ɑːɹ? 65 | DUMMY2/p276/p276_177.wav|106|ðeɪ ɛɡzˈɪst ɪn nˈeɪm ɐlˈoʊn. 66 | DUMMY2/p228/p228_245.wav|57|ɪt ɪz ɐ pˈɑːlɪsi wˌɪtʃ hɐz ðə fˈʊl səpˈoːɹt ʌvðə mˈɪnɪstɚ. 67 | DUMMY2/p300/p300_303.wav|102|aɪm wˈʌndɚɹɪŋ wˌʌt juː fˈiːl ɐbˌaʊt ðə jˈʌŋɡəst. 68 | DUMMY2/p362/p362_247.wav|15|ðɪs wʊd ɡˈɪv skˈɑːtlənd ɐɹˈaʊnd ˈeɪt mˈɛmbɚz. 69 | DUMMY2/p326/p326_031.wav|28|juːnˈaɪɾᵻd wɜːɹ ɪn kəntɹˈoʊl wɪðˌaʊt ˈɔːlweɪz bˌiːɪŋ dˈɑːmɪnənt. 70 | DUMMY2/p361/p361_288.wav|79|ˈaɪ dɪdnˌɑːt θˈɪŋk ɪt wʌz vˈɛɹi pɹˈɑːpɚ. 71 | DUMMY2/p286/p286_145.wav|63|tˈaɪɡɚɹ ɪz nˌɑːt ðə nˈɔːɹm. 72 | DUMMY2/p234/p234_071.wav|3|ʃiː dˈɪd ðæt fɚðə ɹˈɛst ʌv hɜː lˈaɪf. 73 | DUMMY2/p263/p263_296.wav|39|ðə dᵻsˈɪʒən wʌz ɐnˈaʊnst æt ɪts ˈænjuːəl kˈɑːnfɹəns ɪn dˈʌnfɚmlˌaɪn. 74 | DUMMY2/p323/p323_228.wav|34|ʃiː bɪkˌeɪm ɐ hˈɛɹoʊˌɪn ʌv maɪ tʃˈaɪldhʊd. 75 | DUMMY2/p280/p280_346.wav|52|ɪt wʌzɐ bˈɪt lˈaɪk hˌævɪŋ tʃˈɪldɹən. 76 | DUMMY2/p333/p333_080.wav|64|bˌʌt ðə tɹˈædʒədi dɪdnˌɑːt stˈɑːp ðˈɛɹ. 77 | DUMMY2/p226/p226_268.wav|43|ðæt dᵻsˈɪʒən ɪz fɚðə bɹˈɪɾɪʃ pˈɑːɹləmənt ænd pˈiːpəl. 78 | DUMMY2/p362/p362_314.wav|15|ɪz ðæt ɹˈaɪt? 79 | DUMMY2/p240/p240_047.wav|93|ɪt ɪz sˌoʊ sˈæd. 80 | DUMMY2/p250/p250_207.wav|24|juː kʊd fˈiːl ðə hˈiːt. 81 | DUMMY2/p273/p273_176.wav|56|nˈiːðɚ sˈaɪd wʊd ɹɪvˈiːl ðə diːtˈeɪlz ʌvðɪ ˈɑːfɚ. 82 | DUMMY2/p316/p316_147.wav|85|ænd fɹˈæŋkli, ɪts bˌɪn ɐ wˈaɪl. 83 | DUMMY2/p265/p265_047.wav|73|ɪt ɪz juːnˈiːk. 
84 | DUMMY2/p336/p336_353.wav|98|sˈʌmtaɪmz juː ɡˈɛt ðˌɛm, sˈʌmtaɪmz juː dˈoʊnt. 85 | DUMMY2/p230/p230_376.wav|35|ðɪs hˈæzənt hˈæpənd ɪn ɐ vˈækjuːm. 86 | DUMMY2/p308/p308_209.wav|107|ðɛɹ ɪz ɡɹˈeɪt pətˈɛnʃəl ˌɑːn ðɪs ɹˈɪvɚ. 87 | DUMMY2/p250/p250_442.wav|24|wiː hɐvnˌɑːt jˈɛt ɹɪsˈiːvd ɐ lˈɛɾɚ fɹʌmðɪ ˈaɪɹɪʃ. 88 | DUMMY2/p260/p260_037.wav|81|ɪts ɐ fˈækt. 89 | DUMMY2/p299/p299_345.wav|58|wɪɹ vˈɛɹi ɛksˈaɪɾᵻd ænd tʃˈælɪndʒd baɪ ðə pɹˈɑːdʒɛkt. 90 | DUMMY2/p269/p269_218.wav|94|ɐ ɡɹˈæmpiən pəlˈiːs spˈoʊksmən sˈɛd. 91 | DUMMY2/p306/p306_014.wav|12|tə ðə hˈiːbɹuːz ɪt wʌzɐ tˈoʊkən ðæt ðɛɹ wʊd biː nˈoʊmˌoːɹ jˌuːnɪvˈɜːsəl flˈʌdz. 92 | DUMMY2/p271/p271_292.wav|27|ɪts ɐ ɹˈɛkɚd lˈeɪbəl, nˌɑːɾə fˈɔːɹm ʌv mjˈuːzɪk. 93 | DUMMY2/p247/p247_225.wav|14|ˈaɪ æm kənsˈɪdɚd ɐ tˈiːneɪdʒɚ. 94 | DUMMY2/p294/p294_094.wav|104|ɪt ʃˌʊd biː ɐ kəndˈɪʃən ʌv ɛmplˈɔɪmənt. 95 | DUMMY2/p269/p269_031.wav|94|ɪz ðɪs ˈækjʊɹət? 96 | DUMMY2/p275/p275_116.wav|40|ɪts nˌɑːt fˈɛɹ. 97 | DUMMY2/p265/p265_006.wav|73|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ. 98 | DUMMY2/p285/p285_072.wav|2|mˈɪstɚɹ ˈɜːvaɪn sˈɛd mˈɪstɚ ɹˈæfɚɾi wʌz nˈaʊ ɪn ɡˈʊd spˈɪɹɪts. 99 | DUMMY2/p270/p270_167.wav|8|wiː dˈɪd wˌʌt wiː hædtə dˈuː. 100 | DUMMY2/p360/p360_397.wav|60|ɪt ɪz ɐ ɹɪlˈiːf. 101 | -------------------------------------------------------------------------------- /models/vits/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import commons 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1-dr)**2) 26 | g_loss = torch.mean(dg**2) 27 | loss += (r_loss + g_loss) 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1-dg)**2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. 
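        # Single-sample estimate of KL(q || p) for diagonal Gaussians, evaluated at
        # z_p ~ q: log(sigma_p) - log(sigma_q) - 1/2 + (z_p - m_p)^2 / (2 * sigma_p^2),
        # where exp(-2 * logs_p) = 1 / sigma_p^2; the result is masked by z_mask and
        # length-normalized below.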
* logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | -------------------------------------------------------------------------------- /models/vits/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + '_' + str(y.device) 59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 70 | return spec 71 | 72 | 73 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 74 | global mel_basis 75 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 76 | fmax_dtype_device = str(fmax) + '_' + dtype_device 77 | if fmax_dtype_device not in mel_basis: 78 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 79 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 80 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 81 | spec = spectral_normalize_torch(spec) 82 | return spec 83 | 84 | 85 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 86 | if torch.min(y) < -1.: 87 | print('min value is ', torch.min(y)) 88 | if torch.max(y) > 1.: 89 | print('max value is ', torch.max(y)) 90 | 91 | global mel_basis, hann_window 92 | dtype_device = str(y.dtype) + '_' + str(y.device) 93 | fmax_dtype_device = str(fmax) + '_' + dtype_device 94 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 95 | if fmax_dtype_device not in mel_basis: 96 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 97 | 
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 98 | if wnsize_dtype_device not in hann_window: 99 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 100 | 101 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 102 | y = y.squeeze(1) 103 | 104 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 105 | center=center, pad_mode='reflect', normalized=False, onesided=True) 106 | 107 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 108 | 109 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 110 | spec = spectral_normalize_torch(spec) 111 | 112 | return spec 113 | -------------------------------------------------------------------------------- /models/vits/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # import torch 3 | # from .monotonic_align.core import maximum_path_c 4 | 5 | 6 | # def maximum_path(neg_cent, mask): 7 | # """ Cython optimized version. 8 | # neg_cent: [b, t_t, t_s] 9 | # mask: [b, t_t, t_s] 10 | # """ 11 | # device = neg_cent.device 12 | # dtype = neg_cent.dtype 13 | # neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) 14 | # path = np.zeros(neg_cent.shape, dtype=np.int32) 15 | 16 | # t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) 17 | # t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) 18 | # maximum_path_c(path, neg_cent, t_t_max, t_s_max) 19 | # return torch.from_numpy(path).to(device=device, dtype=dtype) 20 | -------------------------------------------------------------------------------- /models/vits/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | from cython.parallel import prange 3 | 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 8 | cdef int x 9 | cdef int y 10 | cdef float v_prev 11 | cdef float v_cur 12 | cdef float tmp 13 | cdef int index = t_x - 1 14 | 15 | for y in range(t_y): 16 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 17 | if x == y: 18 | v_cur = max_neg_val 19 | else: 20 | v_cur = value[y-1, x] 21 | if x == 0: 22 | if y == 0: 23 | v_prev = 0. 
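                    # Base case of the monotonic-alignment DP: the path starts at
                    # cell (0, 0) with zero accumulated score. Every other cell
                    # takes the better of v_prev ("advance", value[y-1, x-1]) and
                    # v_cur ("stay", value[y-1, x]); the second loop below
                    # backtracks the argmax path into `path`.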
24 | else: 25 | v_prev = max_neg_val 26 | else: 27 | v_prev = value[y-1, x-1] 28 | value[y, x] += max(v_prev, v_cur) 29 | 30 | for y in range(t_y - 1, -1, -1): 31 | path[y, index] = 1 32 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): 33 | index = index - 1 34 | 35 | 36 | @cython.boundscheck(False) 37 | @cython.wraparound(False) 38 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 39 | cdef int b = paths.shape[0] 40 | cdef int i 41 | for i in prange(b, nogil=True): 42 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 43 | -------------------------------------------------------------------------------- /models/vits/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | import numpy 4 | 5 | setup( 6 | name = 'monotonic_align', 7 | ext_modules = cythonize("core.pyx"), 8 | include_dirs=[numpy.get_include()] 9 | ) 10 | -------------------------------------------------------------------------------- /models/vits/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import text 3 | from utils import load_filepaths_and_text 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--out_extension", default="cleaned") 8 | parser.add_argument("--text_index", default=1, type=int) 9 | parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"]) 10 | parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"]) 11 | 12 | args = parser.parse_args() 13 | 14 | 15 | for filelist in args.filelists: 16 | print("START:", filelist) 17 | filepaths_and_text = load_filepaths_and_text(filelist) 18 | for i in range(len(filepaths_and_text)): 19 | original_text = filepaths_and_text[i][args.text_index] 20 | cleaned_text = text._clean_text(original_text, args.text_cleaners) 21 | filepaths_and_text[i][args.text_index] = cleaned_text 22 | 23 | new_filelist = filelist + "." 
+ args.out_extension 24 | with open(new_filelist, "w", encoding="utf-8") as f: 25 | f.writelines(["|".join(x) + "\n" for x in filepaths_and_text]) 26 | -------------------------------------------------------------------------------- /models/vits/pretrained_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/vits/pretrained_models/.gitkeep -------------------------------------------------------------------------------- /models/vits/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.21 2 | librosa==0.8.0 3 | matplotlib==3.3.1 4 | numpy==1.18.5 5 | phonemizer==3.2.1 6 | scipy==1.5.2 7 | tensorboard==2.3.0 8 | torch==1.6.0 9 | torchvision==0.7.0 10 | Unidecode==1.1.1 -------------------------------------------------------------------------------- /models/vits/resources/fig_1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/vits/resources/fig_1a.png -------------------------------------------------------------------------------- /models/vits/resources/fig_1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/vits/resources/fig_1b.png -------------------------------------------------------------------------------- /models/vits/resources/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/vits/resources/training.png -------------------------------------------------------------------------------- /models/vits/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /models/vits/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | from text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | 11 | def text_to_sequence(text, cleaner_names): 12 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 13 | Args: 14 | text: string to convert to a sequence 15 | cleaner_names: names of the cleaner functions to run the text through 16 | Returns: 17 | List of integers corresponding to the symbols in the text 18 | ''' 19 | sequence = [] 20 | 21 | clean_text = _clean_text(text, cleaner_names) 22 | for symbol in clean_text: 23 | symbol_id = _symbol_to_id[symbol] 24 | sequence += [symbol_id] 25 | return sequence 26 | 27 | 28 | def cleaned_text_to_sequence(cleaned_text): 29 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 30 | Args: 31 | text: string to convert to a sequence 32 | Returns: 33 | List of integers corresponding to the symbols in the text 34 | ''' 35 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 36 | return sequence 37 | 38 | 39 | def sequence_to_text(sequence): 40 | '''Converts a sequence of IDs back to a string''' 41 | result = '' 42 | for symbol_id in sequence: 43 | s = _id_to_symbol[symbol_id] 44 | result += s 45 | return result 46 | 47 | 48 | def _clean_text(text, cleaner_names): 49 | for name in cleaner_names: 50 | cleaner = getattr(cleaners, name) 51 | if not cleaner: 52 | raise Exception('Unknown cleaner: %s' % name) 53 | text = cleaner(text) 54 | return text 55 | -------------------------------------------------------------------------------- /models/vits/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from phonemizer import phonemize 18 | from phonemizer.backend import EspeakBackend 19 | from phonemizer.phonemize import _phonemize 20 | from phonemizer.separator import default_separator, Separator 21 | 22 | # Regular expression matching whitespace: 23 | _whitespace_re = re.compile(r'\s+') 24 | 25 | # List of (regular expression, replacement) pairs for abbreviations: 26 | _abbreviations = [(re.compile('\\b%s\\.' 
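# Each compiled pattern matches the abbreviation as a whole word followed by a
# literal period, case-insensitively; expand_abbreviations() below substitutes
# the spelled-out replacement. NOTE: expand_numbers() further down calls
# normalize_numbers(), which this module never imports (upstream it comes from
# keithito's tacotron text.numbers); none of the cleaners defined here invoke
# it, so the missing import only matters if expand_numbers() is called directly.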
% x[0], re.IGNORECASE), x[1]) for x in [ 27 | ('mrs', 'misess'), 28 | ('mr', 'mister'), 29 | ('dr', 'doctor'), 30 | ('st', 'saint'), 31 | ('co', 'company'), 32 | ('jr', 'junior'), 33 | ('maj', 'major'), 34 | ('gen', 'general'), 35 | ('drs', 'doctors'), 36 | ('rev', 'reverend'), 37 | ('lt', 'lieutenant'), 38 | ('hon', 'honorable'), 39 | ('sgt', 'sergeant'), 40 | ('capt', 'captain'), 41 | ('esq', 'esquire'), 42 | ('ltd', 'limited'), 43 | ('col', 'colonel'), 44 | ('ft', 'fort'), 45 | ]] 46 | 47 | # espeak backend (instantiate once to avoid memory leaks) 48 | espeak_backend = EspeakBackend('en-us', preserve_punctuation=True, with_stress=True) 49 | 50 | 51 | def expand_abbreviations(text): 52 | for regex, replacement in _abbreviations: 53 | text = re.sub(regex, replacement, text) 54 | return text 55 | 56 | 57 | def expand_numbers(text): 58 | return normalize_numbers(text) 59 | 60 | 61 | def lowercase(text): 62 | return text.lower() 63 | 64 | 65 | def collapse_whitespace(text): 66 | return re.sub(_whitespace_re, ' ', text) 67 | 68 | 69 | def convert_to_ascii(text): 70 | return unidecode(text) 71 | 72 | 73 | def basic_cleaners(text): 74 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 75 | text = lowercase(text) 76 | text = collapse_whitespace(text) 77 | return text 78 | 79 | 80 | def transliteration_cleaners(text): 81 | '''Pipeline for non-English text that transliterates to ASCII.''' 82 | text = convert_to_ascii(text) 83 | text = lowercase(text) 84 | text = collapse_whitespace(text) 85 | return text 86 | 87 | 88 | def english_cleaners(text): 89 | '''Pipeline for English text, including abbreviation expansion.''' 90 | text = convert_to_ascii(text) 91 | text = lowercase(text) 92 | text = expand_abbreviations(text) 93 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) 94 | phonemes = collapse_whitespace(phonemes) 95 | return phonemes 96 | 97 | 98 | def english_cleaners2(text): 99 | '''Pipeline for English text, including abbreviation expansion. + punctuation + stress''' 100 | text = convert_to_ascii(text) 101 | text = lowercase(text) 102 | text = expand_abbreviations(text) 103 | # phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) 104 | phonemes = _phonemize(espeak_backend, text, separator=default_separator, strip=True, njobs=1, prepend_text=False, preserve_empty_lines=False) 105 | phonemes = collapse_whitespace(phonemes) 106 | return phonemes 107 | -------------------------------------------------------------------------------- /models/vits/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 
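The set is the concatenation of a pad token, punctuation, ASCII letters, and the IPA phone characters defined below; it matches the phonemized output of the text cleaners.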
5 | ''' 6 | _pad = '_' 7 | _punctuation = ';:,.!?¡¿—…"«»“” ' 8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | 11 | 12 | # Export all symbols: 13 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 14 | 15 | # Special symbol ids 16 | SPACE_ID = symbols.index(" ") 17 | -------------------------------------------------------------------------------- /models/vits/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import itertools 5 | import math 6 | import torch 7 | from torch import nn, optim 8 | from torch.nn import functional as F 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.nn.parallel import DistributedDataParallel as DDP 14 | from torch.cuda.amp import autocast, GradScaler 15 | 16 | import commons 17 | import utils 18 | from data_utils import ( 19 | TextAudioLoader, 20 | TextAudioCollate, 21 | DistributedBucketSampler 22 | ) 23 | from models import ( 24 | SynthesizerTrn, 25 | MultiPeriodDiscriminator, 26 | ) 27 | from losses import ( 28 | generator_loss, 29 | discriminator_loss, 30 | feature_loss, 31 | kl_loss 32 | ) 33 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch 34 | from text.symbols import symbols 35 | 36 | 37 | torch.backends.cudnn.benchmark = True 38 | global_step = 0 39 | 40 | 41 | def main(): 42 | """Assume Single Node Multi GPUs Training Only""" 43 | assert torch.cuda.is_available(), "CPU training is not allowed." 
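    # Single-node multi-GPU bootstrap: mp.spawn() below starts one process per
    # visible GPU, and dist.init_process_group(init_method='env://') in run()
    # performs rendezvous via the MASTER_ADDR / MASTER_PORT variables set here.
    # Valid TCP ports are <= 65535, so the upstream value '80000' may need to
    # be lowered (e.g. to '8000') if process-group initialization fails to bind.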
44 | 45 | n_gpus = torch.cuda.device_count() 46 | os.environ['MASTER_ADDR'] = 'localhost' 47 | os.environ['MASTER_PORT'] = '80000' 48 | 49 | hps = utils.get_hparams() 50 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 51 | 52 | 53 | def run(rank, n_gpus, hps): 54 | global global_step 55 | if rank == 0: 56 | logger = utils.get_logger(hps.model_dir) 57 | logger.info(hps) 58 | utils.check_git_hash(hps.model_dir) 59 | writer = SummaryWriter(log_dir=hps.model_dir) 60 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 61 | 62 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 63 | torch.manual_seed(hps.train.seed) 64 | torch.cuda.set_device(rank) 65 | 66 | train_dataset = TextAudioLoader(hps.data.training_files, hps.data) 67 | train_sampler = DistributedBucketSampler( 68 | train_dataset, 69 | hps.train.batch_size, 70 | [32,300,400,500,600,700,800,900,1000], 71 | num_replicas=n_gpus, 72 | rank=rank, 73 | shuffle=True) 74 | collate_fn = TextAudioCollate() 75 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, 76 | collate_fn=collate_fn, batch_sampler=train_sampler) 77 | if rank == 0: 78 | eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data) 79 | eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False, 80 | batch_size=hps.train.batch_size, pin_memory=True, 81 | drop_last=False, collate_fn=collate_fn) 82 | 83 | net_g = SynthesizerTrn( 84 | len(symbols), 85 | hps.data.filter_length // 2 + 1, 86 | hps.train.segment_size // hps.data.hop_length, 87 | **hps.model).cuda(rank) 88 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) 89 | optim_g = torch.optim.AdamW( 90 | net_g.parameters(), 91 | hps.train.learning_rate, 92 | betas=hps.train.betas, 93 | eps=hps.train.eps) 94 | optim_d = torch.optim.AdamW( 95 | net_d.parameters(), 96 | hps.train.learning_rate, 97 | betas=hps.train.betas, 98 | eps=hps.train.eps) 99 | net_g = DDP(net_g, device_ids=[rank]) 100 | net_d = DDP(net_d, device_ids=[rank]) 101 | 102 | try: 103 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) 104 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) 105 | global_step = (epoch_str - 1) * len(train_loader) 106 | except: 107 | epoch_str = 1 108 | global_step = 0 109 | 110 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 111 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 112 | 113 | scaler = GradScaler(enabled=hps.train.fp16_run) 114 | 115 | for epoch in range(epoch_str, hps.train.epochs + 1): 116 | if rank==0: 117 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) 118 | else: 119 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) 120 | scheduler_g.step() 121 | scheduler_d.step() 122 | 123 | 124 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): 125 | net_g, net_d = nets 126 | optim_g, optim_d = optims 127 | scheduler_g, scheduler_d = schedulers 128 | train_loader, eval_loader = loaders 129 | if writers is not None: 130 | writer, writer_eval = writers 131 | 132 | 
train_loader.batch_sampler.set_epoch(epoch) 133 | global global_step 134 | 135 | net_g.train() 136 | net_d.train() 137 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(train_loader): 138 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) 139 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) 140 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) 141 | 142 | with autocast(enabled=hps.train.fp16_run): 143 | y_hat, l_length, attn, ids_slice, x_mask, z_mask,\ 144 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths) 145 | 146 | mel = spec_to_mel_torch( 147 | spec, 148 | hps.data.filter_length, 149 | hps.data.n_mel_channels, 150 | hps.data.sampling_rate, 151 | hps.data.mel_fmin, 152 | hps.data.mel_fmax) 153 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) 154 | y_hat_mel = mel_spectrogram_torch( 155 | y_hat.squeeze(1), 156 | hps.data.filter_length, 157 | hps.data.n_mel_channels, 158 | hps.data.sampling_rate, 159 | hps.data.hop_length, 160 | hps.data.win_length, 161 | hps.data.mel_fmin, 162 | hps.data.mel_fmax 163 | ) 164 | 165 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice 166 | 167 | # Discriminator 168 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) 169 | with autocast(enabled=False): 170 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 171 | loss_disc_all = loss_disc 172 | optim_d.zero_grad() 173 | scaler.scale(loss_disc_all).backward() 174 | scaler.unscale_(optim_d) 175 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) 176 | scaler.step(optim_d) 177 | 178 | with autocast(enabled=hps.train.fp16_run): 179 | # Generator 180 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) 181 | with autocast(enabled=False): 182 | loss_dur = torch.sum(l_length.float()) 183 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel 184 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl 185 | 186 | loss_fm = feature_loss(fmap_r, fmap_g) 187 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 188 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl 189 | optim_g.zero_grad() 190 | scaler.scale(loss_gen_all).backward() 191 | scaler.unscale_(optim_g) 192 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 193 | scaler.step(optim_g) 194 | scaler.update() 195 | 196 | if rank==0: 197 | if global_step % hps.train.log_interval == 0: 198 | lr = optim_g.param_groups[0]['lr'] 199 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] 200 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 201 | epoch, 202 | 100. 
* batch_idx / len(train_loader))) 203 | logger.info([x.item() for x in losses] + [global_step, lr]) 204 | 205 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} 206 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) 207 | 208 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) 209 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) 210 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) 211 | image_dict = { 212 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), 213 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), 214 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 215 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) 216 | } 217 | utils.summarize( 218 | writer=writer, 219 | global_step=global_step, 220 | images=image_dict, 221 | scalars=scalar_dict) 222 | 223 | if global_step % hps.train.eval_interval == 0: 224 | evaluate(hps, net_g, eval_loader, writer_eval) 225 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) 226 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) 227 | global_step += 1 228 | 229 | if rank == 0: 230 | logger.info('====> Epoch: {}'.format(epoch)) 231 | 232 | 233 | def evaluate(hps, generator, eval_loader, writer_eval): 234 | generator.eval() 235 | with torch.no_grad(): 236 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(eval_loader): 237 | x, x_lengths = x.cuda(0), x_lengths.cuda(0) 238 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) 239 | y, y_lengths = y.cuda(0), y_lengths.cuda(0) 240 | 241 | # remove else 242 | x = x[:1] 243 | x_lengths = x_lengths[:1] 244 | spec = spec[:1] 245 | spec_lengths = spec_lengths[:1] 246 | y = y[:1] 247 | y_lengths = y_lengths[:1] 248 | break 249 | y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, max_len=1000) 250 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length 251 | 252 | mel = spec_to_mel_torch( 253 | spec, 254 | hps.data.filter_length, 255 | hps.data.n_mel_channels, 256 | hps.data.sampling_rate, 257 | hps.data.mel_fmin, 258 | hps.data.mel_fmax) 259 | y_hat_mel = mel_spectrogram_torch( 260 | y_hat.squeeze(1).float(), 261 | hps.data.filter_length, 262 | hps.data.n_mel_channels, 263 | hps.data.sampling_rate, 264 | hps.data.hop_length, 265 | hps.data.win_length, 266 | hps.data.mel_fmin, 267 | hps.data.mel_fmax 268 | ) 269 | image_dict = { 270 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) 271 | } 272 | audio_dict = { 273 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]] 274 | } 275 | if global_step == 0: 276 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 277 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]}) 278 | 279 | utils.summarize( 280 | writer=writer_eval, 281 | global_step=global_step, 282 | images=image_dict, 283 | audios=audio_dict, 284 | audio_sampling_rate=hps.data.sampling_rate 285 | ) 286 | generator.train() 287 | 288 | 289 | if __name__ == "__main__": 290 | main() 291 | -------------------------------------------------------------------------------- 
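For reference, a typical launch of the script above goes through the argparse interface in models/vits/utils.py (get_hparams): -c points at one of the JSON files under models/vits/configs, and -m names the run directory created under ./logs. A representative invocation, assuming the LJ Speech config (the run name itself is arbitrary):

    python train.py -c configs/ljs_base.json -m ljs_base

train_ms.py, which follows, is the multi-speaker variant of the same training loop; the substantive differences are the speaker-aware TextAudioSpeakerLoader and TextAudioSpeakerCollate classes, the n_speakers argument passed to SynthesizerTrn, and the speaker-ID tensors threaded through the generator and evaluation calls.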
/models/vits/train_ms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import itertools 5 | import math 6 | import torch 7 | from torch import nn, optim 8 | from torch.nn import functional as F 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.nn.parallel import DistributedDataParallel as DDP 14 | from torch.cuda.amp import autocast, GradScaler 15 | 16 | import commons 17 | import utils 18 | from data_utils import ( 19 | TextAudioSpeakerLoader, 20 | TextAudioSpeakerCollate, 21 | DistributedBucketSampler 22 | ) 23 | from models import ( 24 | SynthesizerTrn, 25 | MultiPeriodDiscriminator, 26 | ) 27 | from losses import ( 28 | generator_loss, 29 | discriminator_loss, 30 | feature_loss, 31 | kl_loss 32 | ) 33 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch 34 | from text.symbols import symbols 35 | 36 | 37 | torch.backends.cudnn.benchmark = True 38 | global_step = 0 39 | 40 | 41 | def main(): 42 | """Assume Single Node Multi GPUs Training Only""" 43 | assert torch.cuda.is_available(), "CPU training is not allowed." 44 | 45 | n_gpus = torch.cuda.device_count() 46 | os.environ['MASTER_ADDR'] = 'localhost' 47 | os.environ['MASTER_PORT'] = '80000' 48 | 49 | hps = utils.get_hparams() 50 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 51 | 52 | 53 | def run(rank, n_gpus, hps): 54 | global global_step 55 | if rank == 0: 56 | logger = utils.get_logger(hps.model_dir) 57 | logger.info(hps) 58 | utils.check_git_hash(hps.model_dir) 59 | writer = SummaryWriter(log_dir=hps.model_dir) 60 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 61 | 62 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 63 | torch.manual_seed(hps.train.seed) 64 | torch.cuda.set_device(rank) 65 | 66 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 67 | train_sampler = DistributedBucketSampler( 68 | train_dataset, 69 | hps.train.batch_size, 70 | [32,300,400,500,600,700,800,900,1000], 71 | num_replicas=n_gpus, 72 | rank=rank, 73 | shuffle=True) 74 | collate_fn = TextAudioSpeakerCollate() 75 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, 76 | collate_fn=collate_fn, batch_sampler=train_sampler) 77 | if rank == 0: 78 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) 79 | eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False, 80 | batch_size=hps.train.batch_size, pin_memory=True, 81 | drop_last=False, collate_fn=collate_fn) 82 | 83 | net_g = SynthesizerTrn( 84 | len(symbols), 85 | hps.data.filter_length // 2 + 1, 86 | hps.train.segment_size // hps.data.hop_length, 87 | n_speakers=hps.data.n_speakers, 88 | **hps.model).cuda(rank) 89 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) 90 | optim_g = torch.optim.AdamW( 91 | net_g.parameters(), 92 | hps.train.learning_rate, 93 | betas=hps.train.betas, 94 | eps=hps.train.eps) 95 | optim_d = torch.optim.AdamW( 96 | net_d.parameters(), 97 | hps.train.learning_rate, 98 | betas=hps.train.betas, 99 | eps=hps.train.eps) 100 | net_g = DDP(net_g, device_ids=[rank]) 101 | net_d = DDP(net_d, device_ids=[rank]) 102 | 103 | try: 104 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) 105 | 
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) 106 | global_step = (epoch_str - 1) * len(train_loader) 107 | except: 108 | epoch_str = 1 109 | global_step = 0 110 | 111 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 112 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 113 | 114 | scaler = GradScaler(enabled=hps.train.fp16_run) 115 | 116 | for epoch in range(epoch_str, hps.train.epochs + 1): 117 | if rank==0: 118 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) 119 | else: 120 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) 121 | scheduler_g.step() 122 | scheduler_d.step() 123 | 124 | 125 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): 126 | net_g, net_d = nets 127 | optim_g, optim_d = optims 128 | scheduler_g, scheduler_d = schedulers 129 | train_loader, eval_loader = loaders 130 | if writers is not None: 131 | writer, writer_eval = writers 132 | 133 | train_loader.batch_sampler.set_epoch(epoch) 134 | global global_step 135 | 136 | net_g.train() 137 | net_d.train() 138 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(train_loader): 139 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) 140 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) 141 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) 142 | speakers = speakers.cuda(rank, non_blocking=True) 143 | 144 | with autocast(enabled=hps.train.fp16_run): 145 | y_hat, l_length, attn, ids_slice, x_mask, z_mask,\ 146 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers) 147 | 148 | mel = spec_to_mel_torch( 149 | spec, 150 | hps.data.filter_length, 151 | hps.data.n_mel_channels, 152 | hps.data.sampling_rate, 153 | hps.data.mel_fmin, 154 | hps.data.mel_fmax) 155 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) 156 | y_hat_mel = mel_spectrogram_torch( 157 | y_hat.squeeze(1), 158 | hps.data.filter_length, 159 | hps.data.n_mel_channels, 160 | hps.data.sampling_rate, 161 | hps.data.hop_length, 162 | hps.data.win_length, 163 | hps.data.mel_fmin, 164 | hps.data.mel_fmax 165 | ) 166 | 167 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice 168 | 169 | # Discriminator 170 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) 171 | with autocast(enabled=False): 172 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 173 | loss_disc_all = loss_disc 174 | optim_d.zero_grad() 175 | scaler.scale(loss_disc_all).backward() 176 | scaler.unscale_(optim_d) 177 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) 178 | scaler.step(optim_d) 179 | 180 | with autocast(enabled=hps.train.fp16_run): 181 | # Generator 182 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) 183 | with autocast(enabled=False): 184 | loss_dur = torch.sum(l_length.float()) 185 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel 186 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, 
z_mask) * hps.train.c_kl 187 | 188 | loss_fm = feature_loss(fmap_r, fmap_g) 189 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 190 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl 191 | optim_g.zero_grad() 192 | scaler.scale(loss_gen_all).backward() 193 | scaler.unscale_(optim_g) 194 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 195 | scaler.step(optim_g) 196 | scaler.update() 197 | 198 | if rank==0: 199 | if global_step % hps.train.log_interval == 0: 200 | lr = optim_g.param_groups[0]['lr'] 201 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] 202 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 203 | epoch, 204 | 100. * batch_idx / len(train_loader))) 205 | logger.info([x.item() for x in losses] + [global_step, lr]) 206 | 207 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} 208 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) 209 | 210 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) 211 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) 212 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) 213 | image_dict = { 214 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), 215 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), 216 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 217 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) 218 | } 219 | utils.summarize( 220 | writer=writer, 221 | global_step=global_step, 222 | images=image_dict, 223 | scalars=scalar_dict) 224 | 225 | if global_step % hps.train.eval_interval == 0: 226 | evaluate(hps, net_g, eval_loader, writer_eval) 227 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) 228 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) 229 | global_step += 1 230 | 231 | if rank == 0: 232 | logger.info('====> Epoch: {}'.format(epoch)) 233 | 234 | 235 | def evaluate(hps, generator, eval_loader, writer_eval): 236 | generator.eval() 237 | with torch.no_grad(): 238 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): 239 | x, x_lengths = x.cuda(0), x_lengths.cuda(0) 240 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) 241 | y, y_lengths = y.cuda(0), y_lengths.cuda(0) 242 | speakers = speakers.cuda(0) 243 | 244 | # remove else 245 | x = x[:1] 246 | x_lengths = x_lengths[:1] 247 | spec = spec[:1] 248 | spec_lengths = spec_lengths[:1] 249 | y = y[:1] 250 | y_lengths = y_lengths[:1] 251 | speakers = speakers[:1] 252 | break 253 | y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) 254 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length 255 | 256 | mel = spec_to_mel_torch( 257 | spec, 258 | hps.data.filter_length, 259 | hps.data.n_mel_channels, 260 | hps.data.sampling_rate, 261 | hps.data.mel_fmin, 262 | hps.data.mel_fmax) 263 | y_hat_mel = mel_spectrogram_torch( 264 | y_hat.squeeze(1).float(), 265 | hps.data.filter_length, 266 | hps.data.n_mel_channels, 267 | hps.data.sampling_rate, 268 | hps.data.hop_length, 269 | hps.data.win_length, 270 | 
hps.data.mel_fmin, 271 | hps.data.mel_fmax 272 | ) 273 | image_dict = { 274 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) 275 | } 276 | audio_dict = { 277 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]] 278 | } 279 | if global_step == 0: 280 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 281 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]}) 282 | 283 | utils.summarize( 284 | writer=writer_eval, 285 | global_step=global_step, 286 | images=image_dict, 287 | audios=audio_dict, 288 | audio_sampling_rate=hps.data.sampling_rate 289 | ) 290 | generator.train() 291 | 292 | 293 | if __name__ == "__main__": 294 | main() 295 | -------------------------------------------------------------------------------- /models/vits/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 
| unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = 
input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /models/vits/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | import argparse 5 | import logging 6 | import json 7 | import subprocess 8 | import numpy as np 9 | from scipy.io.wavfile import read 10 | import torch 11 | 12 | MATPLOTLIB_FLAG = False 13 | 14 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 15 | logger = logging 16 | 17 | 18 | def load_checkpoint(checkpoint_path, model, optimizer=None): 19 | assert os.path.isfile(checkpoint_path) 20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 21 | iteration = checkpoint_dict['iteration'] 22 | learning_rate = checkpoint_dict['learning_rate'] 23 | if optimizer is not None: 24 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 25 | saved_state_dict = checkpoint_dict['model'] 26 | if hasattr(model, 'module'): 27 | state_dict = model.module.state_dict() 28 | else: 29 | state_dict = model.state_dict() 30 | new_state_dict= {} 31 | for k, v in state_dict.items(): 32 | try: 33 | new_state_dict[k] = saved_state_dict[k] 34 | except: 35 | logger.info("%s is not in the checkpoint" % k) 36 | new_state_dict[k] = v 37 | if hasattr(model, 'module'): 38 | model.module.load_state_dict(new_state_dict) 39 | else: 40 | model.load_state_dict(new_state_dict) 41 | logger.info("Loaded checkpoint '{}' (iteration {})" .format( 42 | checkpoint_path, iteration)) 43 | return model, optimizer, learning_rate, iteration 44 | 45 | 46 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): 47 | logger.info("Saving model and optimizer state at iteration {} to {}".format( 48 | iteration, checkpoint_path)) 49 | if hasattr(model, 'module'): 50 | state_dict = model.module.state_dict() 51 | else: 52 | state_dict = model.state_dict() 53 | torch.save({'model': state_dict, 54 | 'iteration': iteration, 55 | 'optimizer': optimizer.state_dict(), 56 | 'learning_rate': learning_rate}, checkpoint_path) 57 | 58 | 59 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): 60 | for k, v in scalars.items(): 61 | writer.add_scalar(k, v, global_step) 62 | for k, v in histograms.items(): 63 | writer.add_histogram(k, v, global_step) 64 | for k, v in images.items(): 65 | writer.add_image(k, v, global_step, dataformats='HWC') 66 | for k, v in audios.items(): 67 | writer.add_audio(k, v, global_step, 
audio_sampling_rate) 68 | 69 | 70 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 71 | f_list = glob.glob(os.path.join(dir_path, regex)) 72 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 73 | x = f_list[-1] 74 | print(x) 75 | return x 76 | 77 | 78 | def plot_spectrogram_to_numpy(spectrogram): 79 | global MATPLOTLIB_FLAG 80 | if not MATPLOTLIB_FLAG: 81 | import matplotlib 82 | matplotlib.use("Agg") 83 | MATPLOTLIB_FLAG = True 84 | mpl_logger = logging.getLogger('matplotlib') 85 | mpl_logger.setLevel(logging.WARNING) 86 | import matplotlib.pylab as plt 87 | import numpy as np 88 | 89 | fig, ax = plt.subplots(figsize=(10,2)) 90 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 91 | interpolation='none') 92 | plt.colorbar(im, ax=ax) 93 | plt.xlabel("Frames") 94 | plt.ylabel("Channels") 95 | plt.tight_layout() 96 | 97 | fig.canvas.draw() 98 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 99 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 100 | plt.close() 101 | return data 102 | 103 | 104 | def plot_alignment_to_numpy(alignment, info=None): 105 | global MATPLOTLIB_FLAG 106 | if not MATPLOTLIB_FLAG: 107 | import matplotlib 108 | matplotlib.use("Agg") 109 | MATPLOTLIB_FLAG = True 110 | mpl_logger = logging.getLogger('matplotlib') 111 | mpl_logger.setLevel(logging.WARNING) 112 | import matplotlib.pylab as plt 113 | import numpy as np 114 | 115 | fig, ax = plt.subplots(figsize=(6, 4)) 116 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', 117 | interpolation='none') 118 | fig.colorbar(im, ax=ax) 119 | xlabel = 'Decoder timestep' 120 | if info is not None: 121 | xlabel += '\n\n' + info 122 | plt.xlabel(xlabel) 123 | plt.ylabel('Encoder timestep') 124 | plt.tight_layout() 125 | 126 | fig.canvas.draw() 127 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 128 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 129 | plt.close() 130 | return data 131 | 132 | 133 | def load_wav_to_torch(full_path): 134 | sampling_rate, data = read(full_path) 135 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 136 | 137 | 138 | def load_filepaths_and_text(filename, split="|"): 139 | with open(filename, encoding='utf-8') as f: 140 | filepaths_and_text = [line.strip().split(split) for line in f] 141 | return filepaths_and_text 142 | 143 | 144 | def get_hparams(init=True): 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json", 147 | help='JSON file for configuration') 148 | parser.add_argument('-m', '--model', type=str, required=True, 149 | help='Model name') 150 | 151 | args = parser.parse_args() 152 | model_dir = os.path.join("./logs", args.model) 153 | 154 | if not os.path.exists(model_dir): 155 | os.makedirs(model_dir) 156 | 157 | config_path = args.config 158 | config_save_path = os.path.join(model_dir, "config.json") 159 | if init: 160 | with open(config_path, "r") as f: 161 | data = f.read() 162 | with open(config_save_path, "w") as f: 163 | f.write(data) 164 | else: 165 | with open(config_save_path, "r") as f: 166 | data = f.read() 167 | config = json.loads(data) 168 | 169 | hparams = HParams(**config) 170 | hparams.model_dir = model_dir 171 | return hparams 172 | 173 | 174 | def get_hparams_from_dir(model_dir): 175 | config_save_path = os.path.join(model_dir, "config.json") 176 | with open(config_save_path, "r") as f: 177 | data = f.read() 178 | config = json.loads(data) 179 | 180 | 
hparams = HParams(**config) 181 | hparams.model_dir = model_dir 182 | return hparams 183 | 184 | 185 | def get_hparams_from_file(config_path): 186 | with open(config_path, "r") as f: 187 | data = f.read() 188 | config = json.loads(data) 189 | 190 | hparams = HParams(**config) 191 | return hparams 192 | 193 | 194 | def check_git_hash(model_dir): 195 | source_dir = os.path.dirname(os.path.realpath(__file__)) 196 | if not os.path.exists(os.path.join(source_dir, ".git")): 197 | logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format( 198 | source_dir 199 | )) 200 | return 201 | 202 | cur_hash = subprocess.getoutput("git rev-parse HEAD") 203 | 204 | path = os.path.join(model_dir, "githash") 205 | if os.path.exists(path): 206 | saved_hash = open(path).read() 207 | if saved_hash != cur_hash: 208 | logger.warning("git hash values are different. {}(saved) != {}(current)".format( 209 | saved_hash[:8], cur_hash[:8])) 210 | else: 211 | open(path, "w").write(cur_hash) 212 | 213 | 214 | def get_logger(model_dir, filename="train.log"): 215 | global logger 216 | logger = logging.getLogger(os.path.basename(model_dir)) 217 | logger.setLevel(logging.DEBUG) 218 | 219 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 220 | if not os.path.exists(model_dir): 221 | os.makedirs(model_dir) 222 | h = logging.FileHandler(os.path.join(model_dir, filename)) 223 | h.setLevel(logging.DEBUG) 224 | h.setFormatter(formatter) 225 | logger.addHandler(h) 226 | return logger 227 | 228 | 229 | class HParams(): 230 | def __init__(self, **kwargs): 231 | for k, v in kwargs.items(): 232 | if type(v) == dict: 233 | v = HParams(**v) 234 | self[k] = v 235 | 236 | def keys(self): 237 | return self.__dict__.keys() 238 | 239 | def items(self): 240 | return self.__dict__.items() 241 | 242 | def values(self): 243 | return self.__dict__.values() 244 | 245 | def __len__(self): 246 | return len(self.__dict__) 247 | 248 | def __getitem__(self, key): 249 | return getattr(self, key) 250 | 251 | def __setitem__(self, key, value): 252 | return setattr(self, key, value) 253 | 254 | def __contains__(self, key): 255 | return key in self.__dict__ 256 | 257 | def __repr__(self): 258 | return self.__dict__.__repr__() 259 | -------------------------------------------------------------------------------- /models/waveglow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM bentoml/model-server:0.12.1-py37 2 | 3 | # Configure PIP install arguments, e.g. --index-url, --trusted-url, --extra-index-url 4 | ARG EXTRA_PIP_INSTALL_ARGS= 5 | ENV EXTRA_PIP_INSTALL_ARGS $EXTRA_PIP_INSTALL_ARGS 6 | 7 | ARG UID=1034 8 | ARG GID=1034 9 | RUN groupadd -g $GID -o bentoml && useradd -m -u $UID -g $GID -o -r bentoml 10 | 11 | ARG BUNDLE_PATH=/home/bentoml/bundle 12 | ENV BUNDLE_PATH=$BUNDLE_PATH 13 | ENV BENTOML_HOME=/home/bentoml/ 14 | 15 | RUN mkdir $BUNDLE_PATH && chown bentoml:bentoml $BUNDLE_PATH -R 16 | RUN mkdir /home/bentoml/logs && chown bentoml:bentoml /home/bentoml/logs -R 17 | RUN mkdir /home/bentoml/prometheus_multiproc_dir && chown bentoml:bentoml /home/bentoml/prometheus_multiproc_dir -R 18 | WORKDIR $BUNDLE_PATH 19 | 20 | # copy over the init script; copy over entrypoint scripts 21 | COPY --chown=bentoml:bentoml bentoml-init.sh docker-entrypoint.sh ./ 22 | RUN chmod +x ./bentoml-init.sh 23 | 24 | # Copy docker-entrypoint.sh again, because setup.sh might not exist. This prevents the COPY command from failing.
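# The bracket globs below (setup.s[h], python_versio[n], bundled_pip_dependencie[s]) match the file when it exists and expand to nothing when it does not; pairing each glob with docker-entrypoint.sh guarantees at least one COPY source, which keeps the COPY from erroring out on a missing optional file.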
25 | COPY --chown=bentoml:bentoml docker-entrypoint.sh setup.s[h] ./ 26 | RUN ./bentoml-init.sh custom_setup 27 | 28 | COPY --chown=bentoml:bentoml docker-entrypoint.sh python_versio[n] ./ 29 | RUN ./bentoml-init.sh ensure_python 30 | 31 | COPY --chown=bentoml:bentoml environment.yml ./ 32 | RUN ./bentoml-init.sh restore_conda_env 33 | 34 | COPY --chown=bentoml:bentoml requirements.txt ./ 35 | RUN ./bentoml-init.sh install_pip_packages 36 | 37 | COPY --chown=bentoml:bentoml docker-entrypoint.sh bundled_pip_dependencie[s] ./bundled_pip_dependencies/ 38 | RUN rm ./bundled_pip_dependencies/docker-entrypoint.sh && ./bentoml-init.sh install_bundled_pip_packages 39 | 40 | # copy over model files 41 | COPY --chown=bentoml:bentoml . ./ 42 | 43 | # the env var $PORT is required by the Heroku container runtime 44 | ENV PORT 5000 45 | EXPOSE $PORT 46 | 47 | USER bentoml 48 | RUN chmod +x ./docker-entrypoint.sh 49 | ENTRYPOINT [ "./docker-entrypoint.sh" ] 50 | CMD ["bentoml", "serve-gunicorn", "./"] 51 | -------------------------------------------------------------------------------- /models/waveglow/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include TextToSpeechModel/bentoml.yml 2 | graft TextToSpeechModel/artifacts 3 | -------------------------------------------------------------------------------- /models/waveglow/README.md: -------------------------------------------------------------------------------- 1 | # Generated BentoService bundle - TextToSpeechModel:20210531095723_F76C2A 2 | 3 | This is an ML Service bundle created with BentoML; it is not recommended to edit 4 | code or files contained in this directory. Instead, edit the code that uses BentoML 5 | to create this bundle, and save a new BentoService bundle.
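Once its dependencies are installed, the bundled service can be run and queried locally. The serve command and port below are taken from the Dockerfile in this directory (CMD and ENV PORT 5000), and the /predict route with JSON input from TextToSpeechModel/bentoml.yml; the exact payload shape is defined by text_to_speech.py, so the "text" key used here is an illustrative assumption:

    bentoml serve-gunicorn ./
    curl -X POST http://localhost:5000/predict -H 'Content-Type: application/json' -d '{"text": "Hello world"}'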
6 | 7 | A model that converts text into spoken speech -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | from bentoml import saved_bundle, configure_logging 6 | from bentoml.cli.bento_service import create_bento_service_cli 7 | 8 | # By default, ignore warnings when loading BentoService installed as PyPI distribution 9 | # CLI will change back to default log level in config(info), and by adding --quiet or 10 | # --verbose CLI option, user can change the CLI output behavior 11 | configure_logging(logging.ERROR) 12 | 13 | __VERSION__ = "20210531095723_F76C2A" 14 | 15 | __module_path = os.path.abspath(os.path.dirname(__file__)) 16 | 17 | TextToSpeechModel = saved_bundle.load_bento_service_class(__module_path) 18 | 19 | cli=create_bento_service_cli(__module_path) 20 | 21 | 22 | def load(): 23 | return saved_bundle.load_from_dir(__module_path) 24 | 25 | 26 | __all__ = ['__version__', 'TextToSpeechModel', 'load'] 27 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/artifacts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/waveglow/TextToSpeechModel/artifacts/__init__.py -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/artifacts/cmudict_dictionary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/waveglow/TextToSpeechModel/artifacts/cmudict_dictionary -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/artifacts/heteronyms: -------------------------------------------------------------------------------- 1 | abject 2 | abrogate 3 | absent 4 | abstract 5 | abuse 6 | ache 7 | Acre 8 | acuminate 9 | addict 10 | address 11 | adduct 12 | Adele 13 | advocate 14 | affect 15 | affiliate 16 | agape 17 | aged 18 | agglomerate 19 | aggregate 20 | agonic 21 | agora 22 | allied 23 | ally 24 | alternate 25 | alum 26 | am 27 | analyses 28 | Andrea 29 | animate 30 | apply 31 | appropriate 32 | approximate 33 | ares 34 | arithmetic 35 | arsenic 36 | articulate 37 | associate 38 | attribute 39 | august 40 | axes 41 | ay 42 | aye 43 | bases 44 | bass 45 | bathed 46 | bested 47 | bifurcate 48 | blessed 49 | blotto 50 | bow 51 | bowed 52 | bowman 53 | brassy 54 | buffet 55 | bustier 56 | carbonate 57 | Celtic 58 | choral 59 | Chumash 60 | close 61 | closer 62 | coax 63 | coincidence 64 | color coordinate 65 | colour coordinate 66 | comber 67 | combine 68 | combs 69 | committee 70 | commune 71 | compact 72 | complex 73 | compound 74 | compress 75 | concert 76 | conduct 77 | confine 78 | confines 79 | conflict 80 | conglomerate 81 | conscript 82 | conserve 83 | consist 84 | console 85 | consort 86 | construct 87 | consult 88 | consummate 89 | content 90 | contest 91 | contract 92 | contracts 93 | contrast 94 | converse 95 | convert 96 | convict 97 | coop 98 | coordinate 99 | covey 100 | crooked 101 | curate 102 | cussed 103 | decollate 104 | decrease 105 | defect 106 | defense 107 | 
delegate 108 | deliberate 109 | denier 110 | desert 111 | detail 112 | deviate 113 | diagnoses 114 | diffuse 115 | digest 116 | discard 117 | discharge 118 | discount 119 | do 120 | document 121 | does 122 | dogged 123 | domesticate 124 | Dominican 125 | dove 126 | dr 127 | drawer 128 | duplicate 129 | egress 130 | ejaculate 131 | eject 132 | elaborate 133 | ellipses 134 | email 135 | emu 136 | entrace 137 | entrance 138 | escort 139 | estimate 140 | eta 141 | Etna 142 | evening 143 | excise 144 | excuse 145 | exploit 146 | export 147 | extract 148 | fine 149 | flower 150 | forbear 151 | four-legged 152 | frequent 153 | furrier 154 | gallant 155 | gel 156 | geminate 157 | gillie 158 | glower 159 | Gotham 160 | graduate 161 | haggis 162 | heavy 163 | hinder 164 | house 165 | housewife 166 | impact 167 | imped 168 | implant 169 | implement 170 | import 171 | impress 172 | incense 173 | incline 174 | increase 175 | infix 176 | insert 177 | instar 178 | insult 179 | integral 180 | intercept 181 | interchange 182 | interflow 183 | interleaf 184 | intermediate 185 | intern 186 | interspace 187 | intimate 188 | intrigue 189 | invalid 190 | invert 191 | invite 192 | irony 193 | jagged 194 | Jesses 195 | Julies 196 | kite 197 | laminate 198 | Laos 199 | lather 200 | lead 201 | learned 202 | leasing 203 | lech 204 | legitimate 205 | lied 206 | lima 207 | lipread 208 | live 209 | lower 210 | lunged 211 | maas 212 | Magdalen 213 | manes 214 | mare 215 | marked 216 | merchandise 217 | merlion 218 | minute 219 | misconduct 220 | misled 221 | misprint 222 | mobile 223 | moderate 224 | mong 225 | moped 226 | moth 227 | mouth 228 | mow 229 | mpg 230 | multiply 231 | mush 232 | nana 233 | nice 234 | Nice 235 | number 236 | numerate 237 | nun 238 | object 239 | opiate 240 | ornament 241 | outbox 242 | outcry 243 | outpour 244 | outreach 245 | outride 246 | outright 247 | outside 248 | outwork 249 | overall 250 | overbid 251 | overcall 252 | overcast 253 | overfall 254 | overflow 255 | overhaul 256 | overhead 257 | overlap 258 | overlay 259 | overuse 260 | overweight 261 | overwork 262 | pace 263 | palled 264 | palling 265 | para 266 | pasty 267 | pate 268 | Pauline 269 | pedal 270 | peer 271 | perfect 272 | periodic 273 | permit 274 | pervert 275 | pinta 276 | placer 277 | platy 278 | polish 279 | Polish 280 | poll 281 | pontificate 282 | postulate 283 | pram 284 | prayer 285 | precipitate 286 | predate 287 | predicate 288 | prefix 289 | preposition 290 | present 291 | pretest 292 | primer 293 | proceeds 294 | produce 295 | progress 296 | project 297 | proportionate 298 | prospect 299 | protest 300 | pussy 301 | putter 302 | putting 303 | quite 304 | ragged 305 | raven 306 | re 307 | read 308 | reading 309 | Reading 310 | real 311 | rebel 312 | recall 313 | recap 314 | recitative 315 | recollect 316 | record 317 | recreate 318 | recreation 319 | redress 320 | refill 321 | refund 322 | refuse 323 | reject 324 | relay 325 | remake 326 | repaint 327 | reprint 328 | reread 329 | rerun 330 | resent 331 | reside 332 | resign 333 | respray 334 | resume 335 | retard 336 | retest 337 | retread 338 | rewrite 339 | root 340 | routed 341 | routing 342 | row 343 | rugged 344 | rummy 345 | sais 346 | sake 347 | sambuca 348 | saucier 349 | second 350 | secrete 351 | secreted 352 | secreting 353 | segment 354 | separate 355 | sewer 356 | shirk 357 | shower 358 | sin 359 | skied 360 | slaver 361 | slough 362 | sow 363 | spoof 364 | squid 365 | stingy 366 | subject 367 | subordinate 368 | subvert 369 | supply 370 | supposed 
371 | survey 372 | suspect 373 | syringes 374 | tabulate 375 | tales 376 | tarrier 377 | tarry 378 | taxes 379 | taxis 380 | tear 381 | Theron 382 | thou 383 | three-legged 384 | tier 385 | tinged 386 | torment 387 | transfer 388 | transform 389 | transplant 390 | transport 391 | transpose 392 | tush 393 | two-legged 394 | unionised 395 | unionized 396 | update 397 | uplift 398 | upset 399 | use 400 | used 401 | vale 402 | violist 403 | viva 404 | ware 405 | whinged 406 | whoop 407 | wicked 408 | wind 409 | windy 410 | wino 411 | won 412 | worsted 413 | wound 414 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | from librosa.filters import mel as librosa_mel_fn 5 | import librosa.util as librosa_util 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 
32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | 95 | 96 | class TacotronSTFT(torch.nn.Module): 97 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 98 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 99 | mel_fmax=None): 100 | super(TacotronSTFT, self).__init__() 101 | self.n_mel_channels = n_mel_channels 102 | self.sampling_rate = sampling_rate 103 | self.stft_fn = STFT(filter_length, hop_length, win_length) 104 | mel_basis = librosa_mel_fn( 105 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 106 | mel_basis = torch.from_numpy(mel_basis).float() 107 | self.register_buffer('mel_basis', mel_basis) 108 | 109 | def spectral_normalize(self, magnitudes): 110 | output = dynamic_range_compression(magnitudes) 111 | return output 112 | 113 | def spectral_de_normalize(self, magnitudes): 114 | output = dynamic_range_decompression(magnitudes) 115 | return output 116 | 117 | def mel_spectrogram(self, y): 118 | """Computes mel-spectrograms from a batch of waves 119 | PARAMS 120 | ------ 121 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 122 | 123 | RETURNS 124 | ------- 125 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 126 | """ 127 | assert(torch.min(y.data) >= -1) 128 | assert(torch.max(y.data) <= 1) 129 | 130 | magnitudes, phases = self.stft_fn.transform(y) 131 | magnitudes = magnitudes.data 132 | mel_output = torch.matmul(self.mel_basis, magnitudes) 133 | mel_output = self.spectral_normalize(mel_output) 134 | return mel_output 135 | 136 | """ 137 | BSD 3-Clause License 138 | 139 | Copyright (c) 2017, Prem Seetharaman 140 | All rights reserved. 
141 | 142 | * Redistribution and use in source and binary forms, with or without 143 | modification, are permitted provided that the following conditions are met: 144 | 145 | * Redistributions of source code must retain the above copyright notice, 146 | this list of conditions and the following disclaimer. 147 | 148 | * Redistributions in binary form must reproduce the above copyright notice, this 149 | list of conditions and the following disclaimer in the 150 | documentation and/or other materials provided with the distribution. 151 | 152 | * Neither the name of the copyright holder nor the names of its 153 | contributors may be used to endorse or promote products derived from this 154 | software without specific prior written permission. 155 | 156 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 157 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 158 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 159 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 160 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 161 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 162 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 163 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 164 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 165 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 166 | """ 167 | import torch.nn.functional as F 168 | from torch.autograd import Variable 169 | from scipy.signal import get_window 170 | from librosa.util import pad_center, tiny 171 | 172 | class STFT(torch.nn.Module): 173 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 174 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 175 | window='hann'): 176 | super(STFT, self).__init__() 177 | self.filter_length = filter_length 178 | self.hop_length = hop_length 179 | self.win_length = win_length 180 | self.window = window 181 | self.forward_transform = None 182 | scale = self.filter_length / self.hop_length 183 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 184 | 185 | cutoff = int((self.filter_length / 2 + 1)) 186 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 187 | np.imag(fourier_basis[:cutoff, :])]) 188 | 189 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 190 | inverse_basis = torch.FloatTensor( 191 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 192 | 193 | if window is not None: 194 | assert(win_length >= filter_length) 195 | # get window and zero center pad it to filter_length 196 | fft_window = get_window(window, win_length, fftbins=True) 197 | fft_window = pad_center(fft_window, filter_length) 198 | fft_window = torch.from_numpy(fft_window).float() 199 | 200 | # window the bases 201 | forward_basis *= fft_window 202 | inverse_basis *= fft_window 203 | 204 | self.register_buffer('forward_basis', forward_basis.float()) 205 | self.register_buffer('inverse_basis', inverse_basis.float()) 206 | 207 | def transform(self, input_data): 208 | num_batches = input_data.size(0) 209 | num_samples = input_data.size(1) 210 | 211 | self.num_samples = num_samples 212 | 213 | # similar to librosa, reflect-pad the input 214 | input_data = input_data.view(num_batches, 1, num_samples) 215 | input_data = F.pad( 216 | input_data.unsqueeze(1), 217 | 
(int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 218 | mode='reflect') 219 | input_data = input_data.squeeze(1) 220 | 221 | forward_transform = F.conv1d( 222 | input_data, 223 | Variable(self.forward_basis, requires_grad=False), 224 | stride=self.hop_length, 225 | padding=0) 226 | 227 | cutoff = int((self.filter_length / 2) + 1) 228 | real_part = forward_transform[:, :cutoff, :] 229 | imag_part = forward_transform[:, cutoff:, :] 230 | 231 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 232 | phase = torch.autograd.Variable( 233 | torch.atan2(imag_part.data, real_part.data)) 234 | 235 | return magnitude, phase 236 | 237 | def inverse(self, magnitude, phase): 238 | recombine_magnitude_phase = torch.cat( 239 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 240 | 241 | inverse_transform = F.conv_transpose1d( 242 | recombine_magnitude_phase, 243 | Variable(self.inverse_basis, requires_grad=False), 244 | stride=self.hop_length, 245 | padding=0) 246 | 247 | if self.window is not None: 248 | window_sum = window_sumsquare( 249 | self.window, magnitude.size(-1), hop_length=self.hop_length, 250 | win_length=self.win_length, n_fft=self.filter_length, 251 | dtype=np.float32) 252 | # remove modulation effects 253 | approx_nonzero_indices = torch.from_numpy( 254 | np.where(window_sum > tiny(window_sum))[0]) 255 | window_sum = torch.autograd.Variable( 256 | torch.from_numpy(window_sum), requires_grad=False) 257 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 258 | 259 | # scale by hop ratio 260 | inverse_transform *= float(self.filter_length) / self.hop_length 261 | 262 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 263 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 264 | 265 | return inverse_transform 266 | 267 | def forward(self, input_data): 268 | self.magnitude, self.phase = self.transform(input_data) 269 | reconstruction = self.inverse(self.magnitude, self.phase) 270 | return reconstruction 271 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/bentoml.yml: -------------------------------------------------------------------------------- 1 | version: 0.12.1 2 | kind: BentoService 3 | metadata: 4 | created_at: 2021-05-31 13:57:23.628577 5 | service_name: TextToSpeechModel 6 | service_version: 20210531095723_F76C2A 7 | module_name: text_to_speech 8 | module_file: text_to_speech.py 9 | env: 10 | pip_packages: 11 | - bentoml==0.12.1 12 | - torch==1.7.1 13 | - numpy==1.19.2 14 | - inflect==4.1.0 15 | - scipy==1.5.2 16 | - Unidecode==1.0.22 17 | - librosa==0.6.0 18 | conda_env: 19 | name: bentoml-default-conda-env 20 | dependencies: [] 21 | python_version: 3.7.6 22 | docker_base_image: bentoml/model-server:0.12.1-py37 23 | apis: 24 | - name: predict 25 | docs: "BentoService inference API 'predict', input: 'JsonInput', output: 'DefaultOutput'" 26 | input_type: JsonInput 27 | output_type: DefaultOutput 28 | mb_max_batch_size: 4000 29 | mb_max_latency: 20000 30 | batch: false 31 | route: predict 32 | output_config: 33 | cors: '*' 34 | artifacts: 35 | - name: model 36 | artifact_type: WaveglowArtifact 37 | metadata: {} 38 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/data.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 
| # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | ############################################################################### 17 | import re 18 | import os 19 | import sys 20 | import argparse 21 | import json 22 | import random 23 | import numpy as np 24 | import torch 25 | import torch.utils.data 26 | from scipy.io.wavfile import read 27 | from audio_processing import TacotronSTFT 28 | from text import text_to_sequence, cmudict, _clean_text, get_arpabet 29 | 30 | 31 | def load_filepaths_and_text(filename, split="|"): 32 | with open(filename, encoding='utf-8') as f: 33 | filepaths_and_text = [line.strip().split(split) for line in f] 34 | return filepaths_and_text 35 | 36 | 37 | def load_wav_to_torch(full_path): 38 | """ Loads wavdata into torch array """ 39 | sampling_rate, data = read(full_path) 40 | return torch.from_numpy(data).float(), sampling_rate 41 | 42 | 43 | class Data(torch.utils.data.Dataset): 44 | def __init__(self, filelist_path, filter_length, hop_length, win_length, 45 | sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet, 46 | cmudict_path, text_cleaners, speaker_ids=None, randomize=True, 47 | seed=1234): 48 | self.max_wav_value = max_wav_value 49 | self.audiopaths_and_text = load_filepaths_and_text(filelist_path) 50 | self.stft = TacotronSTFT(filter_length=filter_length, 51 | hop_length=hop_length, 52 | win_length=win_length, 53 | sampling_rate=sampling_rate, 54 | mel_fmin=mel_fmin, mel_fmax=mel_fmax) 55 | self.sampling_rate = sampling_rate 56 | self.text_cleaners = text_cleaners 57 | self.p_arpabet = p_arpabet 58 | self.cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=True) 59 | if speaker_ids is None: 60 | self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text) 61 | else: 62 | self.speaker_ids = speaker_ids 63 | 64 | random.seed(seed) 65 | if randomize: 66 | random.shuffle(self.audiopaths_and_text) 67 | 68 | def create_speaker_lookup_table(self, audiopaths_and_text): 69 | speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text])) 70 | d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))} 71 | print("Number of speakers :", len(d)) 72 | return d 73 | 74 | def get_mel(self, audio): 75 | audio_norm = audio / self.max_wav_value 76 | audio_norm = audio_norm.unsqueeze(0) 77 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 78 | melspec = self.stft.mel_spectrogram(audio_norm) 79 | melspec = torch.squeeze(melspec, 0) 80 | return melspec 81 | 82 | def get_speaker_id(self, speaker_id): 83 | return torch.LongTensor([self.speaker_ids[int(speaker_id)]]) 84 | 85 | def get_text(self, text): 86 | text = _clean_text(text, self.text_cleaners) 87 | words = re.findall(r'\S*\{.*?\}\S*|\S+', text) 88 | text = ' '.join([get_arpabet(word, self.cmudict) 89 | if random.random() < self.p_arpabet else word 90 | for word in words]) 91 | text_norm = torch.LongTensor(text_to_sequence(text)) 92 | return 
text_norm 93 | 94 | def __getitem__(self, index): 95 | # Read audio and text 96 | audiopath, text, speaker_id = self.audiopaths_and_text[index] 97 | audio, sampling_rate = load_wav_to_torch(audiopath) 98 | if sampling_rate != self.sampling_rate: 99 | raise ValueError("{} SR doesn't match target {} SR".format( 100 | sampling_rate, self.sampling_rate)) 101 | 102 | mel = self.get_mel(audio) 103 | text_encoded = self.get_text(text) 104 | speaker_id = self.get_speaker_id(speaker_id) 105 | return (mel, speaker_id, text_encoded) 106 | 107 | def __len__(self): 108 | return len(self.audiopaths_and_text) 109 | 110 | 111 | class DataCollate(): 112 | """ Zero-pads model inputs and targets based on number of frames per step """ 113 | def __init__(self, n_frames_per_step=1): 114 | self.n_frames_per_step = n_frames_per_step 115 | 116 | def __call__(self, batch): 117 | """Collates a training batch from normalized text and mel-spectrograms""" 118 | # Right zero-pad all one-hot text sequences to max input length 119 | input_lengths, ids_sorted_decreasing = torch.sort( 120 | torch.LongTensor([len(x[2]) for x in batch]), 121 | dim=0, descending=True) 122 | max_input_len = input_lengths[0] 123 | 124 | text_padded = torch.LongTensor(len(batch), max_input_len) 125 | text_padded.zero_() 126 | for i in range(len(ids_sorted_decreasing)): 127 | text = batch[ids_sorted_decreasing[i]][2] 128 | text_padded[i, :text.size(0)] = text 129 | 130 | # Right zero-pad mel-spec 131 | num_mel_channels = batch[0][0].size(0) 132 | max_target_len = max([x[0].size(1) for x in batch]) 133 | if max_target_len % self.n_frames_per_step != 0: 134 | max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step 135 | assert max_target_len % self.n_frames_per_step == 0 136 | 137 | # include mel padded, gate padded and speaker ids 138 | mel_padded = torch.FloatTensor(len(batch), num_mel_channels, max_target_len) 139 | mel_padded.zero_() 140 | gate_padded = torch.FloatTensor(len(batch), max_target_len) 141 | gate_padded.zero_() 142 | output_lengths = torch.LongTensor(len(batch)) 143 | speaker_ids = torch.LongTensor(len(batch)) 144 | for i in range(len(ids_sorted_decreasing)): 145 | mel = batch[ids_sorted_decreasing[i]][0] 146 | mel_padded[i, :, :mel.size(1)] = mel 147 | gate_padded[i, mel.size(1)-1:] = 1 148 | output_lengths[i] = mel.size(1) 149 | speaker_ids[i] = batch[ids_sorted_decreasing[i]][1] 150 | 151 | return mel_padded, speaker_ids, text_padded, input_lengths, output_lengths, gate_padded 152 | 153 | 154 | # =================================================================== 155 | # Takes directory of clean audio and makes directory of spectrograms 156 | # Useful for making test sets 157 | # =================================================================== 158 | if __name__ == "__main__": 159 | # Get defaults so it can work with no Sacred 160 | parser = argparse.ArgumentParser() 161 | parser.add_argument('-c', '--config', type=str, 162 | help='JSON file for configuration') 163 | parser.add_argument('-f', '--filelist', type=str, 164 | help='List of files to generate mels') 165 | parser.add_argument('-o', '--output_dir', type=str, 166 | help='Output directory') 167 | args = parser.parse_args() 168 | 169 | with open(args.config) as f: 170 | data = f.read() 171 | data_config = json.loads(data)["data_config"] 172 | mel2samp = Data(**data_config) 173 | 174 | # Make directory if it doesn't exist 175 | if not os.path.isdir(args.output_dir): 176 | os.makedirs(args.output_dir) 177 | os.chmod(args.output_dir, 0o775) 178 |
179 | filepaths_and_text = load_filepaths_and_text(args.filelist) 180 | for (filepath, text, speaker_id) in filepaths_and_text: 181 | print("speaker id", speaker_id) 182 | print("text", text) 183 | print("text encoded", mel2samp.get_text(text)) 184 | audio, sr = load_wav_to_torch(filepath) 185 | melspectrogram = mel2samp.get_mel(audio) 186 | filename = os.path.basename(filepath) 187 | new_filepath = args.output_dir + '/' + filename + '.pt' 188 | print(new_filepath) 189 | torch.save(melspectrogram, new_filepath) 190 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | import pathlib 4 | from text import cleaners 5 | from text.symbols import symbols 6 | from text.symbols import _punctuation as punctuation_symbols 7 | 8 | # Mappings from symbol to numeric ID and vice versa: 9 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 10 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 11 | 12 | # Regular expression matching text enclosed in curly braces: 13 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 14 | 15 | # for arpabet with apostrophe 16 | _apostrophe = re.compile(r"(?=\S*['])([a-zA-Z'-]+)") 17 | 18 | def text_to_sequence(text): 19 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 20 | 21 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 22 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
23 | 24 | Args: 25 | text: string to convert to a sequence 26 | 27 | 28 | Returns: 29 | List of integers corresponding to the symbols in the text 30 | ''' 31 | sequence = [] 32 | 33 | # Check for curly braces and treat their contents as ARPAbet: 34 | while len(text): 35 | m = _curly_re.match(text) 36 | if not m: 37 | sequence += _symbols_to_sequence(text) 38 | break 39 | sequence += _symbols_to_sequence(m.group(1)) 40 | sequence += _arpabet_to_sequence(m.group(2)) 41 | text = m.group(3) 42 | 43 | return sequence 44 | 45 | 46 | def sequence_to_text(sequence): 47 | '''Converts a sequence of IDs back to a string''' 48 | result = '' 49 | for symbol_id in sequence: 50 | if symbol_id in _id_to_symbol: 51 | s = _id_to_symbol[symbol_id] 52 | # Enclose ARPAbet back in curly braces: 53 | if len(s) > 1 and s[0] == '@': 54 | s = '{%s}' % s[1:] 55 | result += s 56 | return result.replace('}{', ' ') 57 | 58 | 59 | def _clean_text(text, cleaner_names): 60 | for name in cleaner_names: 61 | cleaner = getattr(cleaners, name) 62 | if not cleaner: 63 | raise Exception('Unknown cleaner: %s' % name) 64 | text = cleaner(text) 65 | 66 | return text 67 | 68 | 69 | def _symbols_to_sequence(symbols): 70 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 71 | 72 | 73 | def _arpabet_to_sequence(text): 74 | return _symbols_to_sequence(['@' + s for s in text.split()]) 75 | 76 | 77 | def _should_keep_symbol(s): 78 | return s in _symbol_to_id and s != '_' and s != '~' 79 | 80 | 81 | def get_arpabet(word, cmudict, index=0): 82 | re_start_punc = r"\A\W+" 83 | re_end_punc = r"\W+\Z" 84 | 85 | start_symbols = re.findall(re_start_punc, word) 86 | if len(start_symbols): 87 | start_symbols = start_symbols[0] 88 | word = word[len(start_symbols):] 89 | else: 90 | start_symbols = '' 91 | 92 | end_symbols = re.findall(re_end_punc, word) 93 | if len(end_symbols): 94 | end_symbols = end_symbols[0] 95 | word = word[:-len(end_symbols)] 96 | else: 97 | end_symbols = '' 98 | 99 | arpabet_suffix = '' 100 | if _apostrophe.match(word) is not None and word.lower() != "it's" and word.lower()[-1] == 's': 101 | word = word[:-2] 102 | arpabet_suffix = ' Z' 103 | arpabet = None if word.lower() in HETERONYMS else cmudict.lookup(word) 104 | 105 | if arpabet is not None: 106 | return start_symbols + '{%s}' % (arpabet[index] + arpabet_suffix) + end_symbols 107 | else: 108 | return start_symbols + word + end_symbols 109 | 110 | 111 | def files_to_list(filename): 112 | """ 113 | Takes a text file of filenames and makes a list of filenames 114 | """ 115 | with open(filename, encoding='utf-8') as f: 116 | files = f.readlines() 117 | 118 | files = [f.rstrip() for f in files] 119 | return files 120 | 121 | HETERONYMS = set(files_to_list(str(pathlib.Path(__file__).parent.absolute()) + '/heteronyms')) 122 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/acronyms.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pathlib 3 | from .cmudict import CMUDict 4 | 5 | _letter_to_arpabet = { 6 | 'A': 'EY1', 7 | 'B': 'B IY1', 8 | 'C': 'S IY1', 9 | 'D': 'D IY1', 10 | 'E': 'IY1', 11 | 'F': 'EH1 F', 12 | 'G': 'JH IY1', 13 | 'H': 'EY1 CH', 14 | 'I': 'AY1', 15 | 'J': 'JH EY1', 16 | 'K': 'K EY1', 17 | 'L': 'EH1 L', 18 | 'M': 'EH1 M', 19 | 'N': 'EH1 N', 20 | 'O': 'OW1', 21 | 'P': 'P IY1', 22 | 'Q': 'K Y UW1', 23 | 'R': 'AA1 R', 24 | 'S': 'EH1 S', 25
| 'T': 'T IY1', 26 | 'U': 'Y UW1', 27 | 'V': 'V IY1', 28 | 'X': 'EH1 K S', 29 | 'Y': 'W AY1', 30 | 'W': 'D AH1 B AH0 L Y UW0', 31 | 'Z': 'Z IY1', 32 | 's': 'Z' 33 | } 34 | 35 | # must ignore roman numerals 36 | _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)') 37 | cmudict = CMUDict(str(pathlib.Path(__file__).parent.absolute()) + '/cmudict_dictionary', keep_ambiguous=False) 38 | 39 | 40 | def _expand_acronyms(m, add_spaces=True): 41 | acronym = m.group(0) 42 | 43 | # remove dots if they exist 44 | acronym = re.sub(r'\.', '', acronym) 45 | 46 | acronym = "".join(acronym.split()) 47 | arpabet = cmudict.lookup(acronym) 48 | 49 | if arpabet is None: 50 | acronym = list(acronym) 51 | arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym] 52 | # temporary fix 53 | if arpabet[-1] == '{Z}' and len(arpabet) > 1: 54 | arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:] 55 | del arpabet[-1] 56 | 57 | arpabet = ' '.join(arpabet) 58 | else: 59 | arpabet = "{" + arpabet[0] + "}" 60 | 61 | return arpabet 62 | 63 | 64 | def normalize_acronyms(text): 65 | text = re.sub(_acronym_re, _expand_acronyms, text) 66 | return text 67 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | from .acronyms import normalize_acronyms 19 | from .datestime import normalize_datestime 20 | 21 | 22 | # Regular expression matching whitespace: 23 | _whitespace_re = re.compile(r'\s+') 24 | 25 | # List of (regular expression, replacement) pairs for abbreviations: 26 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 27 | ('mrs', 'misess'), 28 | ('ms', 'miss'), 29 | ('mr', 'mister'), 30 | ('dr', 'doctor'), 31 | ('st', 'saint'), 32 | ('co', 'company'), 33 | ('jr', 'junior'), 34 | ('maj', 'major'), 35 | ('gen', 'general'), 36 | ('drs', 'doctors'), 37 | ('rev', 'reverend'), 38 | ('lt', 'lieutenant'), 39 | ('hon', 'honorable'), 40 | ('sgt', 'sergeant'), 41 | ('capt', 'captain'), 42 | ('esq', 'esquire'), 43 | ('ltd', 'limited'), 44 | ('col', 'colonel'), 45 | ('ft', 'fort'), 46 | ]] 47 | 48 | _safe_abbreviations = [(re.compile('\\b%s\\.'
% x[0], re.IGNORECASE), x[1]) for x in [ 49 | ('no', 'number'), 50 | ]] 51 | 52 | 53 | 54 | def expand_abbreviations(text): 55 | for regex, replacement in _abbreviations: 56 | text = re.sub(regex, replacement, text) 57 | return text 58 | 59 | def expand_safe_abbreviations(text): 60 | for regex, replacement in _safe_abbreviations: 61 | text = re.sub(regex, replacement, text) 62 | return text 63 | 64 | def expand_numbers(text): 65 | return normalize_numbers(text) 66 | 67 | 68 | def expand_acronyms(text): 69 | return normalize_acronyms(text) 70 | 71 | 72 | def expand_datestime(text): 73 | return normalize_datestime(text) 74 | 75 | 76 | def lowercase(text): 77 | return text.lower() 78 | 79 | 80 | def collapse_whitespace(text): 81 | return re.sub(_whitespace_re, ' ', text) 82 | 83 | 84 | def separate_acronyms(text): 85 | text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text) 86 | text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text) 87 | return text 88 | 89 | 90 | def remove_hyphens(text): 91 | text = re.sub(r'(?<=\w)(-)(?=\w)', ' ', text) 92 | return text 93 | 94 | 95 | def convert_to_ascii(text): 96 | return unidecode(text) 97 | 98 | 99 | def basic_cleaners(text): 100 | '''Basic pipeline that collapses whitespace without transliteration.''' 101 | text = lowercase(text) 102 | text = collapse_whitespace(text) 103 | return text 104 | 105 | 106 | def transliteration_cleaners(text): 107 | '''Pipeline for non-English text that transliterates to ASCII.''' 108 | text = convert_to_ascii(text) 109 | text = lowercase(text) 110 | text = collapse_whitespace(text) 111 | return text 112 | 113 | 114 | def flowtron_cleaners(text): 115 | text = collapse_whitespace(text) 116 | text = remove_hyphens(text) 117 | text = expand_datestime(text) 118 | text = expand_numbers(text) 119 | text = expand_safe_abbreviations(text) 120 | text = expand_acronyms(text) 121 | return text 122 | 123 | 124 | def english_cleaners(text): 125 | '''Pipeline for English text, with number and abbreviation expansion.''' 126 | text = convert_to_ascii(text) 127 | text = lowercase(text) 128 | text = expand_numbers(text) 129 | text = expand_abbreviations(text) 130 | text = collapse_whitespace(text) 131 | return text 132 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/cmudict_dictionary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dscripka/synthetic_speech_dataset_generation/09cdc32c9efafefa603346819ba84aef4be2063b/models/waveglow/TextToSpeechModel/text/cmudict_dictionary -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/datestime.py: -------------------------------------------------------------------------------- 1 | import re 2 | _ampm_re = re.compile(r'([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)') 3 | 4 | 5 | def _expand_ampm(m): 6 | matches = list(m.groups(0)) 7 | txt = matches[0] 8 | if matches[1] == 0 or matches[1] == '0' or matches[1] == '00': 9 | pass 10 | else: 11 | txt += ' ' + matches[1] 12 | 13 | if matches[2][0] == 'a': 14 | txt += ' AM' 15 | elif matches[2][0] == 'p': 16 | txt += ' PM' 17 | 18 | return txt 19 | 20 | 21 | def normalize_datestime(text): 22 | text = re.sub(_ampm_re, _expand_ampm, text) 23 | text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text) 24 | return text 25 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/heteronyms: -------------------------------------------------------------------------------- 1 | abject 2 | abrogate 3 | absent 4 | abstract 5 | abuse 6 | ache 7 | Acre 8 | acuminate 9 | addict 10 | address 11 | adduct 12 | Adele 13 | advocate 14 | affect 15 | affiliate 16 | agape 17 | aged 18 | agglomerate 19 | aggregate 20 | agonic 21 | agora 22 | allied 23 | ally 24 | alternate 25 | alum 26 | am 27 | analyses 28 | Andrea 29 | animate 30 | apply 31 | appropriate 32 | approximate 33 | ares 34 | arithmetic 35 | arsenic 36 | articulate 37 | associate 38 | attribute 39 | august 40 | axes 41 | ay 42 | aye 43 | bases 44 | bass 45 | bathed 46 | bested 47 | bifurcate 48 | blessed 49 | blotto 50 | bow 51 | bowed 52 | bowman 53 | brassy 54 | buffet 55 | bustier 56 | carbonate 57 | Celtic 58 | choral 59 | Chumash 60 | close 61 | closer 62 | coax 63 | 
coincidence 64 | color coordinate 65 | colour coordinate 66 | comber 67 | combine 68 | combs 69 | committee 70 | commune 71 | compact 72 | complex 73 | compound 74 | compress 75 | concert 76 | conduct 77 | confine 78 | confines 79 | conflict 80 | conglomerate 81 | conscript 82 | conserve 83 | consist 84 | console 85 | consort 86 | construct 87 | consult 88 | consummate 89 | content 90 | contest 91 | contract 92 | contracts 93 | contrast 94 | converse 95 | convert 96 | convict 97 | coop 98 | coordinate 99 | covey 100 | crooked 101 | curate 102 | cussed 103 | decollate 104 | decrease 105 | defect 106 | defense 107 | delegate 108 | deliberate 109 | denier 110 | desert 111 | detail 112 | deviate 113 | diagnoses 114 | diffuse 115 | digest 116 | discard 117 | discharge 118 | discount 119 | do 120 | document 121 | does 122 | dogged 123 | domesticate 124 | Dominican 125 | dove 126 | dr 127 | drawer 128 | duplicate 129 | egress 130 | ejaculate 131 | eject 132 | elaborate 133 | ellipses 134 | email 135 | emu 136 | entrace 137 | entrance 138 | escort 139 | estimate 140 | eta 141 | Etna 142 | evening 143 | excise 144 | excuse 145 | exploit 146 | export 147 | extract 148 | fine 149 | flower 150 | forbear 151 | four-legged 152 | frequent 153 | furrier 154 | gallant 155 | gel 156 | geminate 157 | gillie 158 | glower 159 | Gotham 160 | graduate 161 | haggis 162 | heavy 163 | hinder 164 | house 165 | housewife 166 | impact 167 | imped 168 | implant 169 | implement 170 | import 171 | impress 172 | incense 173 | incline 174 | increase 175 | infix 176 | insert 177 | instar 178 | insult 179 | integral 180 | intercept 181 | interchange 182 | interflow 183 | interleaf 184 | intermediate 185 | intern 186 | interspace 187 | intimate 188 | intrigue 189 | invalid 190 | invert 191 | invite 192 | irony 193 | jagged 194 | Jesses 195 | Julies 196 | kite 197 | laminate 198 | Laos 199 | lather 200 | lead 201 | learned 202 | leasing 203 | lech 204 | legitimate 205 | lied 206 | lima 207 | lipread 208 | live 209 | lower 210 | lunged 211 | maas 212 | Magdalen 213 | manes 214 | mare 215 | marked 216 | merchandise 217 | merlion 218 | minute 219 | misconduct 220 | misled 221 | misprint 222 | mobile 223 | moderate 224 | mong 225 | moped 226 | moth 227 | mouth 228 | mow 229 | mpg 230 | multiply 231 | mush 232 | nana 233 | nice 234 | Nice 235 | number 236 | numerate 237 | nun 238 | object 239 | opiate 240 | ornament 241 | outbox 242 | outcry 243 | outpour 244 | outreach 245 | outride 246 | outright 247 | outside 248 | outwork 249 | overall 250 | overbid 251 | overcall 252 | overcast 253 | overfall 254 | overflow 255 | overhaul 256 | overhead 257 | overlap 258 | overlay 259 | overuse 260 | overweight 261 | overwork 262 | pace 263 | palled 264 | palling 265 | para 266 | pasty 267 | pate 268 | Pauline 269 | pedal 270 | peer 271 | perfect 272 | periodic 273 | permit 274 | pervert 275 | pinta 276 | placer 277 | platy 278 | polish 279 | Polish 280 | poll 281 | pontificate 282 | postulate 283 | pram 284 | prayer 285 | precipitate 286 | predate 287 | predicate 288 | prefix 289 | preposition 290 | present 291 | pretest 292 | primer 293 | proceeds 294 | produce 295 | progress 296 | project 297 | proportionate 298 | prospect 299 | protest 300 | pussy 301 | putter 302 | putting 303 | quite 304 | ragged 305 | raven 306 | re 307 | read 308 | reading 309 | Reading 310 | real 311 | rebel 312 | recall 313 | recap 314 | recitative 315 | recollect 316 | record 317 | recreate 318 | recreation 319 | redress 320 | refill 321 | refund 322 | refuse 323 | 
reject 324 | relay 325 | remake 326 | repaint 327 | reprint 328 | reread 329 | rerun 330 | resent 331 | reside 332 | resign 333 | respray 334 | resume 335 | retard 336 | retest 337 | retread 338 | rewrite 339 | root 340 | routed 341 | routing 342 | row 343 | rugged 344 | rummy 345 | sais 346 | sake 347 | sambuca 348 | saucier 349 | second 350 | secrete 351 | secreted 352 | secreting 353 | segment 354 | separate 355 | sewer 356 | shirk 357 | shower 358 | sin 359 | skied 360 | slaver 361 | slough 362 | sow 363 | spoof 364 | squid 365 | stingy 366 | subject 367 | subordinate 368 | subvert 369 | supply 370 | supposed 371 | survey 372 | suspect 373 | syringes 374 | tabulate 375 | tales 376 | tarrier 377 | tarry 378 | taxes 379 | taxis 380 | tear 381 | Theron 382 | thou 383 | three-legged 384 | tier 385 | tinged 386 | torment 387 | transfer 388 | transform 389 | transplant 390 | transport 391 | transpose 392 | tush 393 | two-legged 394 | unionised 395 | unionized 396 | update 397 | uplift 398 | upset 399 | use 400 | used 401 | vale 402 | violist 403 | viva 404 | ware 405 | whinged 406 | whoop 407 | wicked 408 | wind 409 | windy 410 | wino 411 | won 412 | worsted 413 | wound 414 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | _large_numbers = '(trillion|billion|million|thousand|hundred)' 6 | _measurements = '(f|c|k|d)' 7 | _measurements_key = {'f': 'fahrenheit', 'c': 'celsius', 'k': 'thousand', 'd': 'd'} 8 | _inflect = inflect.engine() 9 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 10 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 11 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 12 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+[ ]?{}?)'.format(_large_numbers), re.IGNORECASE) 13 | _measurement_re = re.compile(r'([0-9\.\,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE) 14 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 15 | _number_re = re.compile(r"[0-9]+'s|[0-9]+") 16 | 17 | def _remove_commas(m): 18 | return m.group(1).replace(',', '') 19 | 20 | 21 | def _expand_decimal_point(m): 22 | return m.group(1).replace('.', ' point ') 23 | 24 | 25 | def _expand_dollars(m): 26 | match = m.group(1) 27 | 28 | # check for million, billion, etc... 
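# (e.g. _dollars_re captures "1.5 million" from "$1.5 million"; the amount and the
# large-number word are expanded together as "1.5 million dollars", and the decimal
# itself is expanded later by _decimal_number_re, while plain amounts such as
# "$2.50" fall through to the dollars-and-cents logic below)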
29 | parts = match.split(' ') 30 | if len(parts) == 2 and len(parts[1]) > 0 and parts[1] in _large_numbers: 31 | return "{} {} {} ".format(parts[0], parts[1], 'dollars') 32 | 33 | parts = parts[0].split('.') 34 | if len(parts) > 2: 35 | return match + " dollars" # Unexpected format 36 | dollars = int(parts[0]) if parts[0] else 0 37 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 38 | if dollars and cents: 39 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 40 | cent_unit = 'cent' if cents == 1 else 'cents' 41 | return "{} {}, {} {} ".format( 42 | _inflect.number_to_words(dollars), dollar_unit, 43 | _inflect.number_to_words(cents), cent_unit) 44 | elif dollars: 45 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 46 | return "{} {} ".format(_inflect.number_to_words(dollars), dollar_unit) 47 | elif cents: 48 | cent_unit = 'cent' if cents == 1 else 'cents' 49 | return "{} {} ".format(_inflect.number_to_words(cents), cent_unit) 50 | else: 51 | return 'zero dollars' 52 | 53 | 54 | def _expand_ordinal(m): 55 | return _inflect.number_to_words(m.group(0)) 56 | 57 | 58 | def _expand_measurement(m): 59 | _, number, measurement = re.split(r'(\d+(?:\.\d+)?)', m.group(0)) 60 | number = _inflect.number_to_words(number) 61 | measurement = "".join(measurement.split()) 62 | measurement = _measurements_key[measurement.lower()] 63 | return "{} {}".format(number, measurement) 64 | 65 | 66 | def _expand_number(m): 67 | _, number, suffix = re.split(r"(\d+(?:'\d+)?)", m.group(0)) 68 | num = int(number) 69 | if num > 1000 and num < 3000: 70 | if num == 2000: 71 | text = 'two thousand' 72 | elif num > 2000 and num < 2010: 73 | text = 'two thousand ' + _inflect.number_to_words(num % 100) 74 | elif num % 100 == 0: 75 | text = _inflect.number_to_words(num // 100) + ' hundred' 76 | else: 77 | num = _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 78 | num = re.sub(r'-', ' ', num) 79 | text = num 80 | else: 81 | num = _inflect.number_to_words(num, andword='') 82 | num = re.sub(r'-', ' ', num) 83 | num = re.sub(r',', '', num) 84 | text = num 85 | 86 | if suffix == "'s" and text[-1] == 'y': 87 | text = text[:-1] + 'ies' 88 | 89 | return text 90 | 91 | 92 | def normalize_numbers(text): 93 | text = re.sub(_comma_number_re, _remove_commas, text) 94 | text = re.sub(_pounds_re, r'\1 pounds', text) 95 | text = re.sub(_dollars_re, _expand_dollars, text) 96 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 97 | text = re.sub(_ordinal_re, _expand_ordinal, text) 98 | text = re.sub(_measurement_re, _expand_measurement, text) 99 | text = re.sub(_number_re, _expand_number, text) 100 | return text 101 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from text import cmudict 8 | 9 |
_punctuation = '!\'",.:;? ' 10 | _math = '#%&*+-/[]()' 11 | _special = '_@©°½—₩€$' 12 | _accented = 'áçéêëñöøćž' 13 | _numbers = '0123456789' 14 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 15 | 16 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 17 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 18 | 19 | # Export all symbols: 20 | symbols = list(_punctuation + _math + _special + _accented + _numbers + _letters) + _arpabet 21 | -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/text_to_speech.py: -------------------------------------------------------------------------------- 1 | 2 | # Create Bento for text to speech 3 | 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | import torch 8 | from bentoml import env, artifacts, api, BentoService 9 | from bentoml.adapters import JsonInput 10 | from bentoml.frameworks.pytorch import PytorchModelArtifact 11 | 12 | from waveglow_artifact import WaveglowArtifact 13 | from glow import WaveGlow 14 | from data import Data 15 | 16 | import re 17 | import numpy as np 18 | import base64 19 | import pathlib 20 | import scipy.signal 21 | 22 | @env( 23 | pip_packages=[ 24 | "bentoml==0.12.1", 25 | "torch==1.7.1", 26 | "numpy==1.19.2", 27 | "inflect==4.1.0", 28 | "scipy==1.5.2", 29 | "Unidecode==1.0.22", 30 | "librosa==0.6.0" 31 | ] 32 | ) 33 | @artifacts([WaveglowArtifact('model')]) 34 | class TextToSpeechModel(BentoService): 35 | """ 36 | A model that converts text into spoken speech 37 | """ 38 | def __init__(self): 39 | super(TextToSpeechModel, self).__init__() 40 | self.data_config = { 41 | "text_cleaners": ["flowtron_cleaners"], 42 | "p_arpabet": 0.5, 43 | "cmudict_path": str(pathlib.Path(__file__).parent.absolute()) + "/artifacts/cmudict_dictionary", 44 | "sampling_rate": 22050, 45 | "filter_length": 1024, 46 | "hop_length": 256, 47 | "win_length": 1024, 48 | "mel_fmin": 0.0, 49 | "mel_fmax": 8000.0, 50 | "max_wav_value": 32768.0 51 | } 52 | training_files = str(pathlib.Path(__file__).parent.absolute()) + \ 53 | "/artifacts/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt" 54 | self.tokenizer = Data(training_files, **self.data_config) 55 | 56 | if torch.cuda.is_available(): 57 | self.device = torch.device('cuda') 58 | else: 59 | self.device = torch.device('cpu') 60 | 61 | @api(input=JsonInput()) 62 | def generate(self, parsed_json, speaker_id=[24], sample_rate=22050, sigma=0.8, n_frames=300): 63 | text = parsed_json['text'] 64 | if parsed_json.get('speaker_id', None): 65 | speaker_id = parsed_json['speaker_id'] 66 | if parsed_json.get('sample_rate', None): 67 | sample_rate = parsed_json['sample_rate'] 68 | if parsed_json.get('sigma', None): 69 | sigma = parsed_json['sigma'] 70 | if parsed_json.get('n_frames', None): 71 | n_frames = parsed_json['n_frames'] 72 | 73 | sentences = re.split(r'[,.;?!]', text) # tokenize into chunks by punctuation 74 | sentences = [i for i in sentences if i != ""] 75 | audio = [] 76 | for sentence in sentences: 77 | speaker_vecs = torch.tensor(speaker_id)[None].to(self.device) 78 | text = self.tokenizer.get_text(sentence)[None].to(self.device) 79 | 80 | with torch.no_grad(): 81 | # residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma 82 | residual = (torch.FloatTensor(1, 80, n_frames).normal_() * sigma).to(self.device) 83 | mels, attentions = self.artifacts.model.get("flowtron").infer(residual, speaker_vecs, text) 84 |
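# NOTE: WaveglowArtifact.load() casts the WaveGlow weights to fp16 when a GPU is
# available, so the conditioning mels from Flowtron must be cast to match: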
85 | if self.device.type == "cuda": 86 | mels = mels.half() 87 | 88 | clip = self.artifacts.model.get("waveglow").infer(mels, sigma=sigma).float() 89 | clip = clip.cpu().numpy()[0] 90 | clip = clip / np.abs(clip).max() # normalize audio 91 | if sample_rate != 22050: 92 | audio.append(scipy.signal.resample(clip, int(sample_rate*len(clip)/22050))) # resample to the desired playback rate 93 | else: 94 | audio.append(clip) 95 | 96 | speech = np.concatenate(audio) 97 | speech = (speech*32767).astype(np.int16) # convert to 16-bit PCM data 98 | return base64.b64encode(speech.tobytes()).decode('utf-8') -------------------------------------------------------------------------------- /models/waveglow/TextToSpeechModel/waveglow_artifact.py: -------------------------------------------------------------------------------- 1 | 2 | # Custom interface for Nvidia Waveglow models 3 | 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | import json 7 | from bentoml.utils import cloudpickle 8 | from bentoml.exceptions import InvalidArgument 9 | from bentoml.service.artifacts import BentoServiceArtifact 10 | 11 | from flowtron import Flowtron 12 | 13 | import torch 14 | 15 | class WaveglowArtifact(BentoServiceArtifact): 16 | def __init__(self, name): 17 | super(WaveglowArtifact, self).__init__(name) 18 | self._model = None 19 | self.model_config = { 20 | "n_speakers": 2311, 21 | "n_speaker_dim": 128, 22 | "n_text": 185, 23 | "n_text_dim": 512, 24 | "n_flows": 2, 25 | "n_mel_channels": 80, 26 | "n_attn_channels": 640, 27 | "n_hidden": 1024, 28 | "n_lstm_layers": 2, 29 | "mel_encoder_n_hidden": 512, 30 | "n_components": 0, 31 | "mean_scale": 0.0, 32 | "fixed_gaussian": True, 33 | "dummy_speaker_embedding": False, 34 | "use_gate_layer": True 35 | } 36 | 37 | def pack(self, model, metadata=None): 38 | self._model = model 39 | return self 40 | 41 | def get(self): 42 | return self._model 43 | 44 | def save(self, dst): 45 | pass 46 | 47 | def load(self, path): 48 | if torch.cuda.is_available(): 49 | device = torch.device('cuda') 50 | else: 51 | device = torch.device('cpu') 52 | 53 | # load waveglow model 54 | waveglow = torch.load(os.path.join(path, 'waveglow_256channels_universal_v5.pt'))['model'].to(device) 55 | if device.type == "cuda": 56 | waveglow.cuda().half() 57 | waveglow.eval() 58 | 59 | # Load flowtron model 60 | flowtron = Flowtron(**self.model_config).to(device) 61 | state_dict = torch.load(os.path.join(path, "flowtron_libritts2p3k.pt"), map_location='cpu')['model'].state_dict() 62 | flowtron.load_state_dict(state_dict) 63 | _ = flowtron.eval() 64 | 65 | return self.pack({"waveglow": waveglow, "flowtron": flowtron}) 66 | -------------------------------------------------------------------------------- /models/waveglow/bentoml-init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Bash Script that installs the dependencies specified in the BentoService archive 3 | # 4 | # Usage: 5 | # * `bentoml-init.sh` to run the full script 6 | # * `bentoml-init.sh <step>` to run a specific step 7 | # available steps: [custom_setup ensure_python restore_conda_env install_pip_packages install_bundled_pip_packages] 8 | 9 | set -ex 10 | 11 | # cd to the saved bundle directory 12 | SAVED_BUNDLE_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P) 13 | cd $SAVED_BUNDLE_PATH 14 | 15 | # Run the user defined setup.sh script if it is present 16 | if [ $# -eq 0 ] || [ $1 == "custom_setup" ] ; then 17 | if [ -f ./setup.sh ]; then chmod +x ./setup.sh && bash -c
./setup.sh; fi 18 | fi 19 | 20 | # Check and install the right python version 21 | if [ $# -eq 0 ] || [ $1 == "ensure_python" ] ; then 22 | if [ -f ./python_version ]; then 23 | PY_VERSION_SAVED=$(cat ./python_version) 24 | # remove PATCH version - since most patch versions only contain backwards-compatible 25 | # bug fixes and the BentoML default docker base image will include the latest 26 | # patch version of each Python minor release 27 | DESIRED_PY_VERSION=${PY_VERSION_SAVED:0:3} # returns 3.6, 3.7 or 3.8 28 | CURRENT_PY_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') 29 | 30 | if [[ "$DESIRED_PY_VERSION" == "$CURRENT_PY_VERSION" ]]; then 31 | echo "Python Version in docker base image $CURRENT_PY_VERSION matches requirement python=$DESIRED_PY_VERSION. Skipping." 32 | else 33 | if command -v conda >/dev/null 2>&1; then 34 | echo "Installing python=$DESIRED_PY_VERSION with conda:" 35 | conda install -y -n base pkgs/main::python=$DESIRED_PY_VERSION pip 36 | else 37 | echo "WARNING: Python Version $DESIRED_PY_VERSION is required, but $CURRENT_PY_VERSION was found." 38 | fi 39 | fi 40 | fi 41 | fi 42 | 43 | if [ $# -eq 0 ] || [ $1 == "restore_conda_env" ] ; then 44 | if command -v conda >/dev/null 2>&1; then 45 | # set pip_interop_enabled to improve conda-pip interoperability. Conda can use 46 | # pip-installed packages to satisfy dependencies. 47 | # this option is only available after conda version 4.6.0 48 | # "|| true" ignores the error when the option is not found, for older conda versions 49 | # This is commented out due to a bug with conda's implementation, we should revisit 50 | # after conda removes the experimental flag on the pip_interop_enabled option 51 | # See more details on https://github.com/bentoml/BentoML/pull/1012 52 | # conda config --set pip_interop_enabled True || true 53 | 54 | echo "Updating conda base environment with environment.yml" 55 | conda env update -n base -f ./environment.yml 56 | conda clean --all 57 | else 58 | echo "WARNING: conda command not found, skipping conda dependencies in environment.yml" 59 | fi 60 | fi 61 | 62 | # Install PyPI packages specified in requirements.txt 63 | if [ $# -eq 0 ] || [ $1 == "install_pip_packages" ] ; then 64 | pip install -r ./requirements.txt --no-cache-dir $EXTRA_PIP_INSTALL_ARGS 65 | fi 66 | 67 | # Install additional python packages inside bundled pip dependencies directory 68 | if [ $# -eq 0 ] || [ $1 == "install_bundled_pip_packages" ] ; then 69 | for filename in ./bundled_pip_dependencies/*; do 70 | [ -e "$filename" ] || continue 71 | pip install -U "$filename" 72 | done 73 | fi 74 | -------------------------------------------------------------------------------- /models/waveglow/bentoml.yml: -------------------------------------------------------------------------------- 1 | version: 0.12.1 2 | kind: BentoService 3 | metadata: 4 | created_at: 2021-05-31 13:57:23.628577 5 | service_name: TextToSpeechModel 6 | service_version: 20210531095723_F76C2A 7 | module_name: text_to_speech 8 | module_file: text_to_speech.py 9 | env: 10 | pip_packages: 11 | - bentoml==0.12.1 12 | - torch==1.7.1 13 | - numpy==1.19.2 14 | - inflect==4.1.0 15 | - scipy==1.5.2 16 | - Unidecode==1.0.22 17 | - librosa==0.6.0 18 | conda_env: 19 | name: bentoml-default-conda-env 20 | dependencies: [] 21 | python_version: 3.7.6 22 | docker_base_image: bentoml/model-server:0.12.1-py37 23 | apis: 24 | - name: predict 25 | docs: "BentoService inference API 'predict', input: 'JsonInput', output: 'DefaultOutput'"
26 | input_type: JsonInput 27 | output_type: DefaultOutput 28 | mb_max_batch_size: 4000 29 | mb_max_latency: 20000 30 | batch: false 31 | route: predict 32 | output_config: 33 | cors: '*' 34 | artifacts: 35 | - name: model 36 | artifact_type: WaveglowArtifact 37 | metadata: {} 38 | -------------------------------------------------------------------------------- /models/waveglow/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -Eeuo pipefail 3 | 4 | # check to see if this file is being run or sourced from another script 5 | _is_sourced() { 6 | # https://unix.stackexchange.com/a/215279 7 | [ "${#FUNCNAME[@]}" -ge 2 ] \ 8 | && [ "${FUNCNAME[0]}" = '_is_sourced' ] \ 9 | && [ "${FUNCNAME[1]}" = 'source' ] 10 | } 11 | 12 | _main() { 13 | # if first arg looks like a flag, assume we want to start the BentoML API server 14 | if [ "${1:0:1}" = '-' ]; then 15 | set -- bentoml serve-gunicorn "$@" $BUNDLE_PATH 16 | fi 17 | 18 | # Set BentoML API server port via env var 19 | export BENTOML_PORT=$PORT 20 | # Backward compatibility for BentoML prior to 0.7.5 21 | export BENTOML__APISERVER__DEFAULT_PORT=$PORT 22 | 23 | exec "$@" 24 | } 25 | 26 | if ! _is_sourced; then 27 | _main "$@" 28 | fi 29 | -------------------------------------------------------------------------------- /models/waveglow/environment.yml: -------------------------------------------------------------------------------- 1 | name: bentoml-default-conda-env 2 | dependencies: [] 3 | -------------------------------------------------------------------------------- /models/waveglow/python_version: -------------------------------------------------------------------------------- 1 | 3.7.6 -------------------------------------------------------------------------------- /models/waveglow/requirements.txt: -------------------------------------------------------------------------------- 1 | bentoml==0.12.1 2 | torch==1.7.1 3 | numpy==1.19.2 4 | inflect==4.1.0 5 | scipy==1.5.2 6 | Unidecode==1.0.22 7 | librosa==0.6.0 8 | numba==0.49.1 9 | llvmlite==0.32.1 -------------------------------------------------------------------------------- /models/waveglow/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | try: 3 | # for pip >= 10 4 | from pip._internal.req import parse_requirements 5 | try: 6 | # for pip >= 20.0 7 | from pip._internal.network.session import PipSession 8 | except ModuleNotFoundError: 9 | # for pip >= 10, < 20.0 10 | from pip._internal.download import PipSession 11 | except ImportError: 12 | # for pip <= 9.0.3 13 | from pip.req import parse_requirements 14 | from pip.download import PipSession 15 | 16 | try: 17 | raw = parse_requirements('requirements.txt', session=PipSession()) 18 | 19 | # pip >= 20.1 changed ParsedRequirement attribute from `req` to `requirement` 20 | install_reqs = [] 21 | for i in raw: 22 | try: 23 | install_reqs.append(str(i.requirement)) 24 | except AttributeError: 25 | install_reqs.append(str(i.req)) 26 | except Exception: 27 | install_reqs = [] 28 | 29 | setuptools.setup( 30 | name='TextToSpeechModel', 31 | version='20210531095723_F76C2A', 32 | description="BentoML generated model module", 33 | long_description="""# Generated BentoService bundle - TextToSpeechModel:20210531095723_F76C2A 34 | 35 | This is an ML Service bundle created with BentoML; it is not recommended to edit 36 | code or files contained in this directory.
Instead, edit the code that uses BentoML 37 | to create this bundle, and save a new BentoService bundle. 38 | 39 | A model that converts text into spoken speech""", 40 | long_description_content_type="text/markdown", 41 | url="https://github.com/bentoml/BentoML", 42 | packages=setuptools.find_packages(), 43 | install_requires=install_reqs, 44 | include_package_data=True, 45 | package_data={ 46 | 'TextToSpeechModel': ['bentoml.yml', 'artifacts/*'] 47 | }, 48 | entry_points={ 49 | 'console_scripts': [ 50 | 'TextToSpeechModel=TextToSpeechModel:cli', 51 | ], 52 | } 53 | ) 54 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.28.2,<3 2 | tqdm>=4.47.0,<5 3 | torch==1.12.1 4 | numpy==1.23.3 5 | torchaudio==0.12.1 6 | librosa==0.9.2 7 | unidecode==1.3.6 8 | phonemizer==3.2.1 9 | bentoml==0.12.1 10 | inflect==6.0.2 11 | protobuf==3.19.1 --------------------------------------------------------------------------------
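A minimal client-side sketch for exercising the TextToSpeechModel bundle above, assuming it has been served locally with `bentoml serve-gunicorn`. The host, port, route name, and output path are illustrative assumptions, not part of the repository (text_to_speech.py defines the API method `generate`, while the bundled bentoml.yml registers `predict`; use whichever route your build reports). The snippet decodes the base64-encoded 16-bit PCM string returned by the service into a WAV file:

# Hypothetical client for the TextToSpeechModel service defined above; the URL,
# route, and output filename are assumptions, not part of the repository.
import base64
import json
import urllib.request

import numpy as np
from scipy.io.wavfile import write

payload = {"text": "Hello world.", "speaker_id": [24], "sample_rate": 22050}
req = urllib.request.Request(
    "http://localhost:5000/generate",  # assumed host/port and route name
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = resp.read().decode("utf-8")

# The service returns a base64 string; depending on the BentoML output adapter
# it may arrive JSON-quoted, so strip the quoting before decoding.
b64_pcm = json.loads(body) if body.startswith('"') else body
pcm = np.frombuffer(base64.b64decode(b64_pcm), dtype=np.int16)
write("clip.wav", payload["sample_rate"], pcm)  # 16-bit PCM WAV output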