├── README.md ├── __pycache__ └── script.cpython-310.pyc ├── editor.py ├── merge_2_multiturn_HEB.py ├── requirements.txt ├── script.py ├── tortoise ├── CITATION.cff ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── TorToiSe.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── convert │ ├── convert.bat │ └── convert.sh ├── requirements.txt ├── requirements_legacy.txt ├── scripts │ └── tortoise_tts.py ├── setup.py └── tortoise │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ └── api.cpython-310.pyc │ ├── api.py │ ├── data │ ├── got.txt │ ├── layman.txt │ ├── mel_norms.pth │ ├── riding_hood.txt │ ├── seal_copypasta.txt │ └── tokenizer.json │ ├── do_tts.py │ ├── eval.py │ ├── get_conditioning_latents.py │ ├── is_this_from_tortoise.py │ ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── activations.cpython-310.pyc │ │ ├── arch_util.cpython-310.pyc │ │ ├── autoregressive.cpython-310.pyc │ │ ├── bigvgan.cpython-310.pyc │ │ ├── classifier.cpython-310.pyc │ │ ├── clvp.cpython-310.pyc │ │ ├── cvvp.cpython-310.pyc │ │ ├── diffusion_decoder.cpython-310.pyc │ │ ├── random_latent_generator.cpython-310.pyc │ │ ├── transformer.cpython-310.pyc │ │ ├── vocoder.cpython-310.pyc │ │ └── xtransformers.cpython-310.pyc │ ├── activations.py │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── act.cpython-310.pyc │ │ │ ├── filter.cpython-310.pyc │ │ │ └── resample.cpython-310.pyc │ │ ├── act.py │ │ ├── filter.py │ │ └── resample.py │ ├── arch_util.py │ ├── autoregressive.py │ ├── bigvgan.py │ ├── classifier.py │ ├── clvp.py │ ├── cvvp.py │ ├── diffusion_decoder.py │ ├── random_latent_generator.py │ ├── transformer.py │ ├── vocoder.py │ └── xtransformers.py │ ├── read.py │ ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── audio.cpython-310.pyc │ │ ├── device.cpython-310.pyc │ │ ├── diffusion.cpython-310.pyc │ │ ├── stft.cpython-310.pyc │ │ ├── text.cpython-310.pyc │ │ ├── tokenizer.cpython-310.pyc │ │ ├── torch_intermediary.cpython-310.pyc │ │ ├── typical_sampling.cpython-310.pyc │ │ └── wav2vec_alignment.cpython-310.pyc │ ├── audio.py │ ├── device.py │ ├── diffusion.py │ ├── stft.py │ ├── text.py │ ├── tokenizer.py │ ├── torch_intermediary.py │ ├── typical_sampling.py │ └── wav2vec_alignment.py │ └── voices │ └── sp1 │ ├── sp1_00.wav │ ├── sp1_01.wav │ ├── sp1_02.wav │ ├── sp1_03.wav │ ├── sp1_04.wav │ ├── sp1_05.wav │ ├── sp1_06.wav │ ├── sp1_07.wav │ ├── sp1_08.wav │ ├── sp1_09.wav │ ├── sp1_1.wav │ ├── sp1_10.wav │ ├── sp1_2.wav │ ├── sp1_3.wav │ ├── sp1_4.wav │ └── sp1_5.wav └── tts_preprocessor.py /__pycache__/script.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/__pycache__/script.cpython-310.pyc -------------------------------------------------------------------------------- /editor.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import json 3 | 4 | def create_conversation_pair(question_text, answer_text): 5 | # Split the strings into lines assuming each line is a separate question or answer 6 | question_strings = question_text.strip().split('\n') 7 | answer_strings = answer_text.strip().split('\n') 8 | 9 | # Ensure the lengths of question_strings and answer_strings are 
the same 10 | min_length = min(len(question_strings), len(answer_strings)) 11 | 12 | # Create conversation pairs 13 | conversations = [] 14 | 15 | for i in range(min_length): 16 | # Create conversation pair 17 | conversation_pair = [ 18 | {"from": "human", "value": question_strings[i]}, 19 | {"from": "gpt", "value": answer_strings[i]} 20 | ] 21 | 22 | conversations.append(conversation_pair) 23 | 24 | return conversations 25 | 26 | # Gradio GUI 27 | with gr.Blocks() as User_Interface_GUI: 28 | with gr.Row(): 29 | with gr.Column(): 30 | question_text_field = gr.components.Textbox(label="Question", lines=4, interactive=True) 31 | answer_text_field = gr.components.Textbox(label="Answer", lines=15, interactive=True) 32 | with gr.Row(): 33 | index_write_decision = gr.components.Radio(choices=["Create new index", "Add to the last index"], 34 | label="JSON Creation", value="Create new index", 35 | interactive=True) 36 | conversation_id_record = gr.Number(label="Conversation ID:", 37 | value=1, minimum=1, interactive=True) 38 | output_filename = gr.components.Textbox(label=["Filename"], 39 | value='shareGPT.json', interactive=True) 40 | with gr.Column(): 41 | save_json_shareGPT = gr.components.Button(value="Save") 42 | 43 | with gr.Column(): 44 | output_text_field = gr.components.Textbox(label="Output", lines=34) 45 | # 46 | # ============Functions start here============ 47 | def on_save_button_click(question_text,answer_text,filename_text,id_text): 48 | 49 | 50 | 51 | conversations = create_conversation_pair(question_text, answer_text) 52 | 53 | # Create JSON structure 54 | json_structure = [{"id": int(id_text), "conversations": conversation} for i, conversation in enumerate(conversations, 1)] 55 | 56 | # Export to JSON file 57 | with open(filename_text, 'w', encoding='utf-8') as json_file: 58 | json.dump(json_structure, json_file, indent=2) 59 | 60 | output_text_field.value = f"JSON saved to {filename_text}" 61 | 62 | save_json_shareGPT.click(on_save_button_click,[question_text_field,answer_text_field,output_filename,conversation_id_record]) 63 | 64 | 65 | 66 | 67 | User_Interface_GUI.launch() 68 | -------------------------------------------------------------------------------- /merge_2_multiturn_HEB.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | progressbar 2 | librosa 3 | num2words 4 | rotary_embedding_torch==0.4.0 5 | inflect 6 | unidecode 7 | torchaudio 8 | -------------------------------------------------------------------------------- /tortoise/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.3.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Betker" 5 | given-names: "James" 6 | orcid: "https://orcid.org/my-orcid?orcid=0000-0003-3259-4862" 7 | title: "TorToiSe text-to-speech" 8 | version: 2.0 9 | date-released: 2022-04-28 10 | url: "https://github.com/neonbjb/tortoise-tts" -------------------------------------------------------------------------------- /tortoise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.2.0-base-ubuntu22.04 2 | 3 | COPY . 
/app 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --allow-unauthenticated --no-install-recommends \ 7 | wget \ 8 | git \ 9 | && apt-get autoremove -y \ 10 | && apt-get clean -y \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | ENV HOME "/root" 14 | ENV CONDA_DIR "${HOME}/miniconda" 15 | ENV PATH="$CONDA_DIR/bin":$PATH 16 | ENV CONDA_AUTO_UPDATE_CONDA=false 17 | ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache" 18 | ENV TORTOISE_MODELS_DIR 19 | 20 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \ 21 | && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \ 22 | && "${CONDA_DIR}/bin/conda" init bash \ 23 | && rm -f /tmp/miniconda3.sh \ 24 | && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" 25 | 26 | # --login option used to source bashrc (thus activating conda env) at every RUN statement 27 | SHELL ["/bin/bash", "--login", "-c"] 28 | 29 | RUN conda create --name tortoise python=3.9 numba inflect \ 30 | && conda activate tortoise \ 31 | && conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \ 32 | && conda install transformers=4.29.2 \ 33 | && cd /app \ 34 | && python setup.py install 35 | -------------------------------------------------------------------------------- /tortoise/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /tortoise/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tortoise/data * 2 | recursive-include tortoise/voices * 3 | -------------------------------------------------------------------------------- /tortoise/TorToiSe.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: TorToiSe 3 | Version: 2.4.5 4 | Summary: A high quality multi-voice text-to-speech library 5 | Home-page: https://git.ecker.tech/mrq/tortoise-tts 6 | Author: James Betker 7 | Author-email: james@adamant.ai 8 | Classifier: Programming Language :: Python :: 3 9 | Classifier: License :: OSI Approved :: Apache Software License 10 | Classifier: Operating System :: OS Independent 11 | Requires-Python: >=3.6 12 | Description-Content-Type: text/markdown 13 | License-File: LICENSE 14 | 15 | # (QoL improvements for) TorToiSe 16 | 17 | This repo is for my modifications to [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts). If you need the original README, refer to the original repo. 18 | 19 | \> w-where'd everything go? 20 | 21 | Please migrate to [mrq/ai-voice-cloning](https://git.ecker.tech/mrq/ai-voice-cloning), as that repo is the more cohesive package for voice cloning. 22 | -------------------------------------------------------------------------------- /tortoise/TorToiSe.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | setup.py 5 | TorToiSe.egg-info/PKG-INFO 6 | TorToiSe.egg-info/SOURCES.txt 7 | TorToiSe.egg-info/dependency_links.txt 8 | TorToiSe.egg-info/requires.txt 9 | TorToiSe.egg-info/top_level.txt 10 | scripts/tortoise_tts.py 11 | tortoise/__init__.py 12 | tortoise/api.py 13 | tortoise/do_tts.py 14 | tortoise/eval.py 15 | tortoise/get_conditioning_latents.py 16 | tortoise/is_this_from_tortoise.py 17 | tortoise/read.py 18 | tortoise/data/got.txt 19 | tortoise/data/layman.txt 20 | tortoise/data/mel_norms.pth 21 | tortoise/data/riding_hood.txt 22 | tortoise/data/seal_copypasta.txt 23 | tortoise/data/tokenizer.json 24 | tortoise/models/__init__.py 25 | tortoise/models/activations.py 26 | tortoise/models/arch_util.py 27 | tortoise/models/autoregressive.py 28 | tortoise/models/bigvgan.py 29 | tortoise/models/classifier.py 30 | tortoise/models/clvp.py 31 | tortoise/models/cvvp.py 32 | tortoise/models/diffusion_decoder.py 33 | tortoise/models/random_latent_generator.py 34 | tortoise/models/transformer.py 35 | tortoise/models/vocoder.py 36 | tortoise/models/xtransformers.py 37 | tortoise/models/alias_free_torch/__init__.py 38 | tortoise/models/alias_free_torch/act.py 39 | tortoise/models/alias_free_torch/filter.py 40 | tortoise/models/alias_free_torch/resample.py 41 | tortoise/utils/__init__.py 42 | tortoise/utils/audio.py 43 | tortoise/utils/device.py 44 | tortoise/utils/diffusion.py 45 | tortoise/utils/stft.py 46 | tortoise/utils/text.py 47 | tortoise/utils/tokenizer.py 48 | tortoise/utils/torch_intermediary.py 49 | tortoise/utils/typical_sampling.py 50 | tortoise/utils/wav2vec_alignment.py -------------------------------------------------------------------------------- /tortoise/TorToiSe.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | 
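The MANIFEST.in shown above, together with include_package_data=True in setup.py later in this dump, is what bundles tortoise/data and tortoise/voices into an installed copy of the package. Below is a minimal sketch for checking that those bundled folders really land next to the installed code; it is purely illustrative, since the library resolves these paths internally and may do so differently.

```python
# Sketch: list the data/ and voices/ folders that MANIFEST.in bundles with the
# tortoise package. Illustrative only; tortoise locates these files itself.
import os
import tortoise

pkg_root = os.path.dirname(tortoise.__file__)
print(sorted(os.listdir(os.path.join(pkg_root, 'data'))))    # mel_norms.pth, tokenizer.json, ...
print(sorted(os.listdir(os.path.join(pkg_root, 'voices'))))  # sp1, ...
```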
-------------------------------------------------------------------------------- /tortoise/TorToiSe.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | rotary_embedding_torch 3 | inflect 4 | progressbar 5 | einops 6 | unidecode 7 | scipy 8 | librosa 9 | transformers 10 | tokenizers 11 | transformers==4.19 12 | torchaudio 13 | threadpoolctl 14 | appdirs 15 | numpy 16 | numba 17 | -------------------------------------------------------------------------------- /tortoise/TorToiSe.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | tortoise 2 | -------------------------------------------------------------------------------- /tortoise/convert/convert.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | rm .\in\.gitkeep 3 | rm .\out\.gitkeep 4 | for %%a in (".\in\*.*") do ffmpeg -i "%%a" -ac 1 ".\out\%%~na.wav" -------------------------------------------------------------------------------- /tortoise/convert/convert.sh: -------------------------------------------------------------------------------- 1 | for a in $(find "in/" -maxdepth 1 -not -name '.gitkeep' -type f); do ffmpeg -i "$a" -ac 1 "out/$(basename $a).wav"; done -------------------------------------------------------------------------------- /tortoise/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | rotary_embedding_torch 3 | transformers==4.31.0 4 | tokenizers 5 | inflect 6 | progressbar 7 | einops==0.4.1 8 | unidecode 9 | scipy 10 | librosa==0.9.1 11 | ffmpeg 12 | numpy 13 | numba 14 | torchaudio 15 | threadpoolctl 16 | llvmlite 17 | appdirs 18 | nbconvert==5.3.1 19 | tornado==4.2 20 | pydantic==1.9.1 21 | deepspeed==0.8.3 22 | py-cpuinfo 23 | hjson 24 | psutil 25 | -------------------------------------------------------------------------------- /tortoise/requirements_legacy.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | music-tag 3 | k-diffusion 4 | voicefixer -------------------------------------------------------------------------------- /tortoise/scripts/tortoise_tts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import tempfile 7 | import time 8 | 9 | import torch 10 | import torchaudio 11 | 12 | from tortoise.api import MODELS_DIR, TextToSpeech 13 | from tortoise.utils.audio import get_voices, load_voices, load_audio 14 | from tortoise.utils.text import split_and_recombine_text 15 | 16 | parser = argparse.ArgumentParser( 17 | description='TorToiSe is a text-to-speech program that is capable of synthesizing speech ' 18 | 'in multiple voices with realistic prosody and intonation.') 19 | 20 | parser.add_argument( 21 | 'text', type=str, nargs='*', 22 | help='Text to speak. If omitted, text is read from stdin.') 23 | parser.add_argument( 24 | '-v, --voice', type=str, default='random', metavar='VOICE', dest='voice', 25 | help='Selects the voice to use for generation. Use the & character to join two voices together. ' 26 | 'Use a comma to perform inference on multiple voices. Set to "all" to use all available voices. 
' 27 | 'Note that multiple voices require the --output-dir option to be set.') 28 | parser.add_argument( 29 | '-V, --voices-dir', metavar='VOICES_DIR', type=str, dest='voices_dir', 30 | help='Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories.') 31 | parser.add_argument( 32 | '-p, --preset', type=str, default='fast', choices=['ultra_fast', 'fast', 'standard', 'high_quality'], dest='preset', 33 | help='Which voice quality preset to use.') 34 | parser.add_argument( 35 | '-q, --quiet', default=False, action='store_true', dest='quiet', 36 | help='Suppress all output.') 37 | 38 | output_group = parser.add_mutually_exclusive_group(required=True) 39 | output_group.add_argument( 40 | '-l, --list-voices', default=False, action='store_true', dest='list_voices', 41 | help='List available voices and exit.') 42 | output_group.add_argument( 43 | '-P, --play', action='store_true', dest='play', 44 | help='Play the audio (requires pydub).') 45 | output_group.add_argument( 46 | '-o, --output', type=str, metavar='OUTPUT', dest='output', 47 | help='Save the audio to a file.') 48 | output_group.add_argument( 49 | '-O, --output-dir', type=str, metavar='OUTPUT_DIR', dest='output_dir', 50 | help='Save the audio to a directory as individual segments.') 51 | 52 | multi_output_group = parser.add_argument_group('multi-output options (requires --output-dir)') 53 | multi_output_group.add_argument( 54 | '--candidates', type=int, default=1, 55 | help='How many output candidates to produce per-voice. Note that only the first candidate is used in the combined output.') 56 | multi_output_group.add_argument( 57 | '--regenerate', type=str, default=None, 58 | help='Comma-separated list of clip numbers to re-generate.') 59 | multi_output_group.add_argument( 60 | '--skip-existing', action='store_true', 61 | help='Set to skip re-generating existing clips.') 62 | 63 | advanced_group = parser.add_argument_group('advanced options') 64 | advanced_group.add_argument( 65 | '--produce-debug-state', default=False, action='store_true', 66 | help='Whether or not to produce debug_states in current directory, which can aid in reproducing problems.') 67 | advanced_group.add_argument( 68 | '--seed', type=int, default=None, 69 | help='Random seed which can be used to reproduce results.') 70 | advanced_group.add_argument( 71 | '--models-dir', type=str, default=MODELS_DIR, 72 | help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to ' 73 | '~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints.') 74 | advanced_group.add_argument( 75 | '--text-split', type=str, default=None, 76 | help='How big chunks to split the text into, in the format ,.') 77 | advanced_group.add_argument( 78 | '--disable-redaction', default=False, action='store_true', 79 | help='Normally text enclosed in brackets are automatically redacted from the spoken output ' 80 | '(but are still rendered by the model), this can be used for prompt engineering. ' 81 | 'Set this to disable this behavior.') 82 | advanced_group.add_argument( 83 | '--device', type=str, default=None, 84 | help='Device to use for inference.') 85 | advanced_group.add_argument( 86 | '--batch-size', type=int, default=None, 87 | help='Batch size to use for inference. 
If omitted, the batch size is set based on available GPU memory.') 88 | 89 | tuning_group = parser.add_argument_group('tuning options (overrides preset settings)') 90 | tuning_group.add_argument( 91 | '--num-autoregressive-samples', type=int, default=None, 92 | help='Number of samples taken from the autoregressive model, all of which are filtered using CLVP. ' 93 | 'As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great".') 94 | tuning_group.add_argument( 95 | '--temperature', type=float, default=None, 96 | help='The softmax temperature of the autoregressive model.') 97 | tuning_group.add_argument( 98 | '--length-penalty', type=float, default=None, 99 | help='A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs.') 100 | tuning_group.add_argument( 101 | '--repetition-penalty', type=float, default=None, 102 | help='A penalty that prevents the autoregressive decoder from repeating itself during decoding. ' 103 | 'Can be used to reduce the incidence of long silences or "uhhhhhhs", etc.') 104 | tuning_group.add_argument( 105 | '--top-p', type=float, default=None, 106 | help='P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs.') 107 | tuning_group.add_argument( 108 | '--max-mel-tokens', type=int, default=None, 109 | help='Restricts the output length. 1 to 600. Each unit is 1/20 of a second.') 110 | tuning_group.add_argument( 111 | '--cvvp-amount', type=float, default=None, 112 | help='How much the CVVP model should influence the output.' 113 | 'Increasing this can in some cases reduce the likelihood of multiple speakers.') 114 | tuning_group.add_argument( 115 | '--diffusion-iterations', type=int, default=None, 116 | help='Number of diffusion steps to perform. More steps means the network has more chances to iteratively' 117 | 'refine the output, which should theoretically mean a higher quality output. ' 118 | 'Generally a value above 250 is not noticeably better, however.') 119 | tuning_group.add_argument( 120 | '--cond-free', type=bool, default=None, 121 | help='Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for ' 122 | 'each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output ' 123 | 'of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and ' 124 | 'dramatically improves realism.') 125 | tuning_group.add_argument( 126 | '--cond-free-k', type=float, default=None, 127 | help='Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. ' 128 | 'As cond_free_k increases, the output becomes dominated by the conditioning-free signal. ' 129 | 'Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k') 130 | tuning_group.add_argument( 131 | '--diffusion-temperature', type=float, default=None, 132 | help='Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 ' 133 | 'are the "mean" prediction of the diffusion network and will sound bland and smeared. ') 134 | 135 | usage_examples = f''' 136 | Examples: 137 | 138 | Read text using random voice and place it in a file: 139 | 140 | {parser.prog} -o hello.wav "Hello, how are you?" 141 | 142 | Read text from stdin and play it using the tom voice: 143 | 144 | echo "Say it like you mean it!" 
| {parser.prog} -P -v tom 145 | 146 | Read a text file using multiple voices and save the audio clips to a directory: 147 | 148 | {parser.prog} -O /tmp/tts-results -v tom,emma max_length: 183 | parser.error(f'--text-split: desired_length ({desired_length}) must be <= max_length ({max_length})') 184 | texts = split_and_recombine_text(text, desired_length, max_length) 185 | else: 186 | texts = split_and_recombine_text(text) 187 | if len(texts) == 0: 188 | parser.error('no text provided') 189 | 190 | if args.output_dir: 191 | os.makedirs(args.output_dir, exist_ok=True) 192 | else: 193 | if len(selected_voices) > 1: 194 | parser.error('cannot have multiple voices without --output-dir"') 195 | if args.candidates > 1: 196 | parser.error('cannot have multiple candidates without --output-dir"') 197 | 198 | # error out early if pydub isn't installed 199 | if args.play: 200 | try: 201 | import pydub 202 | import pydub.playback 203 | except ImportError: 204 | parser.error('--play requires pydub to be installed, which can be done with "pip install pydub"') 205 | 206 | seed = int(time.time()) if args.seed is None else args.seed 207 | if not args.quiet: 208 | print('Loading tts...') 209 | tts = TextToSpeech(models_dir=args.models_dir, enable_redaction=not args.disable_redaction, 210 | device=args.device, autoregressive_batch_size=args.batch_size) 211 | gen_settings = { 212 | 'use_deterministic_seed': seed, 213 | 'verbose': not args.quiet, 214 | 'k': args.candidates, 215 | 'preset': args.preset, 216 | } 217 | tuning_options = [ 218 | 'num_autoregressive_samples', 'temperature', 'length_penalty', 'repetition_penalty', 'top_p', 219 | 'max_mel_tokens', 'cvvp_amount', 'diffusion_iterations', 'cond_free', 'cond_free_k', 'diffusion_temperature'] 220 | for option in tuning_options: 221 | if getattr(args, option) is not None: 222 | gen_settings[option] = getattr(args, option) 223 | total_clips = len(texts) * len(selected_voices) 224 | regenerate_clips = [int(x) for x in args.regenerate.split(',')] if args.regenerate else None 225 | for voice_idx, voice in enumerate(selected_voices): 226 | audio_parts = [] 227 | voice_samples, conditioning_latents = load_voices(voice, extra_voice_dirs) 228 | for text_idx, text in enumerate(texts): 229 | clip_name = f'{"-".join(voice)}_{text_idx:02d}' 230 | if args.output_dir: 231 | first_clip = os.path.join(args.output_dir, f'{clip_name}_00.wav') 232 | if (args.skip_existing or (regenerate_clips and text_idx not in regenerate_clips)) and os.path.exists(first_clip): 233 | audio_parts.append(load_audio(first_clip, 24000)) 234 | if not args.quiet: 235 | print(f'Skipping {clip_name}') 236 | continue 237 | if not args.quiet: 238 | print(f'Rendering {clip_name} ({(voice_idx * len(texts) + text_idx + 1)} of {total_clips})...') 239 | print(' ' + text) 240 | gen = tts.tts_with_preset( 241 | text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, **gen_settings) 242 | gen = gen if args.candidates > 1 else [gen] 243 | for candidate_idx, audio in enumerate(gen): 244 | audio = audio.squeeze(0).cpu() 245 | if candidate_idx == 0: 246 | audio_parts.append(audio) 247 | if args.output_dir: 248 | filename = f'{clip_name}_{candidate_idx:02d}.wav' 249 | torchaudio.save(os.path.join(args.output_dir, filename), audio, 24000) 250 | 251 | audio = torch.cat(audio_parts, dim=-1) 252 | if args.output_dir: 253 | filename = f'{"-".join(voice)}_combined.wav' 254 | torchaudio.save(os.path.join(args.output_dir, filename), audio, 24000) 255 | elif args.output: 256 | filename = args.output if 
args.output else os.tmp 257 | torchaudio.save(args.output, audio, 24000) 258 | elif args.play: 259 | f = tempfile.NamedTemporaryFile(suffix='.wav', delete=True) 260 | torchaudio.save(f.name, audio, 24000) 261 | pydub.playback.play(pydub.AudioSegment.from_wav(f.name)) 262 | 263 | if args.produce_debug_state: 264 | os.makedirs('debug_states', exist_ok=True) 265 | dbg_state = (seed, texts, voice_samples, conditioning_latents, args) 266 | torch.save(dbg_state, os.path.join('debug_states', f'debug_{"-".join(voice)}.pth')) 267 | -------------------------------------------------------------------------------- /tortoise/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="TorToiSe", 8 | packages=setuptools.find_packages(), 9 | version="2.7.0", 10 | author="James Betker", 11 | author_email="james@adamant.ai", 12 | description="A high quality multi-voice text-to-speech library", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/neonbjb/tortoise-tts", 16 | project_urls={}, 17 | scripts=[ 18 | 'scripts/tortoise_tts.py', 19 | ], 20 | include_package_data=True, 21 | install_requires=[ 22 | 'tqdm', 23 | 'rotary_embedding_torch', 24 | 'inflect', 25 | 'progressbar', 26 | 'einops', 27 | 'unidecode', 28 | 'scipy', 29 | 'librosa', 30 | 'transformers==4.31.0', 31 | 'tokenizers', 32 | ], 33 | classifiers=[ 34 | "Programming Language :: Python :: 3", 35 | "License :: OSI Approved :: Apache Software License", 36 | "Operating System :: OS Independent", 37 | ], 38 | python_requires=">=3.6", 39 | ) 40 | -------------------------------------------------------------------------------- /tortoise/tortoise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/__init__.py -------------------------------------------------------------------------------- /tortoise/tortoise/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/__pycache__/api.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/__pycache__/api.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/data/layman.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/data/layman.txt -------------------------------------------------------------------------------- /tortoise/tortoise/data/mel_norms.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/data/mel_norms.pth 
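setup.py above installs the tortoise package along with scripts/tortoise_tts.py, and do_tts.py further down in this dump drives the same models directly from Python. The following is a minimal sketch of that programmatic path, assuming the package was installed as in the Dockerfile (python setup.py install); the imports and keyword arguments mirror the calls made in eval.py, do_tts.py and scripts/tortoise_tts.py in this repository, while the voice choice, text, and output filename are illustrative.

```python
# Minimal sketch of programmatic use of the tortoise package installed above.
# Call signatures follow eval.py / do_tts.py / scripts/tortoise_tts.py; anything
# else (voice choice, text, filename) is an illustrative assumption.
import torchaudio

from tortoise.api import TextToSpeech, MODELS_DIR
from tortoise.utils.audio import load_voices

tts = TextToSpeech(models_dir=MODELS_DIR)  # checkpoints are fetched on first use

# 'sp1' is the voice folder bundled under tortoise/voices/ in the tree above.
voice_samples, conditioning_latents = load_voices(['sp1'])

gen = tts.tts_with_preset(
    "The expressiveness of autoregressive transformers is literally nuts!",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset='fast',  # same presets the CLI exposes: ultra_fast, fast, standard, high_quality
)

# Tortoise renders 24 kHz audio; the bundled scripts save with the same rate.
torchaudio.save('sp1_demo.wav', gen.squeeze(0).cpu(), 24000)
```

With the default single candidate, tts_with_preset returns one tensor (scripts/tortoise_tts.py wraps it in a list only when --candidates is greater than 1), so the squeeze-and-save pattern above matches both bundled scripts.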
-------------------------------------------------------------------------------- /tortoise/tortoise/data/riding_hood.txt: -------------------------------------------------------------------------------- 1 | Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her. It suited the girl so extremely well that everybody called her Little Red Riding Hood. 2 | One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter." 3 | 4 | Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. 5 | 6 | As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." 7 | 8 | "Does she live far off?" said the wolf 9 | 10 | "Oh I say," answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." 11 | 12 | "Well," said the wolf, "and I'll go and see her too. I'll go this way and go you that, and we shall see who will be there first." 13 | 14 | The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap. 15 | 16 | "Who's there?" 17 | 18 | "Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother." 19 | 20 | The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up." 21 | 22 | The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten. He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. 23 | 24 | "Who's there?" 25 | 26 | Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you." 27 | 28 | The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." 29 | 30 | Little Red Riding Hood pulled the bobbin, and the door opened. 31 | 32 | The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." 33 | 34 | Little Red Riding Hood took off her clothes and got into bed. She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" 35 | 36 | "All the better to hug you with, my dear." 
37 | 38 | "Grandmother, what big legs you have!" 39 | 40 | "All the better to run with, my child." 41 | 42 | "Grandmother, what big ears you have!" 43 | 44 | "All the better to hear with, my child." 45 | 46 | "Grandmother, what big eyes you have!" 47 | 48 | "All the better to see with, my child." 49 | 50 | "Grandmother, what big teeth you have got!" 51 | 52 | "All the better to eat you up with." 53 | 54 | And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up. -------------------------------------------------------------------------------- /tortoise/tortoise/data/seal_copypasta.txt: -------------------------------------------------------------------------------- 1 | What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al kayda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire U S armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the U S A and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo. 
-------------------------------------------------------------------------------- /tortoise/tortoise/data/tokenizer.json: -------------------------------------------------------------------------------- 1 | {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r 
i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} -------------------------------------------------------------------------------- /tortoise/tortoise/do_tts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torchaudio 6 | 7 | from api import TextToSpeech, MODELS_DIR 8 | from utils.audio import load_voices 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--text', type=str, help='Text to speak.', default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.") 13 | parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' 14 | 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random') 15 | parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast') 16 | parser.add_argument('--use_deepspeed', type=str, help='Which voice preset to use.', default=False) 17 | parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True) 18 | parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True) 19 | parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/') 20 | parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this' 21 | 'should only be specified if you have custom checkpoints.', default=MODELS_DIR) 22 | parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice.', default=3) 23 | parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None) 24 | parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True) 25 | parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.' 26 | 'Increasing this can in some cases reduce the likelihood of multiple speakers. 
Defaults to 0 (disabled)', default=.0) 27 | args = parser.parse_args() 28 | if torch.backends.mps.is_available(): 29 | args.use_deepspeed = False 30 | os.makedirs(args.output_path, exist_ok=True) 31 | tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half) 32 | 33 | selected_voices = args.voice.split(',') 34 | for k, selected_voice in enumerate(selected_voices): 35 | if '&' in selected_voice: 36 | voice_sel = selected_voice.split('&') 37 | else: 38 | voice_sel = [selected_voice] 39 | voice_samples, conditioning_latents = load_voices(voice_sel) 40 | 41 | gen, dbg_state = tts.tts_with_preset(args.text, k=args.candidates, voice_samples=voice_samples, conditioning_latents=conditioning_latents, 42 | preset=args.preset, use_deterministic_seed=args.seed, return_deterministic_state=True, cvvp_amount=args.cvvp_amount) 43 | if isinstance(gen, list): 44 | for j, g in enumerate(gen): 45 | torchaudio.save(os.path.join(args.output_path, f'{selected_voice}_{k}_{j}.wav'), g.squeeze(0).cpu(), 24000) 46 | else: 47 | torchaudio.save(os.path.join(args.output_path, f'{selected_voice}_{k}.wav'), gen.squeeze(0).cpu(), 24000) 48 | 49 | if args.produce_debug_state: 50 | os.makedirs('debug_states', exist_ok=True) 51 | torch.save(dbg_state, f'debug_states/do_tts_debug_{selected_voice}.pth') 52 | 53 | -------------------------------------------------------------------------------- /tortoise/tortoise/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torchaudio 5 | 6 | from api import TextToSpeech 7 | from tortoise.utils.audio import load_audio 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--eval_path', type=str, help='Path to TSV test file', default="D:\\tmp\\tortoise-tts-eval\\test.tsv") 12 | parser.add_argument('--output_path', type=str, help='Where to put results', default="D:\\tmp\\tortoise-tts-eval\\baseline") 13 | parser.add_argument('--preset', type=str, help='Rendering preset.', default="standard") 14 | args = parser.parse_args() 15 | os.makedirs(args.output_path, exist_ok=True) 16 | 17 | tts = TextToSpeech() 18 | 19 | with open(args.eval_path, 'r', encoding='utf-8') as f: 20 | lines = f.readlines() 21 | 22 | for line in lines: 23 | text, real = line.strip().split('\t') 24 | conds = [load_audio(real, 22050)] 25 | gen = tts.tts_with_preset(text, voice_samples=conds, conditioning_latents=None, preset=args.preset) 26 | torchaudio.save(os.path.join(args.output_path, os.path.basename(real)), gen.squeeze(0).cpu(), 24000) 27 | 28 | -------------------------------------------------------------------------------- /tortoise/tortoise/get_conditioning_latents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | 5 | from api import TextToSpeech 6 | from tortoise.utils.audio import load_audio, get_voices 7 | 8 | """ 9 | Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for 10 | other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities. 
11 | """ 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2') 15 | parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents') 16 | args = parser.parse_args() 17 | os.makedirs(args.output_path, exist_ok=True) 18 | 19 | tts = TextToSpeech() 20 | voices = get_voices() 21 | selected_voices = args.voice.split(',') 22 | for voice in selected_voices: 23 | cond_paths = voices[voice] 24 | conds = [] 25 | for cond_path in cond_paths: 26 | c = load_audio(cond_path, 22050) 27 | conds.append(c) 28 | conditioning_latents = tts.get_conditioning_latents(conds) 29 | torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth')) 30 | 31 | -------------------------------------------------------------------------------- /tortoise/tortoise/is_this_from_tortoise.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from api import classify_audio_clip 4 | from tortoise.utils.audio import load_audio 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--clip', type=str, help='Path to an audio clip to classify.', default="../examples/favorite_riding_hood.mp3") 9 | args = parser.parse_args() 10 | 11 | clip = load_audio(args.clip, 24000) 12 | clip = clip[:, :220000] 13 | prob = classify_audio_clip(clip) 14 | print(f"This classifier thinks there is a {prob*100}% chance that this clip was generated from Tortoise.") -------------------------------------------------------------------------------- /tortoise/tortoise/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__init__.py -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/activations.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/activations.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/arch_util.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/arch_util.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/autoregressive.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/autoregressive.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/bigvgan.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/bigvgan.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/classifier.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/classifier.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/clvp.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/clvp.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/cvvp.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/cvvp.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/diffusion_decoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/diffusion_decoder.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/random_latent_generator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/random_latent_generator.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/vocoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/vocoder.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/__pycache__/xtransformers.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/__pycache__/xtransformers.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/activations.py: -------------------------------------------------------------------------------- 1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | from torch import nn, sin, pow 6 | from torch.nn import Parameter 7 | 8 | 9 | class Snake(nn.Module): 10 | ''' 11 | Implementation of a sine-based periodic activation function 12 | Shape: 13 | - Input: (B, C, T) 14 | - Output: (B, C, T), same shape as the input 15 | Parameters: 16 | - alpha - trainable parameter 17 | References: 18 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 19 | https://arxiv.org/abs/2006.08195 20 | Examples: 21 | >>> a1 = snake(256) 22 | >>> x = torch.randn(256) 23 | >>> x = a1(x) 24 | ''' 25 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 26 | ''' 27 | Initialization. 28 | INPUT: 29 | - in_features: shape of the input 30 | - alpha: trainable parameter 31 | alpha is initialized to 1 by default, higher values = higher-frequency. 32 | alpha will be trained along with the rest of your model. 33 | ''' 34 | super(Snake, self).__init__() 35 | self.in_features = in_features 36 | 37 | # initialize alpha 38 | self.alpha_logscale = alpha_logscale 39 | if self.alpha_logscale: # log scale alphas initialized to zeros 40 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 41 | else: # linear scale alphas initialized to ones 42 | self.alpha = Parameter(torch.ones(in_features) * alpha) 43 | 44 | self.alpha.requires_grad = alpha_trainable 45 | 46 | self.no_div_by_zero = 0.000000001 47 | 48 | def forward(self, x): 49 | ''' 50 | Forward pass of the function. 51 | Applies the function to the input elementwise. 52 | Snake ∶= x + 1/a * sin^2 (xa) 53 | ''' 54 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 55 | if self.alpha_logscale: 56 | alpha = torch.exp(alpha) 57 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 58 | 59 | return x 60 | 61 | 62 | class SnakeBeta(nn.Module): 63 | ''' 64 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 65 | Shape: 66 | - Input: (B, C, T) 67 | - Output: (B, C, T), same shape as the input 68 | Parameters: 69 | - alpha - trainable parameter that controls frequency 70 | - beta - trainable parameter that controls magnitude 71 | References: 72 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 73 | https://arxiv.org/abs/2006.08195 74 | Examples: 75 | >>> a1 = snakebeta(256) 76 | >>> x = torch.randn(256) 77 | >>> x = a1(x) 78 | ''' 79 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 80 | ''' 81 | Initialization. 82 | INPUT: 83 | - in_features: shape of the input 84 | - alpha - trainable parameter that controls frequency 85 | - beta - trainable parameter that controls magnitude 86 | alpha is initialized to 1 by default, higher values = higher-frequency. 87 | beta is initialized to 1 by default, higher values = higher-magnitude. 
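Note: in this implementation both alpha and beta are initialized from the same `alpha` argument and share the `alpha_trainable` flag.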
88 | alpha will be trained along with the rest of your model. 89 | ''' 90 | super(SnakeBeta, self).__init__() 91 | self.in_features = in_features 92 | 93 | # initialize alpha 94 | self.alpha_logscale = alpha_logscale 95 | if self.alpha_logscale: # log scale alphas initialized to zeros 96 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 97 | self.beta = Parameter(torch.zeros(in_features) * alpha) 98 | else: # linear scale alphas initialized to ones 99 | self.alpha = Parameter(torch.ones(in_features) * alpha) 100 | self.beta = Parameter(torch.ones(in_features) * alpha) 101 | 102 | self.alpha.requires_grad = alpha_trainable 103 | self.beta.requires_grad = alpha_trainable 104 | 105 | self.no_div_by_zero = 0.000000001 106 | 107 | def forward(self, x): 108 | ''' 109 | Forward pass of the function. 110 | Applies the function to the input elementwise. 111 | SnakeBeta ∶= x + 1/b * sin^2 (xa) 112 | ''' 113 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 114 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 115 | if self.alpha_logscale: 116 | alpha = torch.exp(alpha) 117 | beta = torch.exp(beta) 118 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 119 | 120 | return x -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/alias_free_torch/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/__pycache__/act.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/alias_free_torch/__pycache__/act.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/__pycache__/filter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/alias_free_torch/__pycache__/filter.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/__pycache__/resample.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/models/alias_free_torch/__pycache__/resample.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/act.py: 
-------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__(self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12): 15 | super().__init__() 16 | self.up_ratio = up_ratio 17 | self.down_ratio = down_ratio 18 | self.act = activation 19 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 20 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 21 | 22 | # x: [B,C,T] 23 | def forward(self, x): 24 | x = self.upsample(x) 25 | x = self.act(x) 26 | x = self.downsample(x) 27 | 28 | return x -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if 'sinc' in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where(x == 0, 21 | torch.tensor(1., device=x.device, dtype=x.dtype), 22 | torch.sin(math.pi * x) / math.pi / x) 23 | 24 | 25 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 26 | # https://adefossez.github.io/julius/julius/lowpass.html 27 | # LICENSE is in incl_licenses directory. 28 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 29 | even = (kernel_size % 2 == 0) 30 | half_size = kernel_size // 2 31 | 32 | #For kaiser window 33 | delta_f = 4 * half_width 34 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 35 | if A > 50.: 36 | beta = 0.1102 * (A - 8.7) 37 | elif A >= 21.: 38 | beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) 39 | else: 40 | beta = 0. 41 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 42 | 43 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 44 | if even: 45 | time = (torch.arange(-half_size, half_size) + 0.5) 46 | else: 47 | time = torch.arange(kernel_size) - half_size 48 | if cutoff == 0: 49 | filter_ = torch.zeros_like(time) 50 | else: 51 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 52 | # Normalize filter to have sum = 1, otherwise we will have a small leakage 53 | # of the constant component in the input signal. 54 | filter_ /= filter_.sum() 55 | filter = filter_.view(1, 1, kernel_size) 56 | 57 | return filter 58 | 59 | 60 | class LowPassFilter1d(nn.Module): 61 | def __init__(self, 62 | cutoff=0.5, 63 | half_width=0.6, 64 | stride: int = 1, 65 | padding: bool = True, 66 | padding_mode: str = 'replicate', 67 | kernel_size: int = 12): 68 | # kernel_size should be even number for stylegan3 setup, 69 | # in this implementation, odd number is also possible. 
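# cutoff and half_width are normalized frequencies (fractions of the sampling rate); a cutoff above 0.5 (Nyquist) is rejected below.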
70 | super().__init__() 71 | if cutoff < -0.: 72 | raise ValueError("Minimum cutoff must be larger than zero.") 73 | if cutoff > 0.5: 74 | raise ValueError("A cutoff above 0.5 does not make sense.") 75 | self.kernel_size = kernel_size 76 | self.even = (kernel_size % 2 == 0) 77 | self.pad_left = kernel_size // 2 - int(self.even) 78 | self.pad_right = kernel_size // 2 79 | self.stride = stride 80 | self.padding = padding 81 | self.padding_mode = padding_mode 82 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 83 | self.register_buffer("filter", filter) 84 | 85 | #input [B, C, T] 86 | def forward(self, x): 87 | _, C, _ = x.shape 88 | 89 | if self.padding: 90 | x = F.pad(x, (self.pad_left, self.pad_right), 91 | mode=self.padding_mode) 92 | out = F.conv1d(x, self.filter.expand(C, -1, -1), 93 | stride=self.stride, groups=C) 94 | 95 | return out -------------------------------------------------------------------------------- /tortoise/tortoise/models/alias_free_torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /tortoise/tortoise/models/arch_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import functools 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torchaudio 9 | from tortoise.models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias 10 | 11 | 12 | def zero_module(module): 13 | """ 14 | Zero out the parameters of a module and return it. 
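It is used below on the final convolution of attention and residual branches so that each block initially behaves as an identity mapping.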
15 | """ 16 | for p in module.parameters(): 17 | p.detach().zero_() 18 | return module 19 | 20 | 21 | class GroupNorm32(nn.GroupNorm): 22 | def forward(self, x): 23 | return super().forward(x.float()).type(x.dtype) 24 | 25 | 26 | def normalization(channels): 27 | """ 28 | Make a standard normalization layer. 29 | 30 | :param channels: number of input channels. 31 | :return: an nn.Module for normalization. 32 | """ 33 | groups = 32 34 | if channels <= 16: 35 | groups = 8 36 | elif channels <= 64: 37 | groups = 16 38 | while channels % groups != 0: 39 | groups = int(groups / 2) 40 | assert groups > 2 41 | return GroupNorm32(groups, channels) 42 | 43 | 44 | class QKVAttentionLegacy(nn.Module): 45 | """ 46 | A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping 47 | """ 48 | 49 | def __init__(self, n_heads): 50 | super().__init__() 51 | self.n_heads = n_heads 52 | 53 | def forward(self, qkv, mask=None, rel_pos=None): 54 | """ 55 | Apply QKV attention. 56 | 57 | :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. 58 | :return: an [N x (H * C) x T] tensor after attention. 59 | """ 60 | bs, width, length = qkv.shape 61 | assert width % (3 * self.n_heads) == 0 62 | ch = width // (3 * self.n_heads) 63 | q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) 64 | scale = 1 / math.sqrt(math.sqrt(ch)) 65 | weight = torch.einsum( 66 | "bct,bcs->bts", q * scale, k * scale 67 | ) # More stable with f16 than dividing afterwards 68 | if rel_pos is not None: 69 | weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1]) 70 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 71 | if mask is not None: 72 | # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs. 73 | mask = mask.repeat(self.n_heads, 1).unsqueeze(1) 74 | weight = weight * mask 75 | a = torch.einsum("bts,bcs->bct", weight, v) 76 | 77 | return a.reshape(bs, -1, length) 78 | 79 | 80 | class AttentionBlock(nn.Module): 81 | """ 82 | An attention block that allows spatial positions to attend to each other. 83 | 84 | Originally ported from here, but adapted to the N-d case. 85 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
86 | """ 87 | 88 | def __init__( 89 | self, 90 | channels, 91 | num_heads=1, 92 | num_head_channels=-1, 93 | do_checkpoint=True, 94 | relative_pos_embeddings=False, 95 | ): 96 | super().__init__() 97 | self.channels = channels 98 | self.do_checkpoint = do_checkpoint 99 | if num_head_channels == -1: 100 | self.num_heads = num_heads 101 | else: 102 | assert ( 103 | channels % num_head_channels == 0 104 | ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" 105 | self.num_heads = channels // num_head_channels 106 | self.norm = normalization(channels) 107 | self.qkv = nn.Conv1d(channels, channels * 3, 1) 108 | # split heads before split qkv 109 | self.attention = QKVAttentionLegacy(self.num_heads) 110 | 111 | self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) 112 | if relative_pos_embeddings: 113 | self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64) 114 | else: 115 | self.relative_pos_embeddings = None 116 | 117 | def forward(self, x, mask=None): 118 | b, c, *spatial = x.shape 119 | x = x.reshape(b, c, -1) 120 | qkv = self.qkv(self.norm(x)) 121 | h = self.attention(qkv, mask, self.relative_pos_embeddings) 122 | h = self.proj_out(h) 123 | return (x + h).reshape(b, c, *spatial) 124 | 125 | 126 | class Upsample(nn.Module): 127 | """ 128 | An upsampling layer with an optional convolution. 129 | 130 | :param channels: channels in the inputs and outputs. 131 | :param use_conv: a bool determining if a convolution is applied. 132 | """ 133 | 134 | def __init__(self, channels, use_conv, out_channels=None, factor=4): 135 | super().__init__() 136 | self.channels = channels 137 | self.out_channels = out_channels or channels 138 | self.use_conv = use_conv 139 | self.factor = factor 140 | if use_conv: 141 | ksize = 5 142 | pad = 2 143 | self.conv = nn.Conv1d(self.channels, self.out_channels, ksize, padding=pad) 144 | 145 | def forward(self, x): 146 | assert x.shape[1] == self.channels 147 | x = F.interpolate(x, scale_factor=self.factor, mode="nearest") 148 | if self.use_conv: 149 | x = self.conv(x) 150 | return x 151 | 152 | 153 | class Downsample(nn.Module): 154 | """ 155 | A downsampling layer with an optional convolution. 156 | 157 | :param channels: channels in the inputs and outputs. 158 | :param use_conv: a bool determining if a convolution is applied. 
159 | """ 160 | 161 | def __init__(self, channels, use_conv, out_channels=None, factor=4, ksize=5, pad=2): 162 | super().__init__() 163 | self.channels = channels 164 | self.out_channels = out_channels or channels 165 | self.use_conv = use_conv 166 | 167 | stride = factor 168 | if use_conv: 169 | self.op = nn.Conv1d( 170 | self.channels, self.out_channels, ksize, stride=stride, padding=pad 171 | ) 172 | else: 173 | assert self.channels == self.out_channels 174 | self.op = nn.AvgPool1d(kernel_size=stride, stride=stride) 175 | 176 | def forward(self, x): 177 | assert x.shape[1] == self.channels 178 | return self.op(x) 179 | 180 | 181 | class ResBlock(nn.Module): 182 | def __init__( 183 | self, 184 | channels, 185 | dropout, 186 | out_channels=None, 187 | use_conv=False, 188 | use_scale_shift_norm=False, 189 | up=False, 190 | down=False, 191 | kernel_size=3, 192 | ): 193 | super().__init__() 194 | self.channels = channels 195 | self.dropout = dropout 196 | self.out_channels = out_channels or channels 197 | self.use_conv = use_conv 198 | self.use_scale_shift_norm = use_scale_shift_norm 199 | padding = 1 if kernel_size == 3 else 2 200 | 201 | self.in_layers = nn.Sequential( 202 | normalization(channels), 203 | nn.SiLU(), 204 | nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), 205 | ) 206 | 207 | self.updown = up or down 208 | 209 | if up: 210 | self.h_upd = Upsample(channels, False) 211 | self.x_upd = Upsample(channels, False) 212 | elif down: 213 | self.h_upd = Downsample(channels, False) 214 | self.x_upd = Downsample(channels, False) 215 | else: 216 | self.h_upd = self.x_upd = nn.Identity() 217 | 218 | self.out_layers = nn.Sequential( 219 | normalization(self.out_channels), 220 | nn.SiLU(), 221 | nn.Dropout(p=dropout), 222 | zero_module( 223 | nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) 224 | ), 225 | ) 226 | 227 | if self.out_channels == channels: 228 | self.skip_connection = nn.Identity() 229 | elif use_conv: 230 | self.skip_connection = nn.Conv1d( 231 | channels, self.out_channels, kernel_size, padding=padding 232 | ) 233 | else: 234 | self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) 235 | 236 | def forward(self, x): 237 | if self.updown: 238 | in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] 239 | h = in_rest(x) 240 | h = self.h_upd(h) 241 | x = self.x_upd(x) 242 | h = in_conv(h) 243 | else: 244 | h = self.in_layers(x) 245 | h = self.out_layers(h) 246 | return self.skip_connection(x) + h 247 | 248 | 249 | class AudioMiniEncoder(nn.Module): 250 | def __init__(self, 251 | spec_dim, 252 | embedding_dim, 253 | base_channels=128, 254 | depth=2, 255 | resnet_blocks=2, 256 | attn_blocks=4, 257 | num_attn_heads=4, 258 | dropout=0, 259 | downsample_factor=2, 260 | kernel_size=3): 261 | super().__init__() 262 | self.init = nn.Sequential( 263 | nn.Conv1d(spec_dim, base_channels, 3, padding=1) 264 | ) 265 | ch = base_channels 266 | res = [] 267 | for l in range(depth): 268 | for r in range(resnet_blocks): 269 | res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) 270 | res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) 271 | ch *= 2 272 | self.res = nn.Sequential(*res) 273 | self.final = nn.Sequential( 274 | normalization(ch), 275 | nn.SiLU(), 276 | nn.Conv1d(ch, embedding_dim, 1) 277 | ) 278 | attn = [] 279 | for a in range(attn_blocks): 280 | attn.append(AttentionBlock(embedding_dim, num_attn_heads,)) 281 | self.attn = nn.Sequential(*attn) 282 | self.dim = embedding_dim 283 | 
284 | def forward(self, x): 285 | h = self.init(x) 286 | h = self.res(h) 287 | h = self.final(h) 288 | h = self.attn(h) 289 | return h[:, :, 0] 290 | 291 | 292 | DEFAULT_MEL_NORM_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/mel_norms.pth') 293 | 294 | 295 | class TorchMelSpectrogram(nn.Module): 296 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, mel_fmin=0, mel_fmax=8000, 297 | sampling_rate=22050, normalize=False, mel_norm_file=DEFAULT_MEL_NORM_FILE): 298 | super().__init__() 299 | # These are the default tacotron values for the MEL spectrogram. 300 | self.filter_length = filter_length 301 | self.hop_length = hop_length 302 | self.win_length = win_length 303 | self.n_mel_channels = n_mel_channels 304 | self.mel_fmin = mel_fmin 305 | self.mel_fmax = mel_fmax 306 | self.sampling_rate = sampling_rate 307 | self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length, 308 | win_length=self.win_length, power=2, normalized=normalize, 309 | sample_rate=self.sampling_rate, f_min=self.mel_fmin, 310 | f_max=self.mel_fmax, n_mels=self.n_mel_channels, 311 | norm="slaney") 312 | self.mel_norm_file = mel_norm_file 313 | if self.mel_norm_file is not None: 314 | self.mel_norms = torch.load(self.mel_norm_file) 315 | else: 316 | self.mel_norms = None 317 | 318 | def forward(self, inp): 319 | if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio) 320 | inp = inp.squeeze(1) 321 | assert len(inp.shape) == 2 322 | if torch.backends.mps.is_available(): 323 | inp = inp.to('cpu') 324 | self.mel_stft = self.mel_stft.to(inp.device) 325 | mel = self.mel_stft(inp) 326 | # Perform dynamic range compression 327 | mel = torch.log(torch.clamp(mel, min=1e-5)) 328 | if self.mel_norms is not None: 329 | self.mel_norms = self.mel_norms.to(mel.device) 330 | mel = mel / self.mel_norms.unsqueeze(0).unsqueeze(-1) 331 | return mel 332 | 333 | 334 | class CheckpointedLayer(nn.Module): 335 | """ 336 | Wraps a module. When forward() is called, passes kwargs that require_grad through torch.checkpoint() and bypasses 337 | checkpoint for all other args. 338 | """ 339 | def __init__(self, wrap): 340 | super().__init__() 341 | self.wrap = wrap 342 | 343 | def forward(self, x, *args, **kwargs): 344 | for k, v in kwargs.items(): 345 | assert not (isinstance(v, torch.Tensor) and v.requires_grad) # This would screw up checkpointing. 346 | partial = functools.partial(self.wrap, **kwargs) 347 | return partial(x, *args) 348 | 349 | 350 | class CheckpointedXTransformerEncoder(nn.Module): 351 | """ 352 | Wraps a ContinuousTransformerWrapper and applies CheckpointedLayer to each layer and permutes from channels-mid 353 | to channels-last that XTransformer expects. 
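(Concretely: (B, C, T) inputs are permuted to (B, T, C) before the transformer and permuted back afterwards, controlled by needs_permute / exit_permute.)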
354 | """ 355 | def __init__(self, needs_permute=True, exit_permute=True, checkpoint=True, **xtransformer_kwargs): 356 | super().__init__() 357 | self.transformer = ContinuousTransformerWrapper(**xtransformer_kwargs) 358 | self.needs_permute = needs_permute 359 | self.exit_permute = exit_permute 360 | 361 | if not checkpoint: 362 | return 363 | for i in range(len(self.transformer.attn_layers.layers)): 364 | n, b, r = self.transformer.attn_layers.layers[i] 365 | self.transformer.attn_layers.layers[i] = nn.ModuleList([n, CheckpointedLayer(b), r]) 366 | 367 | def forward(self, x, **kwargs): 368 | if self.needs_permute: 369 | x = x.permute(0,2,1) 370 | h = self.transformer(x, **kwargs) 371 | if self.exit_permute: 372 | h = h.permute(0,2,1) 373 | return h -------------------------------------------------------------------------------- /tortoise/tortoise/models/classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from tortoise.models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock 5 | 6 | 7 | class ResBlock(nn.Module): 8 | def __init__( 9 | self, 10 | channels, 11 | dropout, 12 | out_channels=None, 13 | use_conv=False, 14 | use_scale_shift_norm=False, 15 | dims=2, 16 | up=False, 17 | down=False, 18 | kernel_size=3, 19 | do_checkpoint=True, 20 | ): 21 | super().__init__() 22 | self.channels = channels 23 | self.dropout = dropout 24 | self.out_channels = out_channels or channels 25 | self.use_conv = use_conv 26 | self.use_scale_shift_norm = use_scale_shift_norm 27 | self.do_checkpoint = do_checkpoint 28 | padding = 1 if kernel_size == 3 else 2 29 | 30 | self.in_layers = nn.Sequential( 31 | normalization(channels), 32 | nn.SiLU(), 33 | nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), 34 | ) 35 | 36 | self.updown = up or down 37 | 38 | if up: 39 | self.h_upd = Upsample(channels, False, dims) 40 | self.x_upd = Upsample(channels, False, dims) 41 | elif down: 42 | self.h_upd = Downsample(channels, False, dims) 43 | self.x_upd = Downsample(channels, False, dims) 44 | else: 45 | self.h_upd = self.x_upd = nn.Identity() 46 | 47 | self.out_layers = nn.Sequential( 48 | normalization(self.out_channels), 49 | nn.SiLU(), 50 | nn.Dropout(p=dropout), 51 | zero_module( 52 | nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) 53 | ), 54 | ) 55 | 56 | if self.out_channels == channels: 57 | self.skip_connection = nn.Identity() 58 | elif use_conv: 59 | self.skip_connection = nn.Conv1d( 60 | dims, channels, self.out_channels, kernel_size, padding=padding 61 | ) 62 | else: 63 | self.skip_connection = nn.Conv1d(dims, channels, self.out_channels, 1) 64 | 65 | def forward(self, x): 66 | if self.updown: 67 | in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] 68 | h = in_rest(x) 69 | h = self.h_upd(h) 70 | x = self.x_upd(x) 71 | h = in_conv(h) 72 | else: 73 | h = self.in_layers(x) 74 | h = self.out_layers(h) 75 | return self.skip_connection(x) + h 76 | 77 | 78 | class AudioMiniEncoder(nn.Module): 79 | def __init__(self, 80 | spec_dim, 81 | embedding_dim, 82 | base_channels=128, 83 | depth=2, 84 | resnet_blocks=2, 85 | attn_blocks=4, 86 | num_attn_heads=4, 87 | dropout=0, 88 | downsample_factor=2, 89 | kernel_size=3): 90 | super().__init__() 91 | self.init = nn.Sequential( 92 | nn.Conv1d(spec_dim, base_channels, 3, padding=1) 93 | ) 94 | ch = base_channels 95 | res = [] 96 | self.layers = depth 97 | for l in range(depth): 98 | for r in 
range(resnet_blocks): 99 | res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) 100 | res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) 101 | ch *= 2 102 | self.res = nn.Sequential(*res) 103 | self.final = nn.Sequential( 104 | normalization(ch), 105 | nn.SiLU(), 106 | nn.Conv1d(ch, embedding_dim, 1) 107 | ) 108 | attn = [] 109 | for a in range(attn_blocks): 110 | attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) 111 | self.attn = nn.Sequential(*attn) 112 | self.dim = embedding_dim 113 | 114 | def forward(self, x): 115 | h = self.init(x) 116 | h = self.res(h) 117 | h = self.final(h) 118 | for blk in self.attn: 119 | h = blk(h) 120 | return h[:, :, 0] 121 | 122 | 123 | class AudioMiniEncoderWithClassifierHead(nn.Module): 124 | def __init__(self, classes, distribute_zero_label=True, **kwargs): 125 | super().__init__() 126 | self.enc = AudioMiniEncoder(**kwargs) 127 | self.head = nn.Linear(self.enc.dim, classes) 128 | self.num_classes = classes 129 | self.distribute_zero_label = distribute_zero_label 130 | 131 | def forward(self, x, labels=None): 132 | h = self.enc(x) 133 | logits = self.head(h) 134 | if labels is None: 135 | return logits 136 | else: 137 | if self.distribute_zero_label: 138 | oh_labels = nn.functional.one_hot(labels, num_classes=self.num_classes) 139 | zeros_indices = (labels == 0).unsqueeze(-1) 140 | # Distribute 20% of the probability mass on all classes when zero is specified, to compensate for dataset noise. 141 | zero_extra_mass = torch.full_like(oh_labels, dtype=torch.float, fill_value=.2/(self.num_classes-1)) 142 | zero_extra_mass[:, 0] = -.2 143 | zero_extra_mass = zero_extra_mass * zeros_indices 144 | oh_labels = oh_labels + zero_extra_mass 145 | else: 146 | oh_labels = labels 147 | loss = nn.functional.cross_entropy(logits, oh_labels) 148 | return loss 149 | -------------------------------------------------------------------------------- /tortoise/tortoise/models/clvp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | from tortoise.models.arch_util import CheckpointedXTransformerEncoder 7 | from tortoise.models.transformer import Transformer 8 | from tortoise.models.xtransformers import Encoder 9 | 10 | 11 | def exists(val): 12 | return val is not None 13 | 14 | 15 | def masked_mean(t, mask, dim = 1): 16 | t = t.masked_fill(~mask[:, :, None], 0.) 17 | return t.sum(dim = 1) / mask.sum(dim = 1)[..., None] 18 | 19 | class CLVP(nn.Module): 20 | """ 21 | CLIP model retrofitted for performing contrastive evaluation between tokenized audio data and the corresponding 22 | transcribed text. 
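Given matched batches of text tokens and speech tokens, forward() returns a per-pair similarity score (or, with return_loss=True, a symmetric contrastive loss); that score can be used to rank candidate speech outputs for a given text.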
23 | 24 | Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py 25 | """ 26 | 27 | def __init__( 28 | self, 29 | *, 30 | dim_text=512, 31 | dim_speech=512, 32 | dim_latent=512, 33 | num_text_tokens=256, 34 | text_enc_depth=6, 35 | text_seq_len=120, 36 | text_heads=8, 37 | num_speech_tokens=8192, 38 | speech_enc_depth=6, 39 | speech_heads=8, 40 | speech_seq_len=250, 41 | text_mask_percentage=0, 42 | voice_mask_percentage=0, 43 | wav_token_compression=1024, 44 | use_xformers=False, 45 | ): 46 | super().__init__() 47 | self.text_emb = nn.Embedding(num_text_tokens, dim_text) 48 | self.to_text_latent = nn.Linear(dim_text, dim_latent, bias=False) 49 | 50 | self.speech_emb = nn.Embedding(num_speech_tokens, dim_speech) 51 | self.to_speech_latent = nn.Linear(dim_speech, dim_latent, bias=False) 52 | 53 | if use_xformers: 54 | self.text_transformer = CheckpointedXTransformerEncoder( 55 | needs_permute=False, 56 | exit_permute=False, 57 | max_seq_len=-1, 58 | attn_layers=Encoder( 59 | dim=dim_text, 60 | depth=text_enc_depth, 61 | heads=text_heads, 62 | ff_dropout=.1, 63 | ff_mult=2, 64 | attn_dropout=.1, 65 | use_rmsnorm=True, 66 | ff_glu=True, 67 | rotary_pos_emb=True, 68 | )) 69 | self.speech_transformer = CheckpointedXTransformerEncoder( 70 | needs_permute=False, 71 | exit_permute=False, 72 | max_seq_len=-1, 73 | attn_layers=Encoder( 74 | dim=dim_speech, 75 | depth=speech_enc_depth, 76 | heads=speech_heads, 77 | ff_dropout=.1, 78 | ff_mult=2, 79 | attn_dropout=.1, 80 | use_rmsnorm=True, 81 | ff_glu=True, 82 | rotary_pos_emb=True, 83 | )) 84 | else: 85 | self.text_transformer = Transformer(causal=False, seq_len=text_seq_len, dim=dim_text, depth=text_enc_depth, 86 | heads=text_heads) 87 | self.speech_transformer = Transformer(causal=False, seq_len=speech_seq_len, dim=dim_speech, 88 | depth=speech_enc_depth, heads=speech_heads) 89 | 90 | self.temperature = nn.Parameter(torch.tensor(1.)) 91 | self.text_mask_percentage = text_mask_percentage 92 | self.voice_mask_percentage = voice_mask_percentage 93 | self.wav_token_compression = wav_token_compression 94 | self.xformers = use_xformers 95 | if not use_xformers: 96 | self.text_pos_emb = nn.Embedding(text_seq_len, dim_text) 97 | self.speech_pos_emb = nn.Embedding(num_speech_tokens, dim_speech) 98 | 99 | def forward( 100 | self, 101 | text, 102 | speech_tokens, 103 | return_loss=False 104 | ): 105 | b, device = text.shape[0], text.device 106 | if self.training: 107 | text_mask = torch.rand_like(text.float()) > self.text_mask_percentage 108 | voice_mask = torch.rand_like(speech_tokens.float()) > self.voice_mask_percentage 109 | else: 110 | text_mask = torch.ones_like(text.float()).bool() 111 | voice_mask = torch.ones_like(speech_tokens.float()).bool() 112 | 113 | text_emb = self.text_emb(text) 114 | speech_emb = self.speech_emb(speech_tokens) 115 | 116 | if not self.xformers: 117 | text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device)) 118 | speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device)) 119 | 120 | enc_text = self.text_transformer(text_emb, mask=text_mask) 121 | enc_speech = self.speech_transformer(speech_emb, mask=voice_mask) 122 | 123 | text_latents = masked_mean(enc_text, text_mask, dim=1) 124 | speech_latents = masked_mean(enc_speech, voice_mask, dim=1) 125 | 126 | text_latents = self.to_text_latent(text_latents) 127 | speech_latents = self.to_speech_latent(speech_latents) 128 | 129 | text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, 
dim=-1), (text_latents, speech_latents)) 130 | 131 | temp = self.temperature.exp() 132 | 133 | if not return_loss: 134 | sim = einsum('n d, n d -> n', text_latents, speech_latents) * temp 135 | return sim 136 | 137 | sim = einsum('i d, j d -> i j', text_latents, speech_latents) * temp 138 | labels = torch.arange(b, device=device) 139 | loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2 140 | return loss 141 | 142 | 143 | if __name__ == '__main__': 144 | clip = CLVP(text_mask_percentage=.2, voice_mask_percentage=.2) 145 | clip(torch.randint(0,256,(2,120)), 146 | torch.tensor([50,100]), 147 | torch.randint(0,8192,(2,250)), 148 | torch.tensor([101,102]), 149 | return_loss=True) 150 | nonloss = clip(torch.randint(0,256,(2,120)), 151 | torch.tensor([50,100]), 152 | torch.randint(0,8192,(2,250)), 153 | torch.tensor([101,102]), 154 | return_loss=False) 155 | print(nonloss.shape) -------------------------------------------------------------------------------- /tortoise/tortoise/models/cvvp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | from tortoise.models.arch_util import AttentionBlock 7 | from tortoise.models.xtransformers import ContinuousTransformerWrapper, Encoder 8 | 9 | 10 | def exists(val): 11 | return val is not None 12 | 13 | 14 | def masked_mean(t, mask): 15 | t = t.masked_fill(~mask, 0.) 16 | return t.sum(dim=1) / mask.sum(dim=1) 17 | 18 | 19 | class CollapsingTransformer(nn.Module): 20 | def __init__(self, model_dim, output_dims, heads, dropout, depth, mask_percentage=0, **encoder_kwargs): 21 | super().__init__() 22 | self.transformer = ContinuousTransformerWrapper( 23 | max_seq_len=-1, 24 | use_pos_emb=False, 25 | attn_layers=Encoder( 26 | dim=model_dim, 27 | depth=depth, 28 | heads=heads, 29 | ff_dropout=dropout, 30 | ff_mult=1, 31 | attn_dropout=dropout, 32 | use_rmsnorm=True, 33 | ff_glu=True, 34 | rotary_pos_emb=True, 35 | **encoder_kwargs, 36 | )) 37 | self.pre_combiner = nn.Sequential(nn.Conv1d(model_dim, output_dims, 1), 38 | AttentionBlock( 39 | output_dims, num_heads=heads, do_checkpoint=False), 40 | nn.Conv1d(output_dims, output_dims, 1)) 41 | self.mask_percentage = mask_percentage 42 | 43 | def forward(self, x, **transformer_kwargs): 44 | h = self.transformer(x, **transformer_kwargs) 45 | h = h.permute(0, 2, 1) 46 | h = self.pre_combiner(h).permute(0, 2, 1) 47 | if self.training: 48 | mask = torch.rand_like(h.float()) > self.mask_percentage 49 | else: 50 | mask = torch.ones_like(h.float()).bool() 51 | return masked_mean(h, mask) 52 | 53 | 54 | class ConvFormatEmbedding(nn.Module): 55 | def __init__(self, *args, **kwargs): 56 | super().__init__() 57 | self.emb = nn.Embedding(*args, **kwargs) 58 | 59 | def forward(self, x): 60 | y = self.emb(x) 61 | return y.permute(0, 2, 1) 62 | 63 | 64 | class CVVP(nn.Module): 65 | def __init__( 66 | self, 67 | model_dim=512, 68 | transformer_heads=8, 69 | dropout=.1, 70 | conditioning_enc_depth=8, 71 | cond_mask_percentage=0, 72 | mel_channels=80, 73 | mel_codes=None, 74 | speech_enc_depth=8, 75 | speech_mask_percentage=0, 76 | latent_multiplier=1, 77 | ): 78 | super().__init__() 79 | latent_dim = latent_multiplier*model_dim 80 | self.temperature = nn.Parameter(torch.tensor(1.)) 81 | 82 | self.cond_emb = nn.Sequential(nn.Conv1d(mel_channels, model_dim//2, kernel_size=5, stride=2, padding=2), 83 | nn.Conv1d(model_dim//2, model_dim, kernel_size=3, stride=2, 
padding=1)) 84 | self.conditioning_transformer = CollapsingTransformer( 85 | model_dim, model_dim, transformer_heads, dropout, conditioning_enc_depth, cond_mask_percentage) 86 | self.to_conditioning_latent = nn.Linear( 87 | latent_dim, latent_dim, bias=False) 88 | 89 | if mel_codes is None: 90 | self.speech_emb = nn.Conv1d( 91 | mel_channels, model_dim, kernel_size=5, padding=2) 92 | else: 93 | self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim) 94 | self.speech_transformer = CollapsingTransformer( 95 | model_dim, latent_dim, transformer_heads, dropout, speech_enc_depth, speech_mask_percentage) 96 | self.to_speech_latent = nn.Linear( 97 | latent_dim, latent_dim, bias=False) 98 | 99 | def get_grad_norm_parameter_groups(self): 100 | return { 101 | 'conditioning': list(self.conditioning_transformer.parameters()), 102 | 'speech': list(self.speech_transformer.parameters()), 103 | } 104 | 105 | def forward( 106 | self, 107 | mel_cond, 108 | mel_input, 109 | return_loss=False 110 | ): 111 | cond_emb = self.cond_emb(mel_cond).permute(0, 2, 1) 112 | enc_cond = self.conditioning_transformer(cond_emb) 113 | cond_latents = self.to_conditioning_latent(enc_cond) 114 | 115 | speech_emb = self.speech_emb(mel_input).permute(0, 2, 1) 116 | enc_speech = self.speech_transformer(speech_emb) 117 | speech_latents = self.to_speech_latent(enc_speech) 118 | 119 | cond_latents, speech_latents = map(lambda t: F.normalize( 120 | t, p=2, dim=-1), (cond_latents, speech_latents)) 121 | temp = self.temperature.exp() 122 | 123 | if not return_loss: 124 | sim = einsum('n d, n d -> n', cond_latents, 125 | speech_latents) * temp 126 | return sim 127 | 128 | sim = einsum('i d, j d -> i j', cond_latents, 129 | speech_latents) * temp 130 | labels = torch.arange( 131 | cond_latents.shape[0], device=mel_input.device) 132 | loss = (F.cross_entropy(sim, labels) + 133 | F.cross_entropy(sim.t(), labels)) / 2 134 | 135 | return loss 136 | 137 | 138 | if __name__ == '__main__': 139 | clvp = CVVP() 140 | clvp(torch.randn(2, 80, 100), 141 | torch.randn(2, 80, 95), 142 | return_loss=True) 143 | -------------------------------------------------------------------------------- /tortoise/tortoise/models/diffusion_decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from abc import abstractmethod 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch import autocast 9 | 10 | from tortoise.models.arch_util import normalization, AttentionBlock 11 | 12 | 13 | def is_latent(t): 14 | return t.dtype == torch.float 15 | 16 | 17 | def is_sequence(t): 18 | return t.dtype == torch.long 19 | 20 | 21 | def timestep_embedding(timesteps, dim, max_period=10000): 22 | """ 23 | Create sinusoidal timestep embeddings. 24 | 25 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 26 | These may be fractional. 27 | :param dim: the dimension of the output. 28 | :param max_period: controls the minimum frequency of the embeddings. 29 | :return: an [N x dim] Tensor of positional embeddings. 
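Concretely, the output concatenates cos(t * f_k) and sin(t * f_k) for frequencies f_k spaced geometrically between 1 and 1/max_period, as in standard transformer positional encodings.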
30 | """ 31 | half = dim // 2 32 | freqs = torch.exp( 33 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 34 | ).to(device=timesteps.device) 35 | args = timesteps[:, None].float() * freqs[None] 36 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 37 | if dim % 2: 38 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 39 | return embedding 40 | 41 | 42 | class TimestepBlock(nn.Module): 43 | @abstractmethod 44 | def forward(self, x, emb): 45 | """ 46 | Apply the module to `x` given `emb` timestep embeddings. 47 | """ 48 | 49 | 50 | class TimestepEmbedSequential(nn.Sequential, TimestepBlock): 51 | def forward(self, x, emb): 52 | for layer in self: 53 | if isinstance(layer, TimestepBlock): 54 | x = layer(x, emb) 55 | else: 56 | x = layer(x) 57 | return x 58 | 59 | 60 | class ResBlock(TimestepBlock): 61 | def __init__( 62 | self, 63 | channels, 64 | emb_channels, 65 | dropout, 66 | out_channels=None, 67 | dims=2, 68 | kernel_size=3, 69 | efficient_config=True, 70 | use_scale_shift_norm=False, 71 | ): 72 | super().__init__() 73 | self.channels = channels 74 | self.emb_channels = emb_channels 75 | self.dropout = dropout 76 | self.out_channels = out_channels or channels 77 | self.use_scale_shift_norm = use_scale_shift_norm 78 | padding = {1: 0, 3: 1, 5: 2}[kernel_size] 79 | eff_kernel = 1 if efficient_config else 3 80 | eff_padding = 0 if efficient_config else 1 81 | 82 | self.in_layers = nn.Sequential( 83 | normalization(channels), 84 | nn.SiLU(), 85 | nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding), 86 | ) 87 | 88 | self.emb_layers = nn.Sequential( 89 | nn.SiLU(), 90 | nn.Linear( 91 | emb_channels, 92 | 2 * self.out_channels if use_scale_shift_norm else self.out_channels, 93 | ), 94 | ) 95 | self.out_layers = nn.Sequential( 96 | normalization(self.out_channels), 97 | nn.SiLU(), 98 | nn.Dropout(p=dropout), 99 | nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding), 100 | ) 101 | 102 | if self.out_channels == channels: 103 | self.skip_connection = nn.Identity() 104 | else: 105 | self.skip_connection = nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding) 106 | 107 | def forward(self, x, emb): 108 | h = self.in_layers(x) 109 | emb_out = self.emb_layers(emb).type(h.dtype) 110 | while len(emb_out.shape) < len(h.shape): 111 | emb_out = emb_out[..., None] 112 | if self.use_scale_shift_norm: 113 | out_norm, out_rest = self.out_layers[0], self.out_layers[1:] 114 | scale, shift = torch.chunk(emb_out, 2, dim=1) 115 | h = out_norm(h) * (1 + scale) + shift 116 | h = out_rest(h) 117 | else: 118 | h = h + emb_out 119 | h = self.out_layers(h) 120 | return self.skip_connection(x) + h 121 | 122 | 123 | class DiffusionLayer(TimestepBlock): 124 | def __init__(self, model_channels, dropout, num_heads): 125 | super().__init__() 126 | self.resblk = ResBlock(model_channels, model_channels, dropout, model_channels, dims=1, use_scale_shift_norm=True) 127 | self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) 128 | 129 | def forward(self, x, time_emb): 130 | y = self.resblk(x, time_emb) 131 | return self.attn(y) 132 | 133 | 134 | class DiffusionTts(nn.Module): 135 | def __init__( 136 | self, 137 | model_channels=512, 138 | num_layers=8, 139 | in_channels=100, 140 | in_latent_channels=512, 141 | in_tokens=8193, 142 | out_channels=200, # mean and variance 143 | dropout=0, 144 | use_fp16=False, 145 | num_heads=16, 146 | # Parameters for 
regularization. 147 | layer_drop=.1, 148 | unconditioned_percentage=.1, # This implements a mechanism similar to what is used in classifier-free training. 149 | ): 150 | super().__init__() 151 | 152 | self.in_channels = in_channels 153 | self.model_channels = model_channels 154 | self.out_channels = out_channels 155 | self.dropout = dropout 156 | self.num_heads = num_heads 157 | self.unconditioned_percentage = unconditioned_percentage 158 | self.enable_fp16 = use_fp16 159 | self.layer_drop = layer_drop 160 | 161 | self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1) 162 | self.time_embed = nn.Sequential( 163 | nn.Linear(model_channels, model_channels), 164 | nn.SiLU(), 165 | nn.Linear(model_channels, model_channels), 166 | ) 167 | 168 | # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed. 169 | # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally 170 | # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive 171 | # transformer network. 172 | self.code_embedding = nn.Embedding(in_tokens, model_channels) 173 | self.code_converter = nn.Sequential( 174 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 175 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 176 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 177 | ) 178 | self.code_norm = normalization(model_channels) 179 | self.latent_conditioner = nn.Sequential( 180 | nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), 181 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 182 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 183 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 184 | AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), 185 | ) 186 | self.contextual_embedder = nn.Sequential(nn.Conv1d(in_channels,model_channels,3,padding=1,stride=2), 187 | nn.Conv1d(model_channels, model_channels*2,3,padding=1,stride=2), 188 | AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), 189 | AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), 190 | AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), 191 | AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), 192 | AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False)) 193 | self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,1)) 194 | self.conditioning_timestep_integrator = TimestepEmbedSequential( 195 | DiffusionLayer(model_channels, dropout, num_heads), 196 | DiffusionLayer(model_channels, dropout, num_heads), 197 | DiffusionLayer(model_channels, dropout, num_heads), 198 | ) 199 | 200 | self.integrating_conv = nn.Conv1d(model_channels*2, model_channels, kernel_size=1) 201 | self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1) 202 | 203 | self.layers = nn.ModuleList([DiffusionLayer(model_channels, dropout, num_heads) for _ in range(num_layers)] + 204 | [ResBlock(model_channels, model_channels, dropout, dims=1, use_scale_shift_norm=True) for _ in range(3)]) 205 | 206 | self.out = nn.Sequential( 207 | normalization(model_channels), 208 | nn.SiLU(), 209 | nn.Conv1d(model_channels, 
out_channels, 3, padding=1), 210 | ) 211 | 212 | def get_grad_norm_parameter_groups(self): 213 | groups = { 214 | 'minicoder': list(self.contextual_embedder.parameters()), 215 | 'layers': list(self.layers.parameters()), 216 | 'code_converters': list(self.code_embedding.parameters()) + list(self.code_converter.parameters()) + list(self.latent_conditioner.parameters()) + list(self.latent_conditioner.parameters()), 217 | 'timestep_integrator': list(self.conditioning_timestep_integrator.parameters()) + list(self.integrating_conv.parameters()), 218 | 'time_embed': list(self.time_embed.parameters()), 219 | } 220 | return groups 221 | 222 | def get_conditioning(self, conditioning_input): 223 | speech_conditioning_input = conditioning_input.unsqueeze(1) if len( 224 | conditioning_input.shape) == 3 else conditioning_input 225 | conds = [] 226 | for j in range(speech_conditioning_input.shape[1]): 227 | conds.append(self.contextual_embedder(speech_conditioning_input[:, j])) 228 | conds = torch.cat(conds, dim=-1) 229 | conds = conds.mean(dim=-1) 230 | return conds 231 | 232 | def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred): 233 | # Shuffle aligned_latent to BxCxS format 234 | if is_latent(aligned_conditioning): 235 | aligned_conditioning = aligned_conditioning.permute(0, 2, 1) 236 | 237 | cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1) 238 | if is_latent(aligned_conditioning): 239 | code_emb = self.latent_conditioner(aligned_conditioning) 240 | else: 241 | code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1) 242 | code_emb = self.code_converter(code_emb) 243 | code_emb = self.code_norm(code_emb) * (1 + cond_scale.unsqueeze(-1)) + cond_shift.unsqueeze(-1) 244 | 245 | unconditioned_batches = torch.zeros((code_emb.shape[0], 1, 1), device=code_emb.device) 246 | # Mask out the conditioning branch for whole batch elements, implementing something similar to classifier-free guidance. 247 | if self.training and self.unconditioned_percentage > 0: 248 | unconditioned_batches = torch.rand((code_emb.shape[0], 1, 1), 249 | device=code_emb.device) < self.unconditioned_percentage 250 | code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1), 251 | code_emb) 252 | expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode='nearest') 253 | 254 | if not return_code_pred: 255 | return expanded_code_emb 256 | else: 257 | mel_pred = self.mel_head(expanded_code_emb) 258 | # Multiply mel_pred by !unconditioned_branches, which drops the gradient on unconditioned branches. This is because we don't want that gradient being used to train parameters through the codes_embedder as it unbalances contributions to that network from the MSE loss. 259 | mel_pred = mel_pred * unconditioned_batches.logical_not() 260 | return expanded_code_emb, mel_pred 261 | 262 | def forward(self, x, timesteps, aligned_conditioning=None, conditioning_latent=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False): 263 | """ 264 | Apply the model to an input batch. 265 | 266 | :param x: an [N x C x ...] Tensor of inputs. 267 | :param timesteps: a 1-D batch of timesteps. 268 | :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced. 269 | :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning(). 
270 | :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent() 271 | :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered. 272 | :return: an [N x C x ...] Tensor of outputs. 273 | """ 274 | assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_latent is not None) 275 | assert not (return_code_pred and precomputed_aligned_embeddings is not None) # These two are mutually exclusive. 276 | 277 | unused_params = [] 278 | if conditioning_free: 279 | code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1]) 280 | unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) 281 | unused_params.extend(list(self.latent_conditioner.parameters())) 282 | else: 283 | if precomputed_aligned_embeddings is not None: 284 | code_emb = precomputed_aligned_embeddings 285 | else: 286 | code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_latent, x.shape[-1], True) 287 | if is_latent(aligned_conditioning): 288 | unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) 289 | else: 290 | unused_params.extend(list(self.latent_conditioner.parameters())) 291 | 292 | unused_params.append(self.unconditioned_embedding) 293 | 294 | time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) 295 | code_emb = self.conditioning_timestep_integrator(code_emb, time_emb) 296 | x = self.inp_block(x) 297 | x = torch.cat([x, code_emb], dim=1) 298 | x = self.integrating_conv(x) 299 | for i, lyr in enumerate(self.layers): 300 | # Do layer drop where applicable. Do not drop first and last layers. 301 | if self.training and self.layer_drop > 0 and i != 0 and i != (len(self.layers)-1) and random.random() < self.layer_drop: 302 | unused_params.extend(list(lyr.parameters())) 303 | else: 304 | # First and last blocks will have autocast disabled for improved precision. 305 | if not torch.backends.mps.is_available(): 306 | with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): 307 | x = lyr(x, time_emb) 308 | else: 309 | x = lyr(x, time_emb) 310 | 311 | x = x.float() 312 | out = self.out(x) 313 | 314 | # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors. 
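# Summing their means and multiplying by zero keeps these parameters attached to the autograd graph without changing the output.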
315 | extraneous_addition = 0 316 | for p in unused_params: 317 | extraneous_addition = extraneous_addition + p.mean() 318 | out = out + extraneous_addition * 0 319 | 320 | if return_code_pred: 321 | return out, mel_pred 322 | return out 323 | 324 | 325 | if __name__ == '__main__': 326 | clip = torch.randn(2, 100, 400) 327 | aligned_latent = torch.randn(2,388,512) 328 | aligned_sequence = torch.randint(0,8192,(2,100)) 329 | cond = torch.randn(2, 100, 400) 330 | ts = torch.LongTensor([600, 600]) 331 | model = DiffusionTts(512, layer_drop=.3, unconditioned_percentage=.5) 332 | # Test with latent aligned conditioning 333 | #o = model(clip, ts, aligned_latent, cond) 334 | # Test with sequence aligned conditioning 335 | o = model(clip, ts, aligned_sequence, cond) 336 | 337 | -------------------------------------------------------------------------------- /tortoise/tortoise/models/random_latent_generator.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): 9 | if bias is not None: 10 | rest_dim = [1] * (input.ndim - bias.ndim - 1) 11 | return ( 12 | F.leaky_relu( 13 | input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope 14 | ) 15 | * scale 16 | ) 17 | else: 18 | return F.leaky_relu(input, negative_slope=0.2) * scale 19 | 20 | 21 | class EqualLinear(nn.Module): 22 | def __init__( 23 | self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1 24 | ): 25 | super().__init__() 26 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) 27 | if bias: 28 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) 29 | else: 30 | self.bias = None 31 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul 32 | self.lr_mul = lr_mul 33 | 34 | def forward(self, input): 35 | out = F.linear(input, self.weight * self.scale) 36 | out = fused_leaky_relu(out, self.bias * self.lr_mul) 37 | return out 38 | 39 | 40 | class RandomLatentConverter(nn.Module): 41 | def __init__(self, channels): 42 | super().__init__() 43 | self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)], 44 | nn.Linear(channels, channels)) 45 | self.channels = channels 46 | 47 | def forward(self, ref): 48 | r = torch.randn(ref.shape[0], self.channels, device=ref.device) 49 | y = self.layers(r) 50 | return y 51 | 52 | 53 | if __name__ == '__main__': 54 | model = RandomLatentConverter(512) 55 | model(torch.randn(5,512)) -------------------------------------------------------------------------------- /tortoise/tortoise/models/transformer.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from einops import rearrange 6 | from rotary_embedding_torch import RotaryEmbedding, broadcat 7 | from torch import nn 8 | 9 | 10 | # helpers 11 | 12 | 13 | def exists(val): 14 | return val is not None 15 | 16 | 17 | def default(val, d): 18 | return val if exists(val) else d 19 | 20 | 21 | def cast_tuple(val, depth = 1): 22 | if isinstance(val, list): 23 | val = tuple(val) 24 | return val if isinstance(val, tuple) else (val,) * depth 25 | 26 | 27 | def max_neg_value(t): 28 | return -torch.finfo(t.dtype).max 29 | 30 | 31 | def stable_softmax(t, dim = -1, alpha = 32 ** 2): 32 | t = t / alpha 33 | t = t - torch.amax(t, dim = dim, keepdim = True).detach() 
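# Numerically this equals softmax(t - max(t)) = softmax(t); working at t / alpha keeps the intermediate values small (helpful under fp16), and after scaling back up every entry is <= 0, so exp() cannot overflow.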
34 | return (t * alpha).softmax(dim = dim) 35 | 36 | 37 | def route_args(router, args, depth): 38 | routed_args = [(dict(), dict()) for _ in range(depth)] 39 | matched_keys = [key for key in args.keys() if key in router] 40 | 41 | for key in matched_keys: 42 | val = args[key] 43 | for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])): 44 | new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes) 45 | routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args}) 46 | return routed_args 47 | 48 | 49 | # classes 50 | class SequentialSequence(nn.Module): 51 | def __init__(self, layers, args_route = {}, layer_dropout = 0.): 52 | super().__init__() 53 | assert all(len(route) == len(layers) for route in args_route.values()), 'each argument route map must have the same depth as the number of sequential layers' 54 | self.layers = layers 55 | self.args_route = args_route 56 | self.layer_dropout = layer_dropout 57 | 58 | def forward(self, x, **kwargs): 59 | args = route_args(self.args_route, kwargs, len(self.layers)) 60 | layers_and_args = list(zip(self.layers, args)) 61 | 62 | for (f, g), (f_args, g_args) in layers_and_args: 63 | x = x + f(x, **f_args) 64 | x = x + g(x, **g_args) 65 | return x 66 | 67 | 68 | class DivideMax(nn.Module): 69 | def __init__(self, dim): 70 | super().__init__() 71 | self.dim = dim 72 | 73 | def forward(self, x): 74 | maxes = x.amax(dim = self.dim, keepdim = True).detach() 75 | return x / maxes 76 | 77 | 78 | # https://arxiv.org/abs/2103.17239 79 | class LayerScale(nn.Module): 80 | def __init__(self, dim, depth, fn): 81 | super().__init__() 82 | if depth <= 18: 83 | init_eps = 0.1 84 | elif depth > 18 and depth <= 24: 85 | init_eps = 1e-5 86 | else: 87 | init_eps = 1e-6 88 | 89 | scale = torch.zeros(1, 1, dim).fill_(init_eps) 90 | self.scale = nn.Parameter(scale) 91 | self.fn = fn 92 | def forward(self, x, **kwargs): 93 | return self.fn(x, **kwargs) * self.scale 94 | 95 | # layer norm 96 | 97 | 98 | class PreNorm(nn.Module): 99 | def __init__(self, dim, fn, sandwich = False): 100 | super().__init__() 101 | self.norm = nn.LayerNorm(dim) 102 | self.norm_out = nn.LayerNorm(dim) if sandwich else nn.Identity() 103 | self.fn = fn 104 | 105 | def forward(self, x, **kwargs): 106 | x = self.norm(x) 107 | x = self.fn(x, **kwargs) 108 | return self.norm_out(x) 109 | 110 | # feed forward 111 | 112 | 113 | class GEGLU(nn.Module): 114 | def forward(self, x): 115 | x, gates = x.chunk(2, dim = -1) 116 | return x * F.gelu(gates) 117 | 118 | 119 | class FeedForward(nn.Module): 120 | def __init__(self, dim, dropout = 0., mult = 4.): 121 | super().__init__() 122 | self.net = nn.Sequential( 123 | nn.Linear(dim, dim * mult * 2), 124 | GEGLU(), 125 | nn.Dropout(dropout), 126 | nn.Linear(dim * mult, dim) 127 | ) 128 | 129 | def forward(self, x): 130 | return self.net(x) 131 | 132 | # Attention 133 | 134 | 135 | class Attention(nn.Module): 136 | def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0.): 137 | super().__init__() 138 | inner_dim = dim_head * heads 139 | self.heads = heads 140 | self.seq_len = seq_len 141 | self.scale = dim_head ** -0.5 142 | 143 | self.causal = causal 144 | 145 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) 146 | self.to_out = nn.Sequential( 147 | nn.Linear(inner_dim, dim), 148 | nn.Dropout(dropout) 149 | ) 150 | 151 | def forward(self, x, mask = None): 152 | b, n, _, h, device = *x.shape, self.heads, x.device 153 | softmax = torch.softmax 154 | 155 | qkv = 
self.to_qkv(x).chunk(3, dim = -1) 156 | q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv) 157 | 158 | q = q * self.scale 159 | 160 | dots = torch.einsum('b h i d, b h j d -> b h i j', q, k) 161 | mask_value = max_neg_value(dots) 162 | 163 | if exists(mask): 164 | mask = rearrange(mask, 'b j -> b () () j') 165 | dots.masked_fill_(~mask, mask_value) 166 | del mask 167 | 168 | if self.causal: 169 | i, j = dots.shape[-2:] 170 | mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool() 171 | dots.masked_fill_(mask, mask_value) 172 | 173 | attn = softmax(dots, dim=-1) 174 | 175 | out = torch.einsum('b h i j, b h j d -> b h i d', attn, v) 176 | out = rearrange(out, 'b h n d -> b n (h d)') 177 | out = self.to_out(out) 178 | return out 179 | 180 | 181 | # main transformer class 182 | class Transformer(nn.Module): 183 | def __init__( 184 | self, 185 | *, 186 | dim, 187 | depth, 188 | seq_len, 189 | causal = True, 190 | heads = 8, 191 | dim_head = 64, 192 | ff_mult = 4, 193 | attn_dropout = 0., 194 | ff_dropout = 0., 195 | sparse_attn = False, 196 | sandwich_norm = False, 197 | ): 198 | super().__init__() 199 | layers = nn.ModuleList([]) 200 | sparse_layer = cast_tuple(sparse_attn, depth) 201 | 202 | for ind, sparse_attn in zip(range(depth), sparse_layer): 203 | attn = Attention(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout) 204 | 205 | ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout) 206 | 207 | layers.append(nn.ModuleList([ 208 | LayerScale(dim, ind + 1, PreNorm(dim, attn, sandwich = sandwich_norm)), 209 | LayerScale(dim, ind + 1, PreNorm(dim, ff, sandwich = sandwich_norm)) 210 | ])) 211 | 212 | execute_type = SequentialSequence 213 | route_attn = ((True, False),) * depth 214 | attn_route_map = {'mask': route_attn} 215 | 216 | self.layers = execute_type(layers, args_route = attn_route_map) 217 | 218 | def forward(self, x, **kwargs): 219 | return self.layers(x, **kwargs) 220 | -------------------------------------------------------------------------------- /tortoise/tortoise/models/vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | class KernelPredictor(torch.nn.Module): 8 | ''' Kernel predictor for the location-variable convolutions''' 9 | 10 | def __init__( 11 | self, 12 | cond_channels, 13 | conv_in_channels, 14 | conv_out_channels, 15 | conv_layers, 16 | conv_kernel_size=3, 17 | kpnet_hidden_channels=64, 18 | kpnet_conv_size=3, 19 | kpnet_dropout=0.0, 20 | kpnet_nonlinear_activation="LeakyReLU", 21 | kpnet_nonlinear_activation_params={"negative_slope": 0.1}, 22 | ): 23 | ''' 24 | Args: 25 | cond_channels (int): number of channel for the conditioning sequence, 26 | conv_in_channels (int): number of channel for the input sequence, 27 | conv_out_channels (int): number of channel for the output sequence, 28 | conv_layers (int): number of layers 29 | ''' 30 | super().__init__() 31 | 32 | self.conv_in_channels = conv_in_channels 33 | self.conv_out_channels = conv_out_channels 34 | self.conv_kernel_size = conv_kernel_size 35 | self.conv_layers = conv_layers 36 | 37 | kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w 38 | kpnet_bias_channels = conv_out_channels * conv_layers # l_b 39 | 40 | self.input_conv = nn.Sequential( 41 | nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, 
padding=2, bias=True)), 42 | getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 43 | ) 44 | 45 | self.residual_convs = nn.ModuleList() 46 | padding = (kpnet_conv_size - 1) // 2 47 | for _ in range(3): 48 | self.residual_convs.append( 49 | nn.Sequential( 50 | nn.Dropout(kpnet_dropout), 51 | nn.utils.weight_norm( 52 | nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, 53 | bias=True)), 54 | getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 55 | nn.utils.weight_norm( 56 | nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, 57 | bias=True)), 58 | getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 59 | ) 60 | ) 61 | self.kernel_conv = nn.utils.weight_norm( 62 | nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True)) 63 | self.bias_conv = nn.utils.weight_norm( 64 | nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True)) 65 | 66 | def forward(self, c): 67 | ''' 68 | Args: 69 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 70 | ''' 71 | batch, _, cond_length = c.shape 72 | c = self.input_conv(c) 73 | for residual_conv in self.residual_convs: 74 | residual_conv.to(c.device) 75 | c = c + residual_conv(c) 76 | k = self.kernel_conv(c) 77 | b = self.bias_conv(c) 78 | kernels = k.contiguous().view( 79 | batch, 80 | self.conv_layers, 81 | self.conv_in_channels, 82 | self.conv_out_channels, 83 | self.conv_kernel_size, 84 | cond_length, 85 | ) 86 | bias = b.contiguous().view( 87 | batch, 88 | self.conv_layers, 89 | self.conv_out_channels, 90 | cond_length, 91 | ) 92 | 93 | return kernels, bias 94 | 95 | def remove_weight_norm(self): 96 | nn.utils.remove_weight_norm(self.input_conv[0]) 97 | nn.utils.remove_weight_norm(self.kernel_conv) 98 | nn.utils.remove_weight_norm(self.bias_conv) 99 | for block in self.residual_convs: 100 | nn.utils.remove_weight_norm(block[1]) 101 | nn.utils.remove_weight_norm(block[3]) 102 | 103 | 104 | class LVCBlock(torch.nn.Module): 105 | '''the location-variable convolutions''' 106 | 107 | def __init__( 108 | self, 109 | in_channels, 110 | cond_channels, 111 | stride, 112 | dilations=[1, 3, 9, 27], 113 | lReLU_slope=0.2, 114 | conv_kernel_size=3, 115 | cond_hop_length=256, 116 | kpnet_hidden_channels=64, 117 | kpnet_conv_size=3, 118 | kpnet_dropout=0.0, 119 | ): 120 | super().__init__() 121 | 122 | self.cond_hop_length = cond_hop_length 123 | self.conv_layers = len(dilations) 124 | self.conv_kernel_size = conv_kernel_size 125 | 126 | self.kernel_predictor = KernelPredictor( 127 | cond_channels=cond_channels, 128 | conv_in_channels=in_channels, 129 | conv_out_channels=2 * in_channels, 130 | conv_layers=len(dilations), 131 | conv_kernel_size=conv_kernel_size, 132 | kpnet_hidden_channels=kpnet_hidden_channels, 133 | kpnet_conv_size=kpnet_conv_size, 134 | kpnet_dropout=kpnet_dropout, 135 | kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope} 136 | ) 137 | 138 | self.convt_pre = nn.Sequential( 139 | nn.LeakyReLU(lReLU_slope), 140 | nn.utils.weight_norm(nn.ConvTranspose1d(in_channels, in_channels, 2 * stride, stride=stride, 141 | padding=stride // 2 + stride % 2, output_padding=stride % 2)), 142 | ) 143 | 144 | self.conv_blocks = nn.ModuleList() 145 | for dilation in dilations: 146 | self.conv_blocks.append( 147 | nn.Sequential( 148 | nn.LeakyReLU(lReLU_slope), 149 | 
nn.utils.weight_norm(nn.Conv1d(in_channels, in_channels, conv_kernel_size, 150 | padding=dilation * (conv_kernel_size - 1) // 2, dilation=dilation)), 151 | nn.LeakyReLU(lReLU_slope), 152 | ) 153 | ) 154 | 155 | def forward(self, x, c): 156 | ''' forward propagation of the location-variable convolutions. 157 | Args: 158 | x (Tensor): the input sequence (batch, in_channels, in_length) 159 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 160 | 161 | Returns: 162 | Tensor: the output sequence (batch, in_channels, in_length) 163 | ''' 164 | _, in_channels, _ = x.shape # (B, c_g, L') 165 | 166 | x = self.convt_pre(x) # (B, c_g, stride * L') 167 | kernels, bias = self.kernel_predictor(c) 168 | 169 | for i, conv in enumerate(self.conv_blocks): 170 | output = conv(x) # (B, c_g, stride * L') 171 | 172 | k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) 173 | b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) 174 | 175 | output = self.location_variable_convolution(output, k, b, 176 | hop_size=self.cond_hop_length) # (B, 2 * c_g, stride * L'): LVC 177 | x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( 178 | output[:, in_channels:, :]) # (B, c_g, stride * L'): GAU 179 | 180 | return x 181 | 182 | def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): 183 | ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. 184 | Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. 185 | Args: 186 | x (Tensor): the input sequence (batch, in_channels, in_length). 187 | kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) 188 | bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) 189 | dilation (int): the dilation of convolution. 190 | hop_size (int): the hop_size of the conditioning sequence. 191 | Returns: 192 | (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
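        Implementation note: the padded input is unfolded into one frame of length hop_size + 2*padding per conditioning step; each frame is convolved with its own predicted kernel via the einsum below, the per-step bias is added, and the frames are flattened back into a single output sequence.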
193 | ''' 194 | batch, _, in_length = x.shape 195 | batch, _, out_channels, kernel_size, kernel_length = kernel.shape 196 | assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" 197 | 198 | padding = dilation * int((kernel_size - 1) / 2) 199 | x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding) 200 | x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) 201 | 202 | if hop_size < dilation: 203 | x = F.pad(x, (0, dilation), 'constant', 0) 204 | x = x.unfold(3, dilation, 205 | dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) 206 | x = x[:, :, :, :, :hop_size] 207 | x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) 208 | x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) 209 | 210 | o = torch.einsum('bildsk,biokl->bolsd', x, kernel) 211 | o = o.to(memory_format=torch.channels_last_3d) 212 | bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) 213 | o = o + bias 214 | o = o.contiguous().view(batch, out_channels, -1) 215 | 216 | return o 217 | 218 | def remove_weight_norm(self): 219 | self.kernel_predictor.remove_weight_norm() 220 | nn.utils.remove_weight_norm(self.convt_pre[1]) 221 | for block in self.conv_blocks: 222 | nn.utils.remove_weight_norm(block[1]) 223 | 224 | 225 | class UnivNetGenerator(nn.Module): 226 | """ 227 | UnivNet Generator 228 | 229 | Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py. 230 | """ 231 | 232 | def __init__(self, noise_dim=64, channel_size=32, dilations=[1,3,9,27], strides=[8,8,4], lReLU_slope=.2, kpnet_conv_size=3, 233 | # Below are MEL configurations options that this generator requires. 234 | hop_length=256, n_mel_channels=100): 235 | super(UnivNetGenerator, self).__init__() 236 | self.mel_channel = n_mel_channels 237 | self.noise_dim = noise_dim 238 | self.hop_length = hop_length 239 | channel_size = channel_size 240 | kpnet_conv_size = kpnet_conv_size 241 | 242 | self.res_stack = nn.ModuleList() 243 | hop_length = 1 244 | for stride in strides: 245 | hop_length = stride * hop_length 246 | self.res_stack.append( 247 | LVCBlock( 248 | channel_size, 249 | n_mel_channels, 250 | stride=stride, 251 | dilations=dilations, 252 | lReLU_slope=lReLU_slope, 253 | cond_hop_length=hop_length, 254 | kpnet_conv_size=kpnet_conv_size 255 | ) 256 | ) 257 | 258 | self.conv_pre = \ 259 | nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode='reflect')) 260 | 261 | self.conv_post = nn.Sequential( 262 | nn.LeakyReLU(lReLU_slope), 263 | nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode='reflect')), 264 | nn.Tanh(), 265 | ) 266 | 267 | def forward(self, c, z): 268 | ''' 269 | Args: 270 | c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) 271 | z (Tensor): the noise sequence (batch, noise_dim, in_length) 272 | 273 | ''' 274 | z = self.conv_pre(z) # (B, c_g, L) 275 | 276 | for res_block in self.res_stack: 277 | res_block.to(z.device) 278 | z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) 279 | 280 | z = self.conv_post(z) # (B, 1, L * 256) 281 | 282 | return z 283 | 284 | def eval(self, inference=False): 285 | super(UnivNetGenerator, self).eval() 286 | # don't remove weight norm while validation in training loop 287 | if inference: 288 | self.remove_weight_norm() 289 | 290 | def remove_weight_norm(self): 291 | nn.utils.remove_weight_norm(self.conv_pre) 292 | 293 | for layer in self.conv_post: 294 | if len(layer.state_dict()) != 0: 295 | nn.utils.remove_weight_norm(layer) 296 | 297 | for res_block in self.res_stack: 298 | res_block.remove_weight_norm() 299 | 300 | def inference(self, c, z=None): 301 | # pad input mel with zeros to cut artifact 302 | # see https://github.com/seungwonpark/melgan/issues/8 303 | zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) 304 | mel = torch.cat((c, zero), dim=2) 305 | 306 | if z is None: 307 | z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) 308 | 309 | audio = self.forward(mel, z) 310 | audio = audio[:, :, :-(self.hop_length * 10)] 311 | audio = audio.clamp(min=-1, max=1) 312 | return audio 313 | 314 | 315 | if __name__ == '__main__': 316 | model = UnivNetGenerator() 317 | 318 | c = torch.randn(3, 100, 10) 319 | z = torch.randn(3, 64, 10) 320 | print(c.shape) 321 | 322 | y = model(c, z) 323 | print(y.shape) 324 | assert y.shape == torch.Size([3, 1, 2560]) 325 | 326 | pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 327 | print(pytorch_total_params) 328 | -------------------------------------------------------------------------------- /tortoise/tortoise/read.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from time import time 4 | 5 | import torch 6 | import torchaudio 7 | 8 | from api import TextToSpeech, MODELS_DIR 9 | from utils.audio import load_audio, load_voices 10 | from utils.text import split_and_recombine_text 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="tortoise/data/riding_hood.txt") 16 | parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' 17 | 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat') 18 | parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/') 19 | parser.add_argument('--output_name', type=str, help='How to name the output file', default='combined.wav') 20 | parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard') 21 | parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None) 22 | parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.', default=1) 23 | parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. 
Tortoise automatically downloads these to .models, so this'
24 |                         ' should only be specified if you have custom checkpoints.', default=MODELS_DIR)
25 |     parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
26 |     parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
27 |     parser.add_argument('--use_deepspeed', type=bool, help='Use DeepSpeed to speed up inference.', default=False)
28 |     parser.add_argument('--kv_cache', type=bool, help='Key/value caching for the autoregressive model; if you disable this, expect to wait a long time for the output.', default=True)
29 |     parser.add_argument('--half', type=bool, help='Run inference in float16 (half) precision; faster and uses less VRAM and RAM.', default=True)
30 | 
31 | 
32 |     args = parser.parse_args()
33 |     if torch.backends.mps.is_available():
34 |         args.use_deepspeed = False
35 |     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
36 | 
37 |     outpath = args.output_path
38 |     outname = args.output_name
39 |     selected_voices = args.voice.split(',')
40 |     regenerate = args.regenerate
41 |     if regenerate is not None:
42 |         regenerate = [int(e) for e in regenerate.split(',')]
43 | 
44 |     # Process text
45 |     with open(args.textfile, 'r', encoding='utf-8') as f:
46 |         text = ' '.join([l for l in f.readlines()])
47 |     if '|' in text:
48 |         print("Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not "
49 |               "your intent, please remove all '|' characters from the input.")
50 |         texts = text.split('|')
51 |     else:
52 |         texts = split_and_recombine_text(text)
53 | 
54 |     seed = int(time()) if args.seed is None else args.seed
55 |     for selected_voice in selected_voices:
56 |         voice_outpath = os.path.join(outpath, selected_voice)
57 |         os.makedirs(voice_outpath, exist_ok=True)
58 | 
59 |         if '&' in selected_voice:
60 |             voice_sel = selected_voice.split('&')
61 |         else:
62 |             voice_sel = [selected_voice]
63 | 
64 |         voice_samples, conditioning_latents = load_voices(voice_sel)
65 |         all_parts = []
66 |         for j, text in enumerate(texts):
67 |             if regenerate is not None and j not in regenerate:
68 |                 all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
69 |                 continue
70 |             gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
71 |                                       preset=args.preset, k=args.candidates, use_deterministic_seed=seed)
72 |             if args.candidates == 1:
73 |                 audio_ = gen.squeeze(0).cpu()
74 |                 torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), audio_, 24000)
75 |             else:
76 |                 candidate_dir = os.path.join(voice_outpath, str(j))
77 |                 os.makedirs(candidate_dir, exist_ok=True)
78 |                 for k, g in enumerate(gen):
79 |                     torchaudio.save(os.path.join(candidate_dir, f'{k}.wav'), g.squeeze(0).cpu(), 24000)
80 |                 audio_ = gen[0].squeeze(0).cpu()
81 |             all_parts.append(audio_)
82 | 
83 |         if args.candidates == 1:
84 |             full_audio = torch.cat(all_parts, dim=-1)
85 |             torchaudio.save(os.path.join(voice_outpath, f"{outname}.wav"), full_audio, 24000)
86 | 
87 |         if args.produce_debug_state:
88 |             os.makedirs('debug_states', exist_ok=True)
89 |             dbg_state = (seed, texts, voice_samples, conditioning_latents)
90 |             torch.save(dbg_state, f'debug_states/read_debug_{selected_voice}.pth')
91 | 
92 |         # Combine each candidate's audio clips.
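        # For each candidate index, the per-line clips saved above are concatenated into a single <output_name>_<NN>.wav in that voice's output directory.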
93 | if args.candidates > 1: 94 | audio_clips = [] 95 | for candidate in range(args.candidates): 96 | for line in range(len(texts)): 97 | wav_file = os.path.join(voice_outpath, str(line), f"{candidate}.wav") 98 | audio_clips.append(load_audio(wav_file, 24000)) 99 | audio_clips = torch.cat(audio_clips, dim=-1) 100 | torchaudio.save(os.path.join(voice_outpath, f"{outname}_{candidate:02d}.wav"), audio_clips, 24000) 101 | audio_clips = [] 102 | -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__init__.py -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/audio.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/device.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/device.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/diffusion.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/diffusion.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/stft.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/stft.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/text.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/text.cpython-310.pyc -------------------------------------------------------------------------------- /tortoise/tortoise/utils/__pycache__/tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/tokenizer.cpython-310.pyc 
--------------------------------------------------------------------------------
/tortoise/tortoise/utils/__pycache__/torch_intermediary.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/torch_intermediary.cpython-310.pyc
--------------------------------------------------------------------------------
/tortoise/tortoise/utils/__pycache__/typical_sampling.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/typical_sampling.cpython-310.pyc
--------------------------------------------------------------------------------
/tortoise/tortoise/utils/__pycache__/wav2vec_alignment.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/utils/__pycache__/wav2vec_alignment.cpython-310.pyc
--------------------------------------------------------------------------------
/tortoise/tortoise/utils/audio.py:
--------------------------------------------------------------------------------
1 | import os
2 | from glob import glob
3 | 
4 | import librosa
5 | import torch
6 | import torchaudio
7 | import numpy as np
8 | from scipy.io.wavfile import read
9 | 
10 | from tortoise.utils.stft import STFT
11 | 
12 | 
13 | BUILTIN_VOICES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices')
14 | 
15 | 
16 | def load_wav_to_torch(full_path):
17 |     sampling_rate, data = read(full_path)
18 |     if data.dtype == np.int32:
19 |         norm_fix = 2 ** 31
20 |     elif data.dtype == np.int16:
21 |         norm_fix = 2 ** 15
22 |     elif data.dtype == np.float16 or data.dtype == np.float32:
23 |         norm_fix = 1.
24 |     else:
25 |         raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
26 |     return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
27 | 
28 | 
29 | def load_audio(audiopath, sampling_rate):
30 |     if audiopath[-4:] == '.wav':
31 |         audio, lsr = load_wav_to_torch(audiopath)
32 |     elif audiopath[-4:] == '.mp3':
33 |         audio, lsr = librosa.load(audiopath, sr=sampling_rate)
34 |         audio = torch.FloatTensor(audio)
35 |     else:
36 |         assert False, f"Unsupported audio format provided: {audiopath[-4:]}"
37 | 
38 |     # Remove any channel data.
39 |     if len(audio.shape) > 1:
40 |         if audio.shape[0] < 5:
41 |             audio = audio[0]
42 |         else:
43 |             assert audio.shape[1] < 5
44 |             audio = audio[:, 0]
45 | 
46 |     if lsr != sampling_rate:
47 |         audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
48 | 
49 |     # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
50 |     # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
51 |     if torch.any(audio > 2) or not torch.any(audio < 0):
52 |         print(f"Error with {audiopath}.
Max={audio.max()} min={audio.min()}") 53 | audio.clip_(-1, 1) 54 | 55 | return audio.unsqueeze(0) 56 | 57 | 58 | TACOTRON_MEL_MAX = 2.3143386840820312 59 | TACOTRON_MEL_MIN = -11.512925148010254 60 | 61 | 62 | def denormalize_tacotron_mel(norm_mel): 63 | return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN 64 | 65 | 66 | def normalize_tacotron_mel(mel): 67 | return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 68 | 69 | 70 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 71 | """ 72 | PARAMS 73 | ------ 74 | C: compression factor 75 | """ 76 | return torch.log(torch.clamp(x, min=clip_val) * C) 77 | 78 | 79 | def dynamic_range_decompression(x, C=1): 80 | """ 81 | PARAMS 82 | ------ 83 | C: compression factor used to compress 84 | """ 85 | return torch.exp(x) / C 86 | 87 | 88 | def get_voices(extra_voice_dirs=[]): 89 | dirs = [BUILTIN_VOICES_DIR] + extra_voice_dirs 90 | voices = {} 91 | for d in dirs: 92 | subs = os.listdir(d) 93 | for sub in subs: 94 | subj = os.path.join(d, sub) 95 | if os.path.isdir(subj): 96 | voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth')) 97 | return voices 98 | 99 | 100 | def load_voice(voice, extra_voice_dirs=[]): 101 | if voice == 'random': 102 | return None, None 103 | 104 | voices = get_voices(extra_voice_dirs) 105 | paths = voices[voice] 106 | if len(paths) == 1 and paths[0].endswith('.pth'): 107 | return None, torch.load(paths[0]) 108 | else: 109 | conds = [] 110 | for cond_path in paths: 111 | c = load_audio(cond_path, 22050) 112 | conds.append(c) 113 | return conds, None 114 | 115 | 116 | def load_voices(voices, extra_voice_dirs=[]): 117 | latents = [] 118 | clips = [] 119 | for voice in voices: 120 | if voice == 'random': 121 | if len(voices) > 1: 122 | print("Cannot combine a random voice with a non-random voice. Just using a random voice.") 123 | return None, None 124 | clip, latent = load_voice(voice, extra_voice_dirs) 125 | if latent is None: 126 | assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." 127 | clips.extend(clip) 128 | elif clip is None: 129 | assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." 
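# A voice stored as a single .pth file contributes a precomputed conditioning latent, whereas wav/mp3 voices contribute raw clips; the two kinds cannot be combined into one voice.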
130 | latents.append(latent) 131 | if len(latents) == 0: 132 | return clips, None 133 | else: 134 | latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0) 135 | latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0) 136 | latents = (latents_0,latents_1) 137 | return None, latents 138 | 139 | 140 | class TacotronSTFT(torch.nn.Module): 141 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 142 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 143 | mel_fmax=8000.0): 144 | super(TacotronSTFT, self).__init__() 145 | self.n_mel_channels = n_mel_channels 146 | self.sampling_rate = sampling_rate 147 | self.stft_fn = STFT(filter_length, hop_length, win_length) 148 | from librosa.filters import mel as librosa_mel_fn 149 | mel_basis = librosa_mel_fn( 150 | sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax) 151 | mel_basis = torch.from_numpy(mel_basis).float() 152 | self.register_buffer('mel_basis', mel_basis) 153 | 154 | def spectral_normalize(self, magnitudes): 155 | output = dynamic_range_compression(magnitudes) 156 | return output 157 | 158 | def spectral_de_normalize(self, magnitudes): 159 | output = dynamic_range_decompression(magnitudes) 160 | return output 161 | 162 | def mel_spectrogram(self, y): 163 | """Computes mel-spectrograms from a batch of waves 164 | PARAMS 165 | ------ 166 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 167 | 168 | RETURNS 169 | ------- 170 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 171 | """ 172 | assert(torch.min(y.data) >= -10) 173 | assert(torch.max(y.data) <= 10) 174 | y = torch.clip(y, min=-1, max=1) 175 | 176 | magnitudes, phases = self.stft_fn.transform(y) 177 | magnitudes = magnitudes.data 178 | mel_output = torch.matmul(self.mel_basis, magnitudes) 179 | mel_output = self.spectral_normalize(mel_output) 180 | return mel_output 181 | 182 | 183 | def wav_to_univnet_mel(wav, do_normalization=False, device='cuda' if not torch.backends.mps.is_available() else 'mps'): 184 | stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000) 185 | stft = stft.to(device) 186 | mel = stft.mel_spectrogram(wav) 187 | if do_normalization: 188 | mel = normalize_tacotron_mel(mel) 189 | return mel 190 | -------------------------------------------------------------------------------- /tortoise/tortoise/utils/device.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import psutil 3 | import importlib 4 | 5 | DEVICE_OVERRIDE = None 6 | DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)] 7 | 8 | from inspect import currentframe, getframeinfo 9 | import gc 10 | 11 | def do_gc(): 12 | gc.collect() 13 | try: 14 | torch.cuda.empty_cache() 15 | except Exception as e: 16 | pass 17 | 18 | def print_stats(collect=False): 19 | cf = currentframe().f_back 20 | msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}' 21 | 22 | if collect: 23 | do_gc() 24 | 25 | tot = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) 26 | res = torch.cuda.memory_reserved(0) / (1024 ** 3) 27 | alloc = torch.cuda.memory_allocated(0) / (1024 ** 3) 28 | print("[{}] Total: {:.3f} | Reserved: {:.3f} | Allocated: {:.3f} | Free: {:.3f}".format( msg, tot, res, alloc, tot-res )) 29 | 30 | 31 | def has_dml(): 32 | loader = importlib.find_loader('torch_directml') 33 | if loader is None: 34 | return False 35 | 36 | import torch_directml 37 | return torch_directml.is_available() 38 | 39 | def set_device_name(name): 40 | global 
DEVICE_OVERRIDE 41 | DEVICE_OVERRIDE = name 42 | 43 | def get_device_name(attempt_gc=True): 44 | global DEVICE_OVERRIDE 45 | if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "": 46 | return DEVICE_OVERRIDE 47 | 48 | name = 'cpu' 49 | 50 | if torch.cuda.is_available(): 51 | name = 'cuda' 52 | if attempt_gc: 53 | torch.cuda.empty_cache() # may have performance implications 54 | elif has_dml(): 55 | name = 'dml' 56 | 57 | return name 58 | 59 | def get_device(verbose=False): 60 | name = get_device_name() 61 | 62 | if verbose: 63 | if name == 'cpu': 64 | print("No hardware acceleration is available, falling back to CPU...") 65 | else: 66 | print(f"Hardware acceleration found: {name}") 67 | 68 | if name == "dml": 69 | import torch_directml 70 | return torch_directml.device() 71 | 72 | return torch.device(name) 73 | 74 | def get_device_vram( name=get_device_name() ): 75 | available = 1 76 | 77 | if name == "cuda": 78 | _, available = torch.cuda.mem_get_info() 79 | elif name == "cpu": 80 | available = psutil.virtual_memory()[4] 81 | 82 | return available / (1024 ** 3) 83 | 84 | def get_device_batch_size(name=get_device_name()): 85 | vram = get_device_vram(name) 86 | 87 | if vram > 14: 88 | return 16 89 | elif vram > 10: 90 | return 8 91 | elif vram > 7: 92 | return 4 93 | """ 94 | for k, v in DEVICE_BATCH_SIZE_MAP: 95 | if vram > k: 96 | return v 97 | """ 98 | return 1 99 | 100 | def get_device_count(name=get_device_name()): 101 | if name == "cuda": 102 | return torch.cuda.device_count() 103 | if name == "dml": 104 | import torch_directml 105 | return torch_directml.device_count() 106 | 107 | return 1 108 | 109 | 110 | # if you're getting errors make sure you've updated your torch-directml, and if you're still getting errors then you can uncomment the below block 111 | """ 112 | if has_dml(): 113 | _cumsum = torch.cumsum 114 | _repeat_interleave = torch.repeat_interleave 115 | _multinomial = torch.multinomial 116 | 117 | _Tensor_new = torch.Tensor.new 118 | _Tensor_cumsum = torch.Tensor.cumsum 119 | _Tensor_repeat_interleave = torch.Tensor.repeat_interleave 120 | _Tensor_multinomial = torch.Tensor.multinomial 121 | 122 | torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) ) 123 | torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) ) 124 | torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) ) 125 | 126 | torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) ) 127 | torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) ) 128 | torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) ) 129 | torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) ) 130 | """ -------------------------------------------------------------------------------- /tortoise/tortoise/utils/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 
6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | import librosa.util as librosa_util 40 | 41 | 42 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 43 | n_fft=800, dtype=np.float32, norm=None): 44 | """ 45 | # from librosa 0.6 46 | Compute the sum-square envelope of a window function at a given hop length. 47 | 48 | This is used to estimate modulation effects induced by windowing 49 | observations in short-time fourier transforms. 50 | 51 | Parameters 52 | ---------- 53 | window : string, tuple, number, callable, or list-like 54 | Window specification, as in `get_window` 55 | 56 | n_frames : int > 0 57 | The number of analysis frames 58 | 59 | hop_length : int > 0 60 | The number of samples to advance between frames 61 | 62 | win_length : [optional] 63 | The length of the window function. By default, this matches `n_fft`. 64 | 65 | n_fft : int > 0 66 | The length of each analysis frame. 
67 | 68 | dtype : np.dtype 69 | The data type of the output 70 | 71 | Returns 72 | ------- 73 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 74 | The sum-squared envelope of the window function 75 | """ 76 | if win_length is None: 77 | win_length = n_fft 78 | 79 | n = n_fft + hop_length * (n_frames - 1) 80 | x = np.zeros(n, dtype=dtype) 81 | 82 | # Compute the squared window at the desired length 83 | win_sq = get_window(window, win_length, fftbins=True) 84 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 85 | win_sq = librosa_util.pad_center(win_sq, n_fft) 86 | 87 | # Fill the envelope 88 | for i in range(n_frames): 89 | sample = i * hop_length 90 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 91 | return x 92 | 93 | 94 | class STFT(torch.nn.Module): 95 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 96 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 97 | window='hann'): 98 | super(STFT, self).__init__() 99 | self.filter_length = filter_length 100 | self.hop_length = hop_length 101 | self.win_length = win_length 102 | self.window = window 103 | self.forward_transform = None 104 | scale = self.filter_length / self.hop_length 105 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 106 | 107 | cutoff = int((self.filter_length / 2 + 1)) 108 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 109 | np.imag(fourier_basis[:cutoff, :])]) 110 | 111 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 112 | inverse_basis = torch.FloatTensor( 113 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 114 | 115 | if window is not None: 116 | assert(filter_length >= win_length) 117 | # get window and zero center pad it to filter_length 118 | fft_window = get_window(window, win_length, fftbins=True) 119 | fft_window = pad_center(fft_window, size=filter_length) 120 | fft_window = torch.from_numpy(fft_window).float() 121 | 122 | # window the bases 123 | forward_basis *= fft_window 124 | inverse_basis *= fft_window 125 | 126 | self.register_buffer('forward_basis', forward_basis.float()) 127 | self.register_buffer('inverse_basis', inverse_basis.float()) 128 | 129 | def transform(self, input_data): 130 | num_batches = input_data.size(0) 131 | num_samples = input_data.size(1) 132 | 133 | self.num_samples = num_samples 134 | 135 | # similar to librosa, reflect-pad the input 136 | input_data = input_data.view(num_batches, 1, num_samples) 137 | input_data = F.pad( 138 | input_data.unsqueeze(1), 139 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 140 | mode='reflect') 141 | input_data = input_data.squeeze(1) 142 | 143 | forward_transform = F.conv1d( 144 | input_data, 145 | Variable(self.forward_basis, requires_grad=False), 146 | stride=self.hop_length, 147 | padding=0) 148 | 149 | cutoff = int((self.filter_length / 2) + 1) 150 | real_part = forward_transform[:, :cutoff, :] 151 | imag_part = forward_transform[:, cutoff:, :] 152 | 153 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 154 | phase = torch.autograd.Variable( 155 | torch.atan2(imag_part.data, real_part.data)) 156 | 157 | return magnitude, phase 158 | 159 | def inverse(self, magnitude, phase): 160 | recombine_magnitude_phase = torch.cat( 161 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 162 | 163 | inverse_transform = F.conv_transpose1d( 164 | recombine_magnitude_phase, 165 | Variable(self.inverse_basis, requires_grad=False), 166 | stride=self.hop_length, 167 | 
padding=0) 168 | 169 | if self.window is not None: 170 | window_sum = window_sumsquare( 171 | self.window, magnitude.size(-1), hop_length=self.hop_length, 172 | win_length=self.win_length, n_fft=self.filter_length, 173 | dtype=np.float32) 174 | # remove modulation effects 175 | approx_nonzero_indices = torch.from_numpy( 176 | np.where(window_sum > tiny(window_sum))[0]) 177 | window_sum = torch.autograd.Variable( 178 | torch.from_numpy(window_sum), requires_grad=False) 179 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 180 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 181 | 182 | # scale by hop ratio 183 | inverse_transform *= float(self.filter_length) / self.hop_length 184 | 185 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 186 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 187 | 188 | return inverse_transform 189 | 190 | def forward(self, input_data): 191 | self.magnitude, self.phase = self.transform(input_data) 192 | reconstruction = self.inverse(self.magnitude, self.phase) 193 | return reconstruction -------------------------------------------------------------------------------- /tortoise/tortoise/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def split_and_recombine_text(text, desired_length=200, max_length=300): 5 | """Split text it into chunks of a desired length trying to keep sentences intact.""" 6 | # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii 7 | text = re.sub(r'\n\n+', '\n', text) 8 | text = re.sub(r'\s+', ' ', text) 9 | text = re.sub(r'[“”]', '"', text) 10 | 11 | rv = [] 12 | in_quote = False 13 | current = "" 14 | split_pos = [] 15 | pos = -1 16 | end_pos = len(text) - 1 17 | 18 | def seek(delta): 19 | nonlocal pos, in_quote, current 20 | is_neg = delta < 0 21 | for _ in range(abs(delta)): 22 | if is_neg: 23 | pos -= 1 24 | current = current[:-1] 25 | else: 26 | pos += 1 27 | current += text[pos] 28 | if text[pos] == '"': 29 | in_quote = not in_quote 30 | return text[pos] 31 | 32 | def peek(delta): 33 | p = pos + delta 34 | return text[p] if p < end_pos and p >= 0 else "" 35 | 36 | def commit(): 37 | nonlocal rv, current, split_pos 38 | rv.append(current) 39 | current = "" 40 | split_pos = [] 41 | 42 | while pos < end_pos: 43 | c = seek(1) 44 | # do we need to force a split? 45 | if len(current) >= max_length: 46 | if len(split_pos) > 0 and len(current) > (desired_length / 2): 47 | # we have at least one sentence and we are over half the desired length, seek back to the last split 48 | d = pos - split_pos[-1] 49 | seek(-d) 50 | else: 51 | # no full sentences, seek back until we are not in the middle of a word and split there 52 | while c not in '!?.\n ' and pos > 0 and len(current) > desired_length: 53 | c = seek(-1) 54 | commit() 55 | # check for sentence boundaries 56 | elif not in_quote and (c in '!?\n' or (c == '.' 
and peek(1) in '\n ')): 57 | # seek forward if we have consecutive boundary markers but still within the max length 58 | while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.': 59 | c = seek(1) 60 | split_pos.append(pos) 61 | if len(current) >= desired_length: 62 | commit() 63 | # treat end of quote as a boundary if its followed by a space or newline 64 | elif in_quote and peek(1) == '"' and peek(2) in '\n ': 65 | seek(2) 66 | split_pos.append(pos) 67 | rv.append(current) 68 | 69 | # clean up, remove lines with only whitespace or punctuation 70 | rv = [s.strip() for s in rv] 71 | rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)] 72 | 73 | return rv 74 | 75 | 76 | if __name__ == '__main__': 77 | import os 78 | import unittest 79 | 80 | class Test(unittest.TestCase): 81 | def test_split_and_recombine_text(self): 82 | text = """ 83 | This is a sample sentence. 84 | This is another sample sentence. 85 | This is a longer sample sentence that should force a split inthemiddlebutinotinthislongword. 86 | "Don't split my quote... please" 87 | """ 88 | self.assertEqual(split_and_recombine_text(text, desired_length=20, max_length=40), 89 | ['This is a sample sentence.', 90 | 'This is another sample sentence.', 91 | 'This is a longer sample sentence that', 92 | 'should force a split', 93 | 'inthemiddlebutinotinthislongword.', 94 | '"Don\'t split my quote... please"']) 95 | 96 | def test_split_and_recombine_text_2(self): 97 | text = """ 98 | When you are really angry sometimes you use consecutive exclamation marks!!!!!! Is this a good thing to do?!?!?! 99 | I don't know but we should handle this situation.......................... 100 | """ 101 | self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50), 102 | ['When you are really angry sometimes you use', 103 | 'consecutive exclamation marks!!!!!!', 104 | 'Is this a good thing to do?!?!?!', 105 | 'I don\'t know but we should handle this situation.']) 106 | 107 | def test_split_and_recombine_text_3(self): 108 | text_src = os.path.join(os.path.dirname(__file__), '../data/riding_hood.txt') 109 | with open(text_src, 'r') as f: 110 | text = f.read() 111 | self.assertEqual( 112 | split_and_recombine_text(text), 113 | [ 114 | 'Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her.', 115 | 'It suited the girl so extremely well that everybody called her Little Red Riding Hood. One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter."', 116 | 'Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest.', 117 | 'He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." "Does she live far off?" said the wolf "Oh I say,"', 118 | 'answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." 
"Well," said the wolf, "and I\'ll go and see her too. I\'ll go this way and go you that, and we shall see who will be there first."', 119 | 'The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers.', 120 | 'It was not long before the wolf arrived at the old woman\'s house. He knocked at the door: tap, tap. "Who\'s there?" "Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother."', 121 | 'The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up."', 122 | 'The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten.', 123 | 'He then shut the door and got into the grandmother\'s bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. "Who\'s there?"', 124 | 'Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you."', 125 | 'The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." Little Red Riding Hood pulled the bobbin, and the door opened.', 126 | 'The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." Little Red Riding Hood took off her clothes and got into bed.', 127 | 'She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" "All the better to hug you with, my dear." "Grandmother, what big legs you have!" "All the better to run with, my child." "Grandmother, what big ears you have!"', 128 | '"All the better to hear with, my child." "Grandmother, what big eyes you have!" "All the better to see with, my child." "Grandmother, what big teeth you have got!" "All the better to eat you up with." And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up.', 129 | ] 130 | ) 131 | 132 | unittest.main() 133 | -------------------------------------------------------------------------------- /tortoise/tortoise/utils/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import inflect 5 | import torch 6 | from tokenizers import Tokenizer 7 | 8 | 9 | # Regular expression matching whitespace: 10 | from unidecode import unidecode 11 | 12 | _whitespace_re = re.compile(r'\s+') 13 | 14 | 15 | # List of (regular expression, replacement) pairs for abbreviations: 16 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 17 | ('mrs', 'misess'), 18 | ('mr', 'mister'), 19 | ('dr', 'doctor'), 20 | ('st', 'saint'), 21 | ('co', 'company'), 22 | ('jr', 'junior'), 23 | ('maj', 'major'), 24 | ('gen', 'general'), 25 | ('drs', 'doctors'), 26 | ('rev', 'reverend'), 27 | ('lt', 'lieutenant'), 28 | ('hon', 'honorable'), 29 | ('sgt', 'sergeant'), 30 | ('capt', 'captain'), 31 | ('esq', 'esquire'), 32 | ('ltd', 'limited'), 33 | ('col', 'colonel'), 34 | ('ft', 'fort'), 35 | ]] 36 | 37 | 38 | def expand_abbreviations(text): 39 | for regex, replacement in _abbreviations: 40 | text = re.sub(regex, replacement, text) 41 | return text 42 | 43 | 44 | _inflect = inflect.engine() 45 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 46 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 47 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 48 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 49 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 50 | _number_re = re.compile(r'[0-9]+') 51 | 52 | 53 | def _remove_commas(m): 54 | return m.group(1).replace(',', '') 55 | 56 | 57 | def _expand_decimal_point(m): 58 | return m.group(1).replace('.', ' point ') 59 | 60 | 61 | def _expand_dollars(m): 62 | match = m.group(1) 63 | parts = match.split('.') 64 | if len(parts) > 2: 65 | return match + ' dollars' # Unexpected format 66 | dollars = int(parts[0]) if parts[0] else 0 67 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 68 | if dollars and cents: 69 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 70 | cent_unit = 'cent' if cents == 1 else 'cents' 71 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 72 | elif dollars: 73 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 74 | return '%s %s' % (dollars, dollar_unit) 75 | elif cents: 76 | cent_unit = 'cent' if cents == 1 else 'cents' 77 | return '%s %s' % (cents, cent_unit) 78 | else: 79 | return 'zero dollars' 80 | 81 | 82 | def _expand_ordinal(m): 83 | return _inflect.number_to_words(m.group(0)) 84 | 85 | 86 | def _expand_number(m): 87 | num = int(m.group(0)) 88 | if num > 1000 and num < 3000: 89 | if num == 2000: 90 | return 'two thousand' 91 | elif num > 2000 and num < 2010: 92 | return 'two thousand ' + _inflect.number_to_words(num % 100) 93 | elif num % 100 == 0: 94 | return _inflect.number_to_words(num // 100) + ' hundred' 95 | else: 96 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 97 | else: 98 | return _inflect.number_to_words(num, andword='') 99 | 100 | 101 | def normalize_numbers(text): 102 | text = re.sub(_comma_number_re, _remove_commas, text) 103 | text = re.sub(_pounds_re, r'\1 pounds', text) 104 | text = re.sub(_dollars_re, _expand_dollars, text) 105 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 106 | text = re.sub(_ordinal_re, _expand_ordinal, text) 107 | text = re.sub(_number_re, _expand_number, text) 108 | return text 109 | 110 | 111 | def expand_numbers(text): 112 | return normalize_numbers(text) 113 | 114 | 115 | def lowercase(text): 116 | return text.lower() 117 | 118 | 119 | def collapse_whitespace(text): 120 | return re.sub(_whitespace_re, ' ', text) 121 | 122 | 123 | def convert_to_ascii(text): 124 | return unidecode(text) 125 | 126 | 127 | def basic_cleaners(text): 128 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 129 | text = lowercase(text) 130 | text = collapse_whitespace(text) 131 | return text 132 | 133 | 134 | def transliteration_cleaners(text): 135 | '''Pipeline 
for non-English text that transliterate to ASCII.''' 136 | text = convert_to_ascii(text) 137 | text = lowercase(text) 138 | text = collapse_whitespace(text) 139 | return text 140 | 141 | 142 | def english_cleaners(text): 143 | '''Pipeline for English text, including number and abbreviation expansion.''' 144 | text = convert_to_ascii(text) 145 | text = lowercase(text) 146 | text = expand_numbers(text) 147 | text = expand_abbreviations(text) 148 | text = collapse_whitespace(text) 149 | text = text.replace('"', '') 150 | return text 151 | 152 | 153 | def lev_distance(s1, s2): 154 | if len(s1) > len(s2): 155 | s1, s2 = s2, s1 156 | 157 | distances = range(len(s1) + 1) 158 | for i2, c2 in enumerate(s2): 159 | distances_ = [i2 + 1] 160 | for i1, c1 in enumerate(s1): 161 | if c1 == c2: 162 | distances_.append(distances[i1]) 163 | else: 164 | distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) 165 | distances = distances_ 166 | return distances[-1] 167 | 168 | 169 | DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/tokenizer.json') 170 | 171 | 172 | class VoiceBpeTokenizer: 173 | def __init__(self, vocab_file=DEFAULT_VOCAB_FILE): 174 | if vocab_file is not None: 175 | self.tokenizer = Tokenizer.from_file(vocab_file) 176 | 177 | def preprocess_text(self, txt): 178 | txt = english_cleaners(txt) 179 | return txt 180 | 181 | def encode(self, txt): 182 | txt = self.preprocess_text(txt) 183 | txt = txt.replace(' ', '[SPACE]') 184 | return self.tokenizer.encode(txt).ids 185 | 186 | def decode(self, seq): 187 | if isinstance(seq, torch.Tensor): 188 | seq = seq.cpu().numpy() 189 | txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '') 190 | txt = txt.replace('[SPACE]', ' ') 191 | txt = txt.replace('[STOP]', '') 192 | txt = txt.replace('[UNK]', '') 193 | return txt 194 | -------------------------------------------------------------------------------- /tortoise/tortoise/utils/torch_intermediary.py: -------------------------------------------------------------------------------- 1 | """ 2 | from bitsandbytes.nn import Linear8bitLt as Linear 3 | from bitsandbytes.nn import StableEmbedding as Embedding 4 | from bitsandbytes.optim.adam import Adam8bit as Adam 5 | from bitsandbytes.optim.adamw import AdamW8bit as AdamW 6 | """ 7 | """ 8 | from torch.nn import Linear 9 | from torch.nn import Embedding 10 | from torch.optim.adam import Adam 11 | from torch.optim.adamw import AdamW 12 | """ 13 | 14 | """ 15 | OVERRIDE_LINEAR = False 16 | OVERRIDE_EMBEDDING = False 17 | OVERRIDE_ADAM = False # True 18 | OVERRIDE_ADAMW = False # True 19 | """ 20 | 21 | import os 22 | 23 | USE_STABLE_EMBEDDING = False 24 | try: 25 | OVERRIDE_LINEAR = False 26 | OVERRIDE_EMBEDDING = False 27 | OVERRIDE_ADAM = False 28 | OVERRIDE_ADAMW = False 29 | 30 | USE_STABLE_EMBEDDING = os.environ.get('BITSANDBYTES_USE_STABLE_EMBEDDING', '1' if USE_STABLE_EMBEDDING else '0') == '1' 31 | OVERRIDE_LINEAR = os.environ.get('BITSANDBYTES_OVERRIDE_LINEAR', '1' if OVERRIDE_LINEAR else '0') == '1' 32 | OVERRIDE_EMBEDDING = os.environ.get('BITSANDBYTES_OVERRIDE_EMBEDDING', '1' if OVERRIDE_EMBEDDING else '0') == '1' 33 | OVERRIDE_ADAM = os.environ.get('BITSANDBYTES_OVERRIDE_ADAM', '1' if OVERRIDE_ADAM else '0') == '1' 34 | OVERRIDE_ADAMW = os.environ.get('BITSANDBYTES_OVERRIDE_ADAMW', '1' if OVERRIDE_ADAMW else '0') == '1' 35 | 36 | if OVERRIDE_LINEAR or OVERRIDE_EMBEDDING or OVERRIDE_ADAM or OVERRIDE_ADAMW: 37 | import bitsandbytes as bnb 38 | except Exception as e: 39 | 
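# bitsandbytes is unavailable or failed to import; switch every override off so the
# stock torch Linear/Embedding/Adam/AdamW implementations are used below instead.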
OVERRIDE_LINEAR = False 40 | OVERRIDE_EMBEDDING = False 41 | OVERRIDE_ADAM = False 42 | OVERRIDE_ADAMW = False 43 | 44 | if OVERRIDE_LINEAR: 45 | from bitsandbytes.nn import Linear8bitLt as Linear 46 | else: 47 | from torch.nn import Linear 48 | 49 | if OVERRIDE_EMBEDDING: 50 | if USE_STABLE_EMBEDDING: 51 | from bitsandbytes.nn import StableEmbedding as Embedding 52 | else: 53 | from bitsandbytes.nn.modules import Embedding as Embedding 54 | else: 55 | from torch.nn import Embedding 56 | 57 | if OVERRIDE_ADAM: 58 | from bitsandbytes.optim.adam import Adam8bit as Adam 59 | else: 60 | from torch.optim.adam import Adam 61 | 62 | if OVERRIDE_ADAMW: 63 | from bitsandbytes.optim.adamw import AdamW8bit as AdamW 64 | else: 65 | from torch.optim.adamw import AdamW -------------------------------------------------------------------------------- /tortoise/tortoise/utils/typical_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LogitsWarper 3 | 4 | 5 | class TypicalLogitsWarper(LogitsWarper): 6 | def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): 7 | self.filter_value = filter_value 8 | self.mass = mass 9 | self.min_tokens_to_keep = min_tokens_to_keep 10 | 11 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: 12 | # calculate entropy 13 | normalized = torch.nn.functional.log_softmax(scores, dim=-1) 14 | p = torch.exp(normalized) 15 | ent = -(normalized * p).nansum(-1, keepdim=True) 16 | 17 | # shift and sort 18 | shifted_scores = torch.abs((-normalized) - ent) 19 | sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) 20 | sorted_logits = scores.gather(-1, sorted_indices) 21 | cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) 22 | 23 | # Remove tokens with cumulative mass above the threshold 24 | last_ind = (cumulative_probs < self.mass).sum(dim=1) 25 | last_ind[last_ind < 0] = 0 26 | sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1)) 27 | if self.min_tokens_to_keep > 1: 28 | # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) 29 | sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 30 | indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) 31 | 32 | scores = scores.masked_fill(indices_to_remove, self.filter_value) 33 | return scores -------------------------------------------------------------------------------- /tortoise/tortoise/utils/wav2vec_alignment.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import torch 4 | import torchaudio 5 | from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor 6 | 7 | from tortoise.utils.audio import load_audio 8 | 9 | 10 | def max_alignment(s1, s2, skip_character='~', record=None): 11 | """ 12 | A clever function that aligns s1 to s2 as best it can. Wherever a character from s1 is not found in s2, a '~' is 13 | used to replace that character. 14 | 15 | Finally got to use my DP skills! 
16 | """ 17 | if record is None: 18 | record = {} 19 | assert skip_character not in s1, f"Found the skip character {skip_character} in the provided string, {s1}" 20 | if len(s1) == 0: 21 | return '' 22 | if len(s2) == 0: 23 | return skip_character * len(s1) 24 | if s1 == s2: 25 | return s1 26 | if s1[0] == s2[0]: 27 | return s1[0] + max_alignment(s1[1:], s2[1:], skip_character, record) 28 | 29 | take_s1_key = (len(s1), len(s2) - 1) 30 | if take_s1_key in record: 31 | take_s1, take_s1_score = record[take_s1_key] 32 | else: 33 | take_s1 = max_alignment(s1, s2[1:], skip_character, record) 34 | take_s1_score = len(take_s1.replace(skip_character, '')) 35 | record[take_s1_key] = (take_s1, take_s1_score) 36 | 37 | take_s2_key = (len(s1) - 1, len(s2)) 38 | if take_s2_key in record: 39 | take_s2, take_s2_score = record[take_s2_key] 40 | else: 41 | take_s2 = max_alignment(s1[1:], s2, skip_character, record) 42 | take_s2_score = len(take_s2.replace(skip_character, '')) 43 | record[take_s2_key] = (take_s2, take_s2_score) 44 | 45 | return take_s1 if take_s1_score > take_s2_score else skip_character + take_s2 46 | 47 | 48 | class Wav2VecAlignment: 49 | """ 50 | Uses wav2vec2 to perform audio<->text alignment. 51 | """ 52 | def __init__(self, device='cuda' if not torch.backends.mps.is_available() else 'mps'): 53 | self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu() 54 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h") 55 | self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron-symbols') 56 | self.device = device 57 | 58 | def align(self, audio, expected_text, audio_sample_rate=24000): 59 | orig_len = audio.shape[-1] 60 | 61 | with torch.no_grad(): 62 | self.model = self.model.to(self.device) 63 | audio = audio.to(self.device) 64 | audio = torchaudio.functional.resample(audio, audio_sample_rate, 16000) 65 | clip_norm = (audio - audio.mean()) / torch.sqrt(audio.var() + 1e-7) 66 | logits = self.model(clip_norm).logits 67 | self.model = self.model.cpu() 68 | 69 | logits = logits[0] 70 | pred_string = self.tokenizer.decode(logits.argmax(-1).tolist()) 71 | 72 | fixed_expectation = max_alignment(expected_text.lower(), pred_string) 73 | w2v_compression = orig_len // logits.shape[0] 74 | expected_tokens = self.tokenizer.encode(fixed_expectation) 75 | expected_chars = list(fixed_expectation) 76 | if len(expected_tokens) == 1: 77 | return [0] # The alignment is simple; there is only one token. 78 | expected_tokens.pop(0) # The first token is a given. 
79 | expected_chars.pop(0) 80 | 81 | alignments = [0] 82 | def pop_till_you_win(): 83 | if len(expected_tokens) == 0: 84 | return None 85 | popped = expected_tokens.pop(0) 86 | popped_char = expected_chars.pop(0) 87 | while popped_char == '~': 88 | alignments.append(-1) 89 | if len(expected_tokens) == 0: 90 | return None 91 | popped = expected_tokens.pop(0) 92 | popped_char = expected_chars.pop(0) 93 | return popped 94 | 95 | next_expected_token = pop_till_you_win() 96 | for i, logit in enumerate(logits): 97 | top = logit.argmax() 98 | if next_expected_token == top: 99 | alignments.append(i * w2v_compression) 100 | if len(expected_tokens) > 0: 101 | next_expected_token = pop_till_you_win() 102 | else: 103 | break 104 | 105 | pop_till_you_win() 106 | if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)): 107 | torch.save([audio, expected_text], 'alignment_debug.pth') 108 | assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to" \ 109 | "your current working directory. Please report this along with the file so it can get fixed." 110 | 111 | # Now fix up alignments. Anything with -1 should be interpolated. 112 | alignments.append(orig_len) # This'll get removed but makes the algorithm below more readable. 113 | for i in range(len(alignments)): 114 | if alignments[i] == -1: 115 | for j in range(i+1, len(alignments)): 116 | if alignments[j] != -1: 117 | next_found_token = j 118 | break 119 | for j in range(i, next_found_token): 120 | gap = alignments[next_found_token] - alignments[i-1] 121 | alignments[j] = (j-i+1) * gap // (next_found_token-i+1) + alignments[i-1] 122 | 123 | return alignments[:-1] 124 | 125 | def redact(self, audio, expected_text, audio_sample_rate=24000): 126 | if '[' not in expected_text: 127 | return audio 128 | splitted = expected_text.split('[') 129 | fully_split = [splitted[0]] 130 | for spl in splitted[1:]: 131 | assert ']' in spl, 'Every "[" character must be paired with a "]" with no nesting.' 132 | fully_split.extend(spl.split(']')) 133 | 134 | # At this point, fully_split is a list of strings, with every other string being something that should be redacted. 
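# For example: 'keep this [redact me] and this too' -> ['keep this ', 'redact me', ' and this too']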
135 | non_redacted_intervals = [] 136 | last_point = 0 137 | for i in range(len(fully_split)): 138 | if i % 2 == 0 and fully_split[i] != "": # Check for empty string fixes index error 139 | end_interval = max(0, last_point + len(fully_split[i]) - 1) 140 | non_redacted_intervals.append((last_point, end_interval)) 141 | last_point += len(fully_split[i]) 142 | 143 | bare_text = ''.join(fully_split) 144 | alignments = self.align(audio, bare_text, audio_sample_rate) 145 | 146 | output_audio = [] 147 | for nri in non_redacted_intervals: 148 | start, stop = nri 149 | output_audio.append(audio[:, alignments[start]:alignments[stop]]) 150 | return torch.cat(output_audio, dim=-1) 151 | -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_00.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_01.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_02.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_02.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_03.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_03.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_04.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_04.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_05.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_05.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_06.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_06.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_07.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_07.wav 
-------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_08.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_08.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_09.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_09.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_1.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_10.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_2.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_3.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_4.wav -------------------------------------------------------------------------------- /tortoise/tortoise/voices/sp1/sp1_5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SicariusSicariiStuff/Diffusion_TTS/6acfc721943f40dd14f6ded5f330834da727fcf0/tortoise/tortoise/voices/sp1/sp1_5.wav -------------------------------------------------------------------------------- /tts_preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from num2words import num2words 4 | 5 | punctuation = r'[\s,.?!/)\'\]>]' 6 | alphabet_map = { 7 | "A": " Eh ", 8 | "B": " Bee ", 9 | "C": " See ", 10 | "D": " Dee ", 11 | "E": " Eee ", 12 | "F": " Eff ", 13 | "G": " Jee ", 14 | "H": " Eich ", 15 | "I": " Eye ", 16 | "J": " Jay ", 17 | "K": " Kay ", 18 | "L": " El ", 19 | "M": " Emm ", 20 | "N": " Enn ", 21 | "O": " Ohh ", 22 | "P": " Pee ", 23 | "Q": " Queue ", 24 | "R": " Are ", 25 | "S": " Ess ", 26 | "T": " Tee ", 27 | "U": " You ", 28 | "V": " Vee ", 29 | "W": " 
Double You ", 30 | "X": " Ex ", 31 | "Y": " Why ", 32 | "Z": " Zed " 33 | } 34 | 35 | 36 | def preprocess_all(string): 37 | # the order for some of these matter 38 | # For example, you need to remove the commas in numbers before expanding them 39 | string = replace_invalid_chars(string) 40 | string = replace_numbers(string) 41 | 42 | # TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually 43 | # try to say the abbreviation or spell it out as I've done below is not agreed upon 44 | 45 | # For now, expand abbreviations to pronunciations 46 | # replace_abbreviations adds a lot of unnecessary whitespace to ensure separation 47 | string = replace_abbreviations(string) 48 | 49 | # cleanup whitespaces 50 | string = clean_whitespace(string) 51 | 52 | return string 53 | 54 | 55 | def replace_invalid_chars(string): 56 | string = remove_surrounded_chars(string) 57 | string = string.replace('"', '') 58 | string = string.replace('`', '') 59 | string = string.replace("'","") 60 | string = string.replace('\u201D', '').replace('\u201C', '') # right and left quote 61 | string = string.replace('\u201F', '') # italic looking quote 62 | string = string.replace('\n', ' ') 63 | string = string.replace(''', '') 64 | string = string.replace('AI;', 'Artificial Intelligence!') 65 | string = string.replace('iddqd;', 'Immortality cheat code') 66 | string = string.replace('😉;', 'wink wink!') 67 | string = string.replace(';);', 'wink wink!') 68 | string = string.replace(';-);', 'wink wink!') 69 | string = string.replace(':D', '*laughs* Ahahaha!') 70 | string = string.replace(';D', '*laughs* Ahahaha!') 71 | string = string.replace(':-D', '*laughs* Ahahaha!') 72 | return string 73 | 74 | 75 | def replace_numbers(string): 76 | string = convert_num_locale(string) 77 | string = replace_negative(string) 78 | string = replace_roman(string) 79 | string = hyphen_range_to(string) 80 | string = num_to_words(string) 81 | return string 82 | 83 | 84 | def remove_surrounded_chars(string): 85 | # first this expression will check if there is a string nested exclusively between a alt= 86 | # and a style= string. This would correspond to only a the alt text of an embedded image 87 | # If it matches it will only keep that part as the string, and rend it for further processing 88 | # Afterwards this expression matches to 'as few symbols as possible (0 upwards) between any 89 | # asterisks' OR' as few symbols as possible (0 upwards) between an asterisk and the end of the string' 90 | if re.search(r'(?<=alt=)(.*)(?=style=)', string, re.DOTALL): 91 | m = re.search(r'(?<=alt=)(.*)(?=style=)', string, re.DOTALL) 92 | string = m.group(0) 93 | return re.sub(r'\*[^*]*?(\*|$)', '', string) 94 | 95 | 96 | def convert_num_locale(text): 97 | # This detects locale and converts it to American without comma separators 98 | pattern = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)') 99 | result = text 100 | while True: 101 | match = pattern.search(result) 102 | if match is None: 103 | break 104 | 105 | start = match.start() 106 | end = match.end() 107 | result = result[0:start] + result[start:end].replace('.', '').replace(',', '.') + result[end:len(result)] 108 | 109 | # removes comma separators from existing American numbers 110 | pattern = re.compile(r'(\d),(\d)') 111 | result = pattern.sub(r'\1\2', result) 112 | 113 | return result 114 | 115 | 116 | def replace_negative(string): 117 | # handles situations like -5. 
-5 would become negative 5, which would then be expanded to negative five 118 | return re.sub(rf'(\s)(-)(\d+)({punctuation})', r'\1negative \3\4', string) 119 | 120 | 121 | def replace_roman(string): 122 | # find a string of roman numerals. 123 | # Only 2 or more, to avoid capturing I and single character abbreviations, like names 124 | pattern = re.compile(rf'\s[IVXLCDM]{{2,}}{punctuation}') 125 | result = string 126 | while True: 127 | match = pattern.search(result) 128 | if match is None: 129 | break 130 | 131 | start = match.start() 132 | end = match.end() 133 | result = result[0:start + 1] + str(roman_to_int(result[start + 1:end - 1])) + result[end - 1:len(result)] 134 | 135 | return result 136 | 137 | 138 | def roman_to_int(s): 139 | rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} 140 | int_val = 0 141 | for i in range(len(s)): 142 | if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]: 143 | int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]] 144 | else: 145 | int_val += rom_val[s[i]] 146 | return int_val 147 | 148 | 149 | def hyphen_range_to(text): 150 | pattern = re.compile(r'(\d+)[-–](\d+)') 151 | result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text) 152 | return result 153 | 154 | 155 | def num_to_words(text): 156 | # 1000 or 10.23 157 | pattern = re.compile(r'\d+\.\d+|\d+') 158 | result = pattern.sub(lambda x: num2words(float(x.group())), text) 159 | return result 160 | 161 | 162 | def replace_abbreviations(string): 163 | string = replace_uppercase_abbreviations(string) 164 | string = replace_lowercase_abbreviations(string) 165 | return string 166 | 167 | 168 | def replace_uppercase_abbreviations(string): 169 | # abbreviations 1 to 4 characters long. It will get things like A and I, but those are pronounced with their letter 170 | pattern = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)') 171 | result = string 172 | while True: 173 | match = pattern.search(result) 174 | if match is None: 175 | break 176 | 177 | start = match.start() 178 | end = match.end() 179 | result = result[0:start] + replace_abbreviation(result[start:end]) + result[end:len(result)] 180 | 181 | return result 182 | 183 | 184 | def replace_lowercase_abbreviations(string): 185 | # abbreviations 1 to 4 characters long, separated by dots i.e. e.g. 186 | pattern = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)') 187 | result = string 188 | while True: 189 | match = pattern.search(result) 190 | if match is None: 191 | break 192 | 193 | start = match.start() 194 | end = match.end() 195 | result = result[0:start] + replace_abbreviation(result[start:end].upper()) + result[end:len(result)] 196 | 197 | return result 198 | 199 | 200 | def replace_abbreviation(string): 201 | result = "" 202 | for char in string: 203 | result += match_mapping(char) 204 | 205 | return result 206 | 207 | 208 | def match_mapping(char): 209 | for mapping in alphabet_map.keys(): 210 | if char == mapping: 211 | return alphabet_map[char] 212 | 213 | return char 214 | 215 | 216 | def clean_whitespace(string): 217 | # remove whitespace before punctuation 218 | string = re.sub(rf'\s+({punctuation})', r'\1', string) 219 | string = string.strip() 220 | # compact whitespace 221 | string = ' '.join(string.split()) 222 | return string 223 | 224 | 225 | def __main__(args): 226 | print(preprocess_all(args[1])) 227 | 228 | 229 | if __name__ == "__main__": 230 | import sys 231 | __main__(sys.argv) 232 | --------------------------------------------------------------------------------
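A minimal usage sketch for the preprocessor above, added for illustration rather than taken from the repository. It assumes the module is importable as tts_preprocessor and that num2words is installed; the exact wording of the expansion depends on those libraries and on the alphabet_map entries.

    from tts_preprocessor import preprocess_all

    # Roman numerals, digit ranges and upper-case abbreviations are expanded
    # into speakable words before the text reaches the TTS engine.
    text = "Chapter XIV covers pages 10-12 of the NASA report."
    print(preprocess_all(text))
    # roughly: "Chapter fourteen covers pages ten to twelve of the Enn Eh Ess Eh report."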