├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── setup.py
├── spear-tts.png
└── spear_tts_pytorch
    ├── __init__.py
    ├── attend.py
    ├── data.py
    ├── distributed.py
    ├── spear_tts_pytorch.py
    └── trainer.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------


# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
#   .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------


## Spear-TTS - Pytorch

Implementation of Spear-TTS - multi-speaker text-to-speech attention network, in Pytorch

The text-to-semantic module built here will be used by SoundStorm for conditioning.

## Appreciation

- Stability for their generous sponsorships to work on and open source cutting edge artificial intelligence research

- Lucas Newman for completing the backtranslation portion, as well as beam search decoding!

- Lucas Newman for completing the final text to semantic transformer training code!

## Install

```bash
$ pip install spear-tts-pytorch
```

## Usage

```python
import torch

from audiolm_pytorch import HubertWithKmeans

from spear_tts_pytorch import (
    TextToSemantic,
    SemanticToTextDatasetGenerator,
    GeneratedAudioTextDataset,
    MockDataset
)

wav2vec = HubertWithKmeans(
    checkpoint_path = './hubert_base_ls960.pt',
    kmeans_path = './hubert_base_ls960_L9_km500.bin'
)

model = TextToSemantic(
    wav2vec = wav2vec,
    dim = 512,
    num_text_token_ids = 256,
    heads = 8,
    target_kv_heads = 2, # grouped query attention, for memory efficient decoding
    source_depth = 1,
    target_depth = 1
)

ds = MockDataset(10)

dataset_generator = SemanticToTextDatasetGenerator(
    model = model,
    dataset = ds,
    folder = './output_folder'
)

dataset_generator(max_length = 2)

generated_dataset = GeneratedAudioTextDataset(
    folder = './output_folder'
)

assert len(generated_dataset) == 10
```

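Each generated example is stored as a single `.pt` tensor file, with a delimiter id marking the boundary between the two token sequences (see `GeneratedAudioTextDataset` in `spear_tts_pytorch/data.py`, which splits on `delimiter_id = -1` by default). A minimal sketch of writing such a file by hand - the token values and the semantic-ids-then-text-ids ordering are illustrative assumptions here, not a documented format:

```python
import torch

# hypothetical example file - values and ordering are assumptions for illustration

semantic_ids = torch.randint(0, 500, (100,)) # e.g. hubert k-means cluster ids
delimiter = torch.tensor([-1])               # matches the default delimiter_id
text_ids = torch.randint(0, 256, (20,))      # text token ids

torch.save(torch.cat((semantic_ids, delimiter, text_ids)), './output_folder/example.pt')
```
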
## Todo

- [x] add eos logic + generate, and hook up end-to-end generation in soundstorm
- [x] add first pretraining speech-to-speech with the reconstruction of 60% deleted tokens
- [x] add dropouts for this project, as low-resource
- [x] add total flexibility of which layers of encoder / decoder to freeze during training
- [x] add step for training on small speech -> text corpus and generating pseudo-labelled dataset + finetuning (thanks to @lucasnewman)
- [x] add final step of finetuning on text -> speech + pseudo-labelled dataset
- [x] figure out the best way to store and manage the pseudo-labelled generated dataset
- [x] batched beam search decoding
- [x] allow for using rotary positions in decoder + flash attention, give Tri another citation
- [x] integrate speculative decoding with some improvisation - done in same model using early exit strategy

- [ ] add cached key / values for starter + single / grouped key values, make sure flash attention can support specialized causal mask before flash attention 2 is in pytorch core
- [ ] polish the audio-text generation workflow
- [ ] concatting the real audio-text dataset with the generated one -> or being able to convert real audio-text dataset to generated

## Citations

```bibtex
@misc{kharitonov2023speak,
    title   = {Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision},
    author  = {Eugene Kharitonov and Damien Vincent and Zalán Borsos and Raphaël Marinier and Sertan Girgin and Olivier Pietquin and Matt Sharifi and Marco Tagliasacchi and Neil Zeghidour},
    year    = {2023},
    eprint  = {2302.03540},
    archivePrefix = {arXiv},
    primaryClass = {cs.SD}
}
```

```bibtex
@inproceedings{dao2022flashattention,
    title     = {Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
    author    = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
    booktitle = {Advances in Neural Information Processing Systems},
    year      = {2022}
}
```

```bibtex
@misc{shi2023enhance,
    title   = {Enhance audio generation controllability through representation similarity regularization},
    author  = {Yangyang Shi and Gael Le Lan and Varun Nagaraja and Zhaoheng Ni and Xinhao Mei and Ernie Chang and Forrest Iandola and Yang Liu and Vikas Chandra},
    year    = {2023},
    eprint  = {2309.08773},
    archivePrefix = {arXiv},
    primaryClass = {cs.SD}
}
```

```bibtex
@article{Ainslie2023GQATG,
    title   = {GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
    author  = {Joshua Ainslie and James Lee-Thorp and Michiel de Jong and Yury Zemlyanskiy and Federico Lebr{\'o}n and Sumit K. Sanghai},
    journal = {ArXiv},
    year    = {2023},
    volume  = {abs/2305.13245},
    url     = {https://api.semanticscholar.org/CorpusID:258833177}
}
```

```bibtex
@inproceedings{Leviathan2022FastIF,
    title     = {Fast Inference from Transformers via Speculative Decoding},
    author    = {Yaniv Leviathan and Matan Kalman and Y. Matias},
    booktitle = {International Conference on Machine Learning},
    year      = {2022},
    url       = {https://api.semanticscholar.org/CorpusID:254096365}
}
```

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
  name = 'spear-tts-pytorch',
  packages = find_packages(exclude=[]),
  version = '0.4.8',
  license = 'MIT',
  description = 'Spear-TTS - Pytorch',
  author = 'Phil Wang',
  author_email = 'lucidrains@gmail.com',
  long_description_content_type = 'text/markdown',
  url = 'https://github.com/lucidrains/spear-tts-pytorch',
  keywords = [
    'artificial intelligence',
    'deep learning',
    'transformers',
    'attention mechanism',
    'text-to-speech'
  ],
  install_requires = [
    'audiolm-pytorch>=1.2.8',
    'beartype',
    'einops>=0.6.1',
    'rotary-embedding-torch>=0.3.0',
    'torch>=1.6',
    'tqdm',
    'x-clip>=0.12.2'
  ],
  classifiers = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

--------------------------------------------------------------------------------
/spear-tts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucidrains/spear-tts-pytorch/0e6a63807f3b64f0e41ddc76fe2676fb93231f0f/spear-tts.png

--------------------------------------------------------------------------------
/spear_tts_pytorch/__init__.py:
--------------------------------------------------------------------------------
from spear_tts_pytorch.spear_tts_pytorch import (
    TextToSemantic,
    SpeechSpeechPretrainWrapper,
    SemanticToTextWrapper,
    TextToSemanticWrapper,
    SemanticToTextDatasetGenerator
)

from spear_tts_pytorch.trainer import (
    SpeechSpeechPretrainer,
    SemanticToTextTrainer,
    TextToSemanticTrainer
)

from spear_tts_pytorch.data import (
    GeneratedAudioTextDataset,
    MockDataset
)
--------------------------------------------------------------------------------
/spear_tts_pytorch/attend.py:
--------------------------------------------------------------------------------
import torch
from torch import nn, einsum
import torch.nn.functional as F

from collections import namedtuple
from functools import wraps
from packaging import version

from einops import rearrange, repeat

# constants

Config = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])

# helpers

def exists(val):
    return val is not None

def once(fn):
    called = False
    @wraps(fn)
    def inner(x):
        nonlocal called
        if called:
            return
        called = True
        return fn(x)
    return inner

print_once = once(print)

# main class

class Attend(nn.Module):
    def __init__(
        self,
        dropout = 0.,
        causal = False,
        flash = False
    ):
        super().__init__()
        self.dropout = dropout
        self.attn_dropout = nn.Dropout(dropout)

        self.causal = causal
        self.register_buffer("mask", None, persistent=False)

        self.flash = flash
        assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'

        # determine efficient attention configs for cuda and cpu

        self.cpu_config = Config(True, True, True)
        self.cuda_config = None

        if not torch.cuda.is_available() or not flash:
            return

        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))

        if device_properties.major == 8 and device_properties.minor == 0:
            print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
            self.cuda_config = Config(True, False, False)
        else:
            print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
            self.cuda_config = Config(False, True, True)

    def get_mask(self, i, j, device):
        n = max(i, j)

        if exists(self.mask) and self.mask.shape[-1] >= n:
            mask = self.mask[:n, :n]
        else:
            mask = torch.ones((n, n), device = device, dtype = torch.bool).triu(1)
            self.register_buffer("mask", mask, persistent = False)

        return mask[-i:, :]

    def flash_attn(self, q, k, v, mask = None):
        _, heads, q_len, _, k_len, causal, is_cuda, device = *q.shape, k.shape[-2], self.causal, q.is_cuda, q.device

        # Check if mask exists and expand to compatible shape
        # The mask is B L, so it would have to be expanded to B H N L

        if exists(mask):
            mask = rearrange(mask, 'b j -> b 1 1 j')
            mask = mask.expand(-1, heads, q_len, -1)

        # Check if there is a compatible device for flash attention

        config = self.cuda_config if is_cuda else self.cpu_config

        # if q and k lengths differ (caching of key/values), and causal, manually construct causal attn mask as float, as not supported (flash attn 2 will support this eventually)

        row_is_entirely_masked = None

        if causal and q_len != k_len:
            causal_mask = self.get_mask(q_len, k_len, device = device)

            if exists(mask):
                mask = mask & ~causal_mask
            else:
                mask = ~causal_mask

            row_is_entirely_masked = ~mask.any(dim = -1)
            mask[..., 0] = mask[..., 0] | row_is_entirely_masked

            causal = False

        # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale

        with torch.backends.cuda.sdp_kernel(**config._asdict()):
            out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask = mask,
                dropout_p = self.dropout if self.training else 0.,
                is_causal = causal
            )

        if exists(row_is_entirely_masked):
            out = out.masked_fill(row_is_entirely_masked[..., None], 0.)

        return out

    def forward(self, q, k, v, mask = None):
        """
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        """

        n, device = q.shape[-2], q.device
        heads, kv_heads = q.shape[1], k.shape[1]

        if kv_heads < heads:
            k, v = map(lambda t: repeat(t, 'b h ... -> b (g h) ...', g = heads // kv_heads), (k, v))

        scale = q.shape[-1] ** -0.5

        if self.flash:
            return self.flash_attn(q, k, v, mask = mask)

        # similarity

        sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale

        # key padding mask

        if exists(mask):
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)

        # causal mask

        if self.causal:
            i, j = sim.shape[-2:]
            causal_mask = self.get_mask(i, j, device)
            sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)

        # attention

        attn = sim.softmax(dim = -1)
        attn = self.attn_dropout(attn)

        # aggregate values

        out = einsum("b h i j, b h j d -> b h i d", attn, v)

        return out

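A quick sanity-check of `Attend` on its own, exercising the grouped-query path (fewer key/value heads than query heads, as `target_kv_heads` configures in the README example). Shapes follow the einstein notation documented in `forward`; the sizes below are arbitrary, for illustration only:

```python
import torch
from spear_tts_pytorch.attend import Attend

attend = Attend(causal = True, flash = False)

q = torch.randn(2, 8, 1024, 64)    # (batch, query heads, seq len, dim head)
k = torch.randn(2, 2, 1024, 64)    # 2 kv heads get repeated internally to match the 8 query heads
v = torch.randn(2, 2, 1024, 64)
mask = torch.ones(2, 1024).bool()  # key padding mask, (batch, seq len)

out = attend(q, k, v, mask = mask) # (2, 8, 1024, 64)
```
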
--------------------------------------------------------------------------------
/spear_tts_pytorch/data.py:
--------------------------------------------------------------------------------
from pathlib import Path

import torch
from torch.utils.data import Dataset

from beartype import beartype

# mock dataset

class MockDataset(Dataset):
    def __init__(self, length: int):
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        return torch.randn(1024)

# generated audio-text dataset

class GeneratedAudioTextDataset(Dataset):
    @beartype
    def __init__(
        self,
        folder: str,
        delimiter_id: int = -1
    ):
        self.folder = Path(folder)
        assert self.folder.exists() and self.folder.is_dir()
        self.paths = list(self.folder.glob('*.pt'))
        self.delimiter_id = delimiter_id

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, ind):
        path = self.paths[ind]
        tensor = torch.load(str(path))

        delimiter_mask = tensor == self.delimiter_id
        assert delimiter_mask.any(), f'delimeter (