├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── continuous_64ch.json
├── discrete_1000steps_64ch.json
├── inference.py
├── loss.py
├── models
    ├── __init__.py
    ├── mels.py
    ├── model.py
    └── noise_schedule.py
├── scripts
    └── test_lj.sh
├── setup.py
├── src
    └── diffwave
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── dataset.py
    │   ├── inference.py
    │   ├── learner.py
    │   ├── model.py
    │   ├── params.py
    │   └── preprocess.py
├── test.py
├── train_continuous.py
├── train_distributed.py
└── utils
    ├── __init__.py
    ├── schema.json
    ├── schema.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *tfevents.*
3 | *.pt
4 | *.pth
5 | generated/*


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "datasets"]
2 | 	path = datasets
3 | 	url = https://github.com/yoyololicon/pytorch-wav-datasets
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Variational DiffWave
 2 | 
 3 | Training DiffWave using the variational method from [Variational Diffusion Models](https://arxiv.org/pdf/2107.00630.pdf).
 4 | 
 5 | ## Quick Start
 6 | 
 7 | ```commandline
 8 | python train_distributed.py discrete_1000steps_64ch.json
 9 | ```
10 | 
11 | ## Results
12 | 
13 | * [Trained checkpoints](https://drive.google.com/drive/folders/1qF3tHToSqMBpaw3plWSEYg4CcUeZx7GG?usp=sharing).
14 | * [Generated samples](https://drive.google.com/drive/folders/119ijy_P1QLps2M5KwzP0NJLfDmAFBInr?usp=sharing).
15 | 
16 | ## TODO 
17 | 
18 | - [x] Continuous-time training.


--------------------------------------------------------------------------------
/continuous_64ch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "DiffWave",
 3 |     "arch": {
 4 |         "type": "DiffWave",
 5 |         "args": {
 6 |             "T": 1000
 7 |         }
 8 |     },
 9 |     "dataset": {
10 |         "type": "RandomWAVDataset",
11 |         "args": {
12 |             "data_dir": "~/data-disk/Datasets/LJ/LJSpeech-1.1/wavs/",
13 |             "size": 12000000,
14 |             "segment": 16000,
15 |             "deterministic": false
16 |         }
17 |     },
18 |     "optimizer": {
19 |         "type": "Adam",
20 |         "args": {
21 |             "lr": 2e-4
22 |         }
23 |     },
24 |     "lr_scheduler": {
25 |         "type": "ReduceLROnPlateau",
26 |         "args": {
27 |             "factor": 0.3,
28 |             "patience": 1000000,
29 |             "verbose": true
30 |         }
31 |     },
32 |     "data_loader": {
33 |         "batch_size": 12,
34 |         "shuffle": false,
35 |         "drop_last": false,
36 |         "num_workers": 4,
37 |         "prefetch_factor": 4,
38 |         "pin_memory": true
39 |     },
40 |     "trainer": {
41 |         "save_dir": "saved/",
42 |         "log_dir": "runs/",
43 |         "eval_file": "~/data-disk/Datasets/LJ/LJSpeech-1.1/wavs/LJ001-0001.wav",
44 |         "cum_steps": 1,
45 |         "n_fft": 1024,
46 |         "hop_length": 256,
47 |         "n_mels": 80,
48 |         "sr": 22050,
49 |         "eval_interval": 10000,
50 |         "train_T": 0,
51 |         "eval_T": 50,
52 |         "extra_monitor": [
53 |             "kld",
54 |             "ll",
55 |             "loss_T"
56 |         ],
57 |         "with_amp": true,
58 |         "max_log_snr": 23,
59 |         "min_log_snr": -3.6
60 |     }
61 | }


--------------------------------------------------------------------------------
/discrete_1000steps_64ch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "DiffWave",
 3 |     "arch": {
 4 |         "type": "DiffWave",
 5 |         "args": {
 6 |             "T": 1000
 7 |         }
 8 |     },
 9 |     "dataset": {
10 |         "type": "RandomWAVDataset",
11 |         "args": {
12 |             "data_dir": "~/data-disk/Datasets/LJ/LJSpeech-1.1/wavs/",
13 |             "size": 16000000,
14 |             "segment": 16000,
15 |             "deterministic": false
16 |         }
17 |     },
18 |     "optimizer": {
19 |         "type": "Adam",
20 |         "args": {
21 |             "lr": 2e-4
22 |         }
23 |     },
24 |     "lr_scheduler": {
25 |         "type": "ReduceLROnPlateau",
26 |         "args": {
27 |             "factor": 0.3,
28 |             "patience": 1000000,
29 |             "verbose": true
30 |         }
31 |     },
32 |     "data_loader": {
33 |         "batch_size": 16,
34 |         "shuffle": false,
35 |         "drop_last": false,
36 |         "num_workers": 4,
37 |         "prefetch_factor": 4,
38 |         "pin_memory": true
39 |     },
40 |     "trainer": {
41 |         "save_dir": "saved/",
42 |         "log_dir": "runs/",
43 |         "eval_file": "~/data-disk/Datasets/LJ/LJSpeech-1.1/wavs/LJ001-0001.wav",
44 |         "cum_steps": 1,
45 |         "n_fft": 1024,
46 |         "hop_length": 256,
47 |         "n_mels": 80,
48 |         "sr": 22050,
49 |         "eval_interval": 10000,
50 |         "train_T": 1000,
51 |         "eval_T": 50,
52 |         "extra_monitor": [
53 |             "kld",
54 |             "ll",
55 |             "loss_T"
56 |         ],
57 |         "minimize_var": false,
58 |         "with_amp": true
59 |     }
60 | }


--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from torch.cuda import amp
 4 | from tqdm import tqdm
 5 | from utils.utils import gamma2snr, snr2as, gamma2as, gamma2logas
 6 | 
 7 | 
 8 | def reverse_process_new(z_1, mels, gamma, steps, model, with_amp=False):
 9 |     log_alpha, log_var = gamma2logas(gamma)
10 |     var = log_var.exp()
11 |     alpha_st = torch.exp(log_alpha[:-1] - log_alpha[1:])
12 |     c = -torch.expm1(gamma[:-1] - gamma[1:])
13 |     c.relu_()
14 | 
15 |     T = gamma.numel() - 1
16 |     z_t = z_1
17 |     for t in tqdm(range(T, 0, -1)):
18 |         s = t - 1
19 |         with amp.autocast(enabled=with_amp):
20 |             noise_hat = model(z_t, mels, steps[t:t+1])
21 |         noise_hat = noise_hat.float()
22 |         mu = (z_t - var[t].sqrt() * c[s] * noise_hat) * alpha_st[s]
23 |         z_t = mu
24 |         if s:
25 |             z_t += (var[s] * c[s]).sqrt() * torch.randn_like(z_t)
26 | 
27 |     return z_t
28 | 
29 | 
def reverse_process_ddim(z_1, mels, gamma, steps, model, with_amp=False):
    """Run the deterministic (no noise re-injection) reverse loop.

    Args:
        z_1: starting latent. It is copied first, so the caller's tensor is
            left untouched by the in-place updates below.
        mels: conditioning input, forwarded unchanged to ``model``.
        gamma: 1-D tensor indexed ``0..T`` where ``T = gamma.numel() - 1``.
            NOTE(review): presumably the log noise-to-signal ratio expected
            by ``gamma2logas`` -- confirm against ``utils.utils``.
        steps: indexable tensor; ``steps[t:t + 1]`` is given to ``model``
            at step ``t``.
        model: callable ``model(z, mels, step)`` returning a noise estimate.
        with_amp: when true, the model call runs under CUDA autocast.

    Returns:
        The latent after the final update (``t = 1``).
    """
    # 1 - exp((gamma_t - gamma_s) / 2) for every adjacent pair (s = t - 1).
    Pm1 = -torch.expm1((gamma[1:] - gamma[:-1]) * 0.5)
    log_alpha, log_var = gamma2logas(gamma)
    alpha_st = torch.exp(log_alpha[:-1] - log_alpha[1:])
    std = log_var.mul(0.5).exp()

    T = gamma.numel() - 1
    # Clone: the loop mutates z_t in place, and aliasing z_1 would silently
    # overwrite the caller's starting noise (reverse_process_new does not).
    z_t = z_1.clone()
    for t in tqdm(range(T, 0, -1)):
        s = t - 1
        with amp.autocast(enabled=with_amp):
            noise_hat = model(z_t, mels, steps[t:t+1])
        # Leave half precision before doing the update arithmetic.
        noise_hat = noise_hat.float()
        z_t.mul_(alpha_st[s]).add_(std[s] * Pm1[s] * noise_hat)

    return z_t
46 | 


--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | import math
 4 | 
 5 | from utils.utils import gamma2as, gamma2logas
 6 | 
 7 | 
 8 | def diffusion_elbo(gamma_0, gamma_1, d_gamma_t,
 9 |                    x, noise, noise_hat):
10 |     log_alpha_0, log_var_0 = gamma2logas(gamma_0)
11 |     log_alpha_1, log_var_1 = gamma2logas(gamma_1)
12 | 
13 |     # prior loss KL(q(z_1|x) || p(z_1)))
14 |     # mu = alpha_1 * x
15 |     x_flat = x.view(-1)
16 |     x_dot = x_flat @ x_flat / x_flat.numel()
17 |     prior_loss = 0.5 * (log_var_1.exp() + x_dot *
18 |                         torch.exp(log_alpha_1 * 2) - 1 - log_var_1)
19 |     #torch.mean(var_1 + mu * mu - 1 - var_1.log())
20 | 
21 |     # recon loss E[-log p(x | z_0)]
22 |     # diff = (1 - alpha_0) * x
23 |     l2 = x_dot * torch.expm1(log_alpha_0) ** 2
24 |     ll = -0.5 * (log_var_0 + l2 / log_var_0.exp() + math.log(2 * math.pi))
25 |     recon_loss = -ll
26 | 
27 |     extra_dict = {
28 |         'kld': prior_loss.item(),
29 |         'll': ll.item()
30 |     }
31 |     # diffusion loss
32 |     diff = noise - noise_hat
33 |     loss_T_raw = 0.5 * (d_gamma_t * (diff * diff).mean(1)
34 |                         ) / d_gamma_t.shape[0]
35 |     loss_T = loss_T_raw.sum()
36 |     extra_dict['loss_T_raw'] = loss_T_raw.detach()
37 |     extra_dict['loss_T'] = loss_T.item()
38 | 
39 |     loss = prior_loss + recon_loss + loss_T
40 |     elbo = -loss
41 |     extra_dict['elbo'] = elbo.item()
42 |     return loss, extra_dict
43 | 


--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .mels import MelSpec
2 | from .model import DiffWave
3 | from .noise_schedule import NoiseScheduler, CosineScheduler
4 | 


--------------------------------------------------------------------------------
/models/mels.py:
--------------------------------------------------------------------------------
 1 | from torchaudio import transforms
 2 | from torch import Tensor
 3 | from typing import Optional, Callable
 4 | import torch.nn.functional as F
 5 | 
 6 | 
 7 | class MelSpec(transforms.MelSpectrogram):
 8 |     def __init__(self, sample_rate: int, n_fft: int, hop_length: int, f_min: float, f_max: float, n_mels: int) -> None:
 9 |         super().__init__(sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, f_min=f_min, f_max=f_max, n_mels=n_mels,
10 |                          power=1., normalized=True)
11 | 
12 |     def forward(self, waveform: Tensor) -> Tensor:
13 |         waveform = F.pad(waveform, [0, self.n_fft // 2])
14 |         mels = super().forward(waveform)
15 |         mels.clamp_min_(
16 |             1e-5).log10_().mul_(20).add_(80).mul_(0.01).clamp_(0, 1)
17 |         return mels
18 | 


--------------------------------------------------------------------------------
/models/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import numpy as np
 17 | import torch
 18 | import torch.nn as nn
 19 | import torch.nn.functional as F
 20 | 
 21 | from math import sqrt
 22 | 
 23 | 
 24 | @torch.jit.script
 25 | def silu(x):
 26 |     return x * torch.sigmoid(x)
 27 | 
 28 | 
 29 | @torch.jit.script
 30 | def gru(x: torch.Tensor):
 31 |     a, b = x.chunk(2, 1)
 32 |     return a.tanh() * b.sigmoid()
 33 | 
 34 | 
 35 | class DiffusionEmbedding(nn.Module):
 36 |     def __init__(self, max_steps):
 37 |         super().__init__()
 38 |         self.register_buffer('embedding', self._build_embedding(
 39 |             max_steps + 1), persistent=False)
 40 |         self.projection1 = nn.Linear(128, 512)
 41 |         self.projection2 = nn.Linear(512, 512)
 42 | 
 43 |     def forward(self, diffusion_step):
 44 |         if diffusion_step.dtype in [torch.int32, torch.int64]:
 45 |             x = self.embedding[diffusion_step]
 46 |         else:
 47 |             x = self._lerp_embedding(diffusion_step)
 48 |         x = self.projection1(x)
 49 |         x = silu(x)
 50 |         x = self.projection2(x)
 51 |         x = silu(x)
 52 |         return x
 53 | 
 54 |     def _lerp_embedding(self, t):
 55 |         t = t * (self.embedding.shape[0] - 1)
 56 |         low_idx = torch.floor(t).long()
 57 |         high_idx = torch.ceil(t).long()
 58 |         low = self.embedding[low_idx]
 59 |         high = self.embedding[high_idx]
 60 |         return low + (high - low) * (t - low_idx)[:, None]
 61 | 
 62 |     def _build_embedding(self, max_steps):
 63 |         steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
 64 |         dims = torch.arange(64).unsqueeze(0)          # [1,64]
 65 |         table = steps * 10.0**(dims * 4.0 / 63.0)     # [T,64]
 66 |         # table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
 67 |         table = torch.view_as_real(torch.exp(1j * table)).view(max_steps, -1)
 68 |         return table
 69 | 
 70 | 
 71 | class SpectrogramUpsampler(nn.Module):
 72 |     def __init__(self):
 73 |         super().__init__()
 74 |         self.convs = nn.Sequential(
 75 |             nn.ConvTranspose2d(1, 1, [3, 33], stride=[
 76 |                 1, 16], padding=[1, 16]),
 77 |             nn.LeakyReLU(0.4, inplace=True),
 78 |             nn.ConvTranspose2d(1, 1,  [3, 33], stride=[
 79 |                 1, 16], padding=[1, 16]),
 80 |             nn.LeakyReLU(0.4, inplace=True)
 81 |         )
 82 |         self.hop_size = 256
 83 | 
 84 |     def forward(self, x):
 85 |         return self.convs(x.unsqueeze(1)).squeeze(1)
 86 | 
 87 | 
 88 | class ResidualBlock(nn.Module):
 89 |     def __init__(self, residual_channels, dilation, last_layer=False):
 90 |         super().__init__()
 91 |         self.diffusion_projection = nn.Linear(512, residual_channels)
 92 |         self.dilated_conv = nn.Conv1d(
 93 |             residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
 94 | 
 95 |         self.chs_split = [residual_channels]
 96 |         if last_layer:
 97 |             self.output_projection = nn.Conv1d(
 98 |                 residual_channels, residual_channels, 1)
 99 |         else:
100 |             self.chs_split.append(residual_channels)
101 |             self.output_projection = nn.Conv1d(
102 |                 residual_channels, residual_channels * 2, 1)
103 | 
104 |     def forward(self, x, conditioner, diffusion_step):
105 |         diffusion_step = self.diffusion_projection(
106 |             diffusion_step).unsqueeze(-1)
107 |         y = self.dilated_conv(x + diffusion_step) + conditioner
108 |         y = gru(y)
109 |         *residual, skip = self.output_projection(y).split(self.chs_split, 1)
110 |         return (x + residual[0]) / sqrt(2.0) if len(residual) else None, skip
111 | 
112 | 
class DiffWave(nn.Module):
    """Mel-conditioned noise-prediction network built from dilated residual blocks.

    Args:
        res_channels: channel width of the residual stack.
        T: largest diffusion step index handled by the step embedding.
        n_mels: number of mel bins of the conditioning spectrogram.
        layers: number of residual blocks.
        cycle_length: dilations follow ``2 ** (i % cycle_length)``.
    """

    def __init__(self,
                 res_channels: int = 64,
                 T: int = 50,
                 n_mels: int = 80,
                 layers: int = 30,
                 cycle_length: int = 10
                 ):
        super().__init__()

        self.input_projection = nn.Sequential(
            nn.Conv1d(1, res_channels, 1),
            nn.ReLU(inplace=True),
        )
        self.diffusion_embedding = DiffusionEmbedding(T)
        self.spectrogram_upsampler = SpectrogramUpsampler()

        self.residual_layers = nn.ModuleList([
            ResidualBlock(res_channels, 2 ** (index % cycle_length))
            for index in range(layers)
        ])

        self.output_projection = nn.Sequential(
            nn.Conv1d(res_channels, res_channels, 1),
            nn.ReLU(inplace=True),
            nn.Conv1d(res_channels, 1, 1),
        )
        # nn.init.zeros_(self.output_projection[2].weight)
        # nn.init.zeros_(self.output_projection[2].bias)

        # A single bias-free 1x1 convolution yields the conditioning for
        # every block at once; forward() cuts it into per-layer chunks.
        self.conditioner = nn.Conv1d(
            n_mels, res_channels * 2 * layers, 1, bias=False)

    def forward(self, audio, spectrogram, diffusion_step):
        """Predict the noise in ``audio`` given the spectrogram and step.

        ``audio`` gets a channel axis inserted at dim 1, and the returned
        tensor has that axis squeezed out again.
        """
        num_layers = len(self.residual_layers)
        x = self.input_projection(audio.unsqueeze(1))

        step_embedding = self.diffusion_embedding(diffusion_step)
        # Trim the upsampled spectrogram to the audio length.
        upsampled = self.spectrogram_upsampler(spectrogram)
        upsampled = upsampled[..., :x.shape[-1]]

        per_layer_condition = self.conditioner(upsampled).chunk(num_layers, 1)
        skip_total = 0
        for block, cond in zip(self.residual_layers, per_layer_condition):
            x, skip_connection = block(x, cond, step_embedding)
            skip_total += skip_connection

        # Scale the summed skips by 1 / sqrt(number of layers).
        x = skip_total / sqrt(num_layers)
        return self.output_projection(x).squeeze(1)
164 | 
165 | 
if __name__ == "__main__":
    # Smoke test: print a torchinfo layer summary for a default-sized model.
    from torchinfo import summary

    model = DiffWave()
    audio = torch.rand(1, 16000)  # .cuda()
    spectrogram = torch.randn(1, 80, 64)
    step = torch.randint(low=0, high=50, size=(1,))
    columns = ("input_size", "output_size", "num_params", "kernel_size",
               "mult_adds")
    summary(model, input_data=(audio, spectrogram, step), device='cpu',
            col_names=columns, col_width=16)
176 | 


--------------------------------------------------------------------------------
/models/noise_schedule.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | import torch.nn.utils.parametrize as parametrize
 4 | 
 5 | 
 6 | class Nonnegative(nn.Module):
 7 |     def forward(self, X):
 8 |         return X.abs()
 9 | 
10 | 
class NoiseScheduler(nn.Module):
    """Learnable noise schedule built from linear layers with non-negative weights.

    The raw network output is rescaled so that ``t = 0`` maps to the
    learnable ``gamma0`` and ``t = 1`` to the learnable ``gamma1``.
    """

    def __init__(self):
        super().__init__()

        self.l1 = self._nonnegative_linear(1, 1, bias=True)
        self.l2 = self._nonnegative_linear(1, 1024, bias=True)
        self.l3 = self._nonnegative_linear(1024, 1, bias=False)

        self.gamma1 = nn.Parameter(torch.zeros(1))
        self.gamma0 = nn.Parameter(torch.full((1,), -10.0))
        # Endpoints evaluated alongside every query to normalise the output.
        self.register_buffer('t01', torch.tensor([0., 1.]))

    @staticmethod
    def _nonnegative_linear(in_features, out_features, bias):
        """Create a linear layer whose weight is passed through ``Nonnegative``."""
        layer = nn.Linear(in_features, out_features, bias=bias)
        return parametrize.register_parametrization(
            layer, 'weight', Nonnegative())

    def gamma_hat(self, t: torch.Tensor):
        """Unnormalised schedule: a linear term plus a sigmoid-MLP term."""
        hidden = self.l1(t)
        return hidden + self.l3(torch.sigmoid(self.l2(hidden)))

    def forward(self, t: torch.Tensor):
        """Return ``(gamma, normalized_gamma_hat)`` for a 1-D tensor of times.

        ``t`` is clamped to ``[0, 1]`` before evaluation.
        """
        t = t.clamp(0, 1)
        # Evaluate the two endpoints and the queries in a single pass.
        queries = torch.cat([self.t01, t], dim=0).unsqueeze(-1)
        raw = self.gamma_hat(queries).squeeze(1)
        lowest, highest, values = raw.split([1, 1, t.numel()], dim=0)

        normalized_gamma_hat = (values - lowest) / (highest - lowest)
        gamma = self.gamma0 + (self.gamma1 - self.gamma0) * normalized_gamma_hat

        return gamma, normalized_gamma_hat
41 | 
42 | 
class CosineScheduler(nn.Module):
    """Fixed (non-learnable) schedule between two ``gamma`` endpoints.

    ``forward`` maps ``t`` in ``[0, 1]`` to ``r = -log(cot^2(pi * t' / 2))``,
    where ``t'`` is ``t`` rescaled so that ``t = 0`` yields ``gamma0`` and
    ``t = 1`` yields ``gamma1``.

    Args:
        gamma0: Value returned at ``t = 0``.
        gamma1: Value returned at ``t = 1``.
    """

    def __init__(self, gamma0: float = -23, gamma1: float = 3.6):
        super().__init__()

        self.register_buffer('gamma0', torch.tensor(gamma0))
        self.register_buffer('gamma1', torch.tensor(gamma1))

    def schedule(self, t: torch.Tensor):
        """Return ``cot(pi * t / 2) ** 2``."""
        # Use the out-of-place reciprocal: tan() keeps its output for the
        # backward pass, so modifying that output in place makes any
        # backward through ``t`` raise a RuntimeError.
        return torch.tan(torch.pi * t * 0.5).reciprocal().pow(2)

    def gamma2t(self, gamma: torch.Tensor):
        """Return the time at which ``-log(schedule(t))`` equals ``gamma``."""
        return 2 * torch.atan(gamma.mul(0.5).exp()) / torch.pi

    def forward(self, t: torch.Tensor):
        """Return ``(r, normalized_r)`` for times ``t``.

        ``t`` is clamped to ``[0, 1]``. ``normalized_r`` is ``r`` rescaled
        linearly between ``gamma0`` and ``gamma1`` and clipped to ``[0, 1]``.
        """
        min_t = self.gamma2t(self.gamma0)
        max_t = self.gamma2t(self.gamma1)
        t = min_t + (max_t - min_t) * t.clamp(0, 1)

        snr = self.schedule(t)
        r = -snr.log()
        normalized = (r - self.gamma0) / (self.gamma1 - self.gamma0)
        return r, normalized.clamp(0, 1)
64 | 
65 | 
if __name__ == '__main__':
    # Manual check: evaluate the learnable schedule on ten evenly spaced
    # times and print d(sum gamma)/dt.
    from torch.autograd import grad
    gamma = NoiseScheduler()

    # Create the leaf tensor directly instead of copy-constructing a tensor
    # from another tensor via torch.tensor(...).
    t = (torch.arange(10) / 9).requires_grad_(True)
    # The scheduler returns (gamma, normalized_gamma_hat); unpack the pair so
    # the gradient is taken of a tensor rather than of a tuple.
    g, normalized = gamma(t)
    print(t, g, normalized)

    print(grad(g.sum(), t))
77 | 


--------------------------------------------------------------------------------
/scripts/test_lj.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | FILES=~/data-disk/Datasets/LJ/LJSpeech-1.1/wavs/LJ001-00[0-1]*.wav
 3 | CFG=$1
 4 | CKPT=$2
 5 | T=$3
 6 | echo "$1 $2 $3"
 7 | for f in $FILES
 8 | do
 9 |     base="$(basename -- $f)"
10 |     out=generated/$base
11 |     echo "$out, $f"
12 |     python test.py $1 $2 $f $out -T $3 --amp --ddim
13 | done


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | from setuptools import find_packages, setup
17 | 
18 | 
# Package metadata passed to setuptools.setup() below.
VERSION = '0.1.7'
DESCRIPTION = 'diffwave'
AUTHOR = 'LMNT, Inc.'
AUTHOR_EMAIL = 'github@lmnt.com'
URL = 'https://www.lmnt.com'
LICENSE = 'Apache 2.0'
KEYWORDS = ['diffwave machine learning neural vocoder tts speech']
CLASSIFIERS = [
  'Development Status :: 4 - Beta',
  'Intended Audience :: Developers',
  'Intended Audience :: Education',
  'Intended Audience :: Science/Research',
  'License :: OSI Approved :: Apache Software License',
  'Programming Language :: Python :: 3.5',
  'Programming Language :: Python :: 3.6',
  'Programming Language :: Python :: 3.7',
  'Programming Language :: Python :: 3.8',
  'Topic :: Scientific/Engineering :: Mathematics',
  'Topic :: Software Development :: Libraries :: Python Modules',
  'Topic :: Software Development :: Libraries',
]

# Read the README inside a context manager so the file handle is closed, and
# decode it as UTF-8 so the result does not depend on the platform locale.
with open('README.md', 'r', encoding='utf-8') as readme_file:
    LONG_DESCRIPTION = readme_file.read()


setup(name = 'diffwave',
    version = VERSION,
    description = DESCRIPTION,
    long_description = LONG_DESCRIPTION,
    long_description_content_type = 'text/markdown',
    author = AUTHOR,
    author_email = AUTHOR_EMAIL,
    url = URL,
    license = LICENSE,
    keywords = KEYWORDS,
    packages = find_packages('src'),
    package_dir = { '': 'src' },
    install_requires = [
        'numpy',
        'torch>=1.6',
        'torchaudio>=0.6.0',
        'tqdm'
    ],
    classifiers = CLASSIFIERS)
61 | 


--------------------------------------------------------------------------------
/src/diffwave/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yoyolicoris/variational-diffwave/b3edc2f1c3dc13fb72c068fb1dd0b24a2b9b423a/src/diffwave/__init__.py


--------------------------------------------------------------------------------
/src/diffwave/__main__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | from argparse import ArgumentParser
17 | from torch.cuda import device_count
18 | from torch.multiprocessing import spawn
19 | 
20 | from diffwave.learner import train, train_distributed
21 | from diffwave.params import params
22 | 
23 | 
24 | def _get_free_port():
25 |   import socketserver
26 |   with socketserver.TCPServer(('localhost', 0), None) as s:
27 |     return s.server_address[1]
28 | 
29 | 
def main(args):
  """Start training, spawning one process per device when several are visible.

  With more than one CUDA device, ``params.batch_size`` must be divisible by
  the device count and is divided by it before the workers are spawned.
  Otherwise training runs in the current process.
  """
  replica_count = device_count()
  if replica_count <= 1:
    train(args, params)
    return

  if params.batch_size % replica_count != 0:
    raise ValueError(f'Batch size {params.batch_size} is not evenly divisble by # GPUs {replica_count}.')
  # Each replica gets an equal share of the configured batch size.
  params.batch_size = params.batch_size // replica_count
  free_port = _get_free_port()
  spawn(
      train_distributed,
      args=(replica_count, free_port, args, params),
      nprocs=replica_count,
      join=True)
40 | 
41 | 
if __name__ == '__main__':
  # Command-line entry point: parse the training options and hand them to main().
  cli = ArgumentParser(description='train (or resume training) a DiffWave model')
  cli.add_argument(
      'model_dir',
      help='directory in which to store model checkpoints and training logs')
  cli.add_argument(
      'data_dirs',
      nargs='+',
      help='space separated list of directories from which to read .wav files for training')
  cli.add_argument(
      '--max_steps',
      default=None,
      type=int,
      help='maximum number of training steps')
  cli.add_argument(
      '--fp16',
      action='store_true',
      default=False,
      help='use 16-bit floating point operations for training')
  main(cli.parse_args())
53 | 


--------------------------------------------------------------------------------
/src/diffwave/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import numpy as np
17 | import os
18 | import random
19 | import torch
20 | import torchaudio
21 | 
22 | from glob import glob
23 | from torch.utils.data.distributed import DistributedSampler
24 | 
25 | 
class NumpyDataset(torch.utils.data.Dataset):
  """Dataset of .wav files paired with ``<wav path>.spec.npy`` arrays."""

  def __init__(self, paths):
    super().__init__()
    # Gather every .wav file found recursively under each given directory.
    self.filenames = []
    for root in paths:
      self.filenames.extend(glob(f'{root}/**/*.wav', recursive=True))

  def __len__(self):
    return len(self.filenames)

  def __getitem__(self, idx):
    """Return a dict with the first audio channel and the transposed spectrogram."""
    wav_path = self.filenames[idx]
    waveform, _ = torchaudio.load_wav(wav_path)
    features = np.load(f'{wav_path}.spec.npy')
    return {
        'audio': waveform[0] / 32767.5,
        'spectrogram': features.T
    }
45 | 
46 | 
class Collator:
  """Turns a list of dataset records into one randomly cropped batch."""

  def __init__(self, params):
    self.params = params

  def collate(self, minibatch):
    """Crop each record to ``crop_mel_frames`` frames and stack the results.

    Records with fewer frames than ``crop_mel_frames`` are emptied in place
    and left out of the batch. Audio is cropped to the matching sample range
    (``hop_samples`` samples per frame) and zero-padded at the end if short.
    """
    hop = self.params.hop_samples
    n_frames = self.params.crop_mel_frames
    for item in minibatch:
      mel = item['spectrogram']
      if len(mel) < n_frames:
        # Too short to crop: remove both entries so it is skipped below.
        del item['spectrogram']
        del item['audio']
        continue

      first_frame = random.randint(0, mel.shape[0] - n_frames)
      last_frame = first_frame + n_frames
      item['spectrogram'] = mel[first_frame:last_frame].T

      first_sample, last_sample = first_frame * hop, last_frame * hop
      clip = item['audio'][first_sample:last_sample]
      item['audio'] = clip
      shortfall = (last_sample - first_sample) - len(clip)
      item['audio'] = np.pad(clip, (0, shortfall), mode='constant')

    audio_batch = np.stack([item['audio'] for item in minibatch if 'audio' in item])
    mel_batch = np.stack([item['spectrogram'] for item in minibatch if 'spectrogram' in item])
    return {
        'audio': torch.from_numpy(audio_batch),
        'spectrogram': torch.from_numpy(mel_batch),
    }
75 | 
76 | 
def from_path(data_dirs, params, is_distributed=False):
  """Build the training DataLoader over the .wav files under ``data_dirs``.

  When ``is_distributed`` is true a ``DistributedSampler`` is used and
  loader-level shuffling is turned off; otherwise the loader shuffles.
  """
  dataset = NumpyDataset(data_dirs)
  sampler = DistributedSampler(dataset) if is_distributed else None
  loader_options = dict(
      batch_size=params.batch_size,
      collate_fn=Collator(params).collate,
      shuffle=not is_distributed,
      num_workers=os.cpu_count(),
      sampler=sampler,
      pin_memory=True,
      drop_last=True)
  return torch.utils.data.DataLoader(dataset, **loader_options)
88 | 


--------------------------------------------------------------------------------
/src/diffwave/inference.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import numpy as np
 17 | import os
 18 | import torch
 19 | import torchaudio
 20 | 
 21 | from argparse import ArgumentParser
 22 | 
 23 | from diffwave.params import AttrDict, params as base_params
 24 | from diffwave.model import DiffWave
 25 | 
 26 | 
 27 | models = {}
 28 | 
 29 | def predict(spectrogram, model_dir=None, params=None, device=torch.device('cuda'), fast_sampling=False):
 30 |   # Lazy load model.
 31 |   if not model_dir in models:
 32 |     if os.path.exists(f'{model_dir}/weights.pt'):
 33 |       checkpoint = torch.load(f'{model_dir}/weights.pt')
 34 |     else:
 35 |       checkpoint = torch.load(model_dir)
 36 |     model = DiffWave(AttrDict(base_params)).to(device)
 37 |     model.load_state_dict(checkpoint['model'])
 38 |     model.eval()
 39 |     models[model_dir] = model
 40 | 
 41 |   model = models[model_dir]
 42 |   model.params.override(params)
 43 |   with torch.no_grad():
 44 |     # Change in notation from the DiffWave paper for fast sampling.
 45 |     # DiffWave paper -> Implementation below
 46 |     # --------------------------------------
 47 |     # alpha -> talpha
 48 |     # beta -> training_noise_schedule
 49 |     # gamma -> alpha
 50 |     # eta -> beta
 51 |     training_noise_schedule = np.array(model.params.noise_schedule)
 52 |     inference_noise_schedule = np.array(model.params.inference_noise_schedule) if fast_sampling else training_noise_schedule
 53 | 
 54 |     talpha = 1 - training_noise_schedule
 55 |     talpha_cum = np.cumprod(talpha)
 56 | 
 57 |     beta = inference_noise_schedule
 58 |     alpha = 1 - beta
 59 |     alpha_cum = np.cumprod(alpha)
 60 | 
 61 |     T = []
 62 |     for s in range(len(inference_noise_schedule)):
 63 |       for t in range(len(training_noise_schedule) - 1):
 64 |         if talpha_cum[t+1] <= alpha_cum[s] <= talpha_cum[t]:
 65 |           twiddle = (talpha_cum[t]**0.5 - alpha_cum[s]**0.5) / (talpha_cum[t]**0.5 - talpha_cum[t+1]**0.5)
 66 |           T.append(t + twiddle)
 67 |           break
 68 |     T = np.array(T, dtype=np.float32)
 69 | 
 70 |     # Expand rank 2 tensors by adding a batch dimension.
 71 |     if len(spectrogram.shape) == 2:
 72 |       spectrogram = spectrogram.unsqueeze(0)
 73 |     spectrogram = spectrogram.to(device)
 74 | 
 75 |     audio = torch.randn(spectrogram.shape[0], model.params.hop_samples * spectrogram.shape[-1], device=device)
 76 |     noise_scale = torch.from_numpy(alpha_cum**0.5).float().unsqueeze(1).to(device)
 77 | 
 78 |     for n in range(len(alpha) - 1, -1, -1):
 79 |       c1 = 1 / alpha[n]**0.5
 80 |       c2 = beta[n] / (1 - alpha_cum[n])**0.5
 81 |       audio = c1 * (audio - c2 * model(audio, spectrogram, torch.tensor([T[n]], device=audio.device)).squeeze(1))
 82 |       if n > 0:
 83 |         noise = torch.randn_like(audio)
 84 |         sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
 85 |         audio += sigma * noise
 86 |       audio = torch.clamp(audio, -1.0, 1.0)
 87 |   return audio, model.params.sample_rate
 88 | 
 89 | 
 90 | def main(args):
 91 |   spectrogram = torch.from_numpy(np.load(args.spectrogram_path))
 92 |   audio, sr = predict(spectrogram, model_dir=args.model_dir, fast_sampling=args.fast)
 93 |   torchaudio.save(args.output, audio.cpu(), sample_rate=sr)
 94 | 
 95 | 
 96 | if __name__ == '__main__':
 97 |   parser = ArgumentParser(description='runs inference on a spectrogram file generated by diffwave.preprocess')
 98 |   parser.add_argument('model_dir',
 99 |       help='directory containing a trained model (or full path to weights.pt file)')
100 |   parser.add_argument('spectrogram_path',
101 |       help='path to a spectrogram file generated by diffwave.preprocess')
102 |   parser.add_argument('--output', '-o', default='output.wav',
103 |       help='output file name')
104 |   parser.add_argument('--fast', '-f', action='store_true',
105 |       help='fast sampling procedure')
106 |   main(parser.parse_args())
107 | 


--------------------------------------------------------------------------------
/src/diffwave/learner.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import numpy as np
 17 | import os
 18 | import torch
 19 | import torch.nn as nn
 20 | 
 21 | from torch.nn.parallel import DistributedDataParallel
 22 | from torch.utils.tensorboard import SummaryWriter
 23 | from tqdm import tqdm
 24 | 
 25 | from diffwave.dataset import from_path as dataset_from_path
 26 | from diffwave.model import DiffWave
 27 | from diffwave.params import AttrDict
 28 | 
 29 | 
 30 | def _nested_map(struct, map_fn):
 31 |     if isinstance(struct, tuple):
 32 |         return tuple(_nested_map(x, map_fn) for x in struct)
 33 |     if isinstance(struct, list):
 34 |         return [_nested_map(x, map_fn) for x in struct]
 35 |     if isinstance(struct, dict):
 36 |         return {k: _nested_map(v, map_fn) for k, v in struct.items()}
 37 |     return map_fn(struct)
 38 | 
 39 | 
class DiffWaveLearner:
    """Training driver for a DiffWave model.

    Holds the model, data loader, optimizer and mixed-precision helpers, and
    provides the training loop, checkpoint save/restore and TensorBoard
    logging.
    """

    def __init__(self, model_dir, model, dataset, optimizer, params, *args, **kwargs):
        """Store the training components and precompute the noise levels.

        Args:
            model_dir: Directory for checkpoints and summaries; created if it
                does not exist.
            model: Network to train; may be a wrapper exposing ``.module``.
            dataset: Iterable of feature batches that supports ``len()``.
            optimizer: Optimizer stepped through the gradient scaler.
            params: Hyperparameters; this class reads ``noise_schedule``,
                ``max_grad_norm`` and ``sample_rate``.
            *args: Ignored.
            **kwargs: ``fp16`` (default False) enables autocast and gradient
                scaling.
        """
        os.makedirs(model_dir, exist_ok=True)
        self.model_dir = model_dir
        self.model = model
        self.dataset = dataset
        self.optimizer = optimizer
        self.params = params
        self.autocast = torch.cuda.amp.autocast(
            enabled=kwargs.get('fp16', False))
        self.scaler = torch.cuda.amp.GradScaler(
            enabled=kwargs.get('fp16', False))
        self.step = 0
        self.is_master = True

        # noise_level[t] is the cumulative product of (1 - beta) up to step t.
        beta = np.array(self.params.noise_schedule)
        noise_level = np.cumprod(1 - beta)
        self.noise_level = torch.tensor(noise_level.astype(np.float32))
        self.loss_fn = nn.L1Loss()
        # Created lazily on the first call to _write_summary.
        self.summary_writer = None

    def state_dict(self):
        """Return a checkpoint dict with step, model, optimizer, params and scaler state."""
        # Unwrap a wrapper that exposes the real network as ``.module``.
        if hasattr(self.model, 'module') and isinstance(self.model.module, nn.Module):
            model_state = self.model.module.state_dict()
        else:
            model_state = self.model.state_dict()
        # Only top-level tensor values are moved to the CPU; other values
        # (including nested containers) are stored as returned.
        return {
            'step': self.step,
            'model': {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in model_state.items()},
            'optimizer': {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in self.optimizer.state_dict().items()},
            'params': dict(self.params),
            'scaler': self.scaler.state_dict(),
        }

    def load_state_dict(self, state_dict):
        """Restore model, optimizer, scaler and step from a checkpoint dict."""
        if hasattr(self.model, 'module') and isinstance(self.model.module, nn.Module):
            self.model.module.load_state_dict(state_dict['model'])
        else:
            self.model.load_state_dict(state_dict['model'])
        self.optimizer.load_state_dict(state_dict['optimizer'])
        self.scaler.load_state_dict(state_dict['scaler'])
        self.step = state_dict['step']

    def save_to_checkpoint(self, filename='weights'):
        """Save ``<filename>-<step>.pt`` and refresh ``<filename>.pt`` to match it."""
        save_basename = f'{filename}-{self.step}.pt'
        save_name = f'{self.model_dir}/{save_basename}'
        link_name = f'{self.model_dir}/{filename}.pt'
        torch.save(self.state_dict(), save_name)
        if os.name == 'nt':
            # On Windows a second full copy is written instead of a symlink.
            torch.save(self.state_dict(), link_name)
        else:
            # Elsewhere ``<filename>.pt`` is a relative symlink to the newest file.
            if os.path.islink(link_name):
                os.unlink(link_name)
            os.symlink(save_basename, link_name)

    def restore_from_checkpoint(self, filename='weights'):
        """Load ``<filename>.pt`` from model_dir; return False if it does not exist."""
        try:
            checkpoint = torch.load(f'{self.model_dir}/{filename}.pt')
            self.load_state_dict(checkpoint)
            return True
        except FileNotFoundError:
            return False

    def train(self, max_steps=None):
        """Iterate over the dataset repeatedly until ``max_steps`` is reached.

        With ``max_steps=None`` the loop never returns.

        Raises:
            RuntimeError: If a training step produces a NaN loss.
        """
        device = next(self.model.parameters()).device
        while True:
            # Only the master replica shows a progress bar.
            for features in tqdm(self.dataset, desc=f'Epoch {self.step // len(self.dataset)}') if self.is_master else self.dataset:
                if max_steps is not None and self.step >= max_steps:
                    return
                # Move every tensor in the batch to the model's device.
                features = _nested_map(features, lambda x: x.to(
                    device) if isinstance(x, torch.Tensor) else x)
                loss = self.train_step(features)
                if torch.isnan(loss).any():
                    raise RuntimeError(
                        f'Detected NaN loss at step {self.step}.')
                if self.is_master:
                    # Log every 50 steps; checkpoint whenever the step count
                    # is a multiple of the number of batches in the dataset.
                    if self.step % 50 == 0:
                        self._write_summary(self.step, features, loss)
                    if self.step % len(self.dataset) == 0:
                        self.save_to_checkpoint()
                self.step += 1

    def train_step(self, features):
        """Run one optimization step on a batch and return its loss.

        ``features`` must provide ``'audio'`` (2-D, unpacked as ``N, T``) and
        ``'spectrogram'``.
        """
        # Clear gradients by dropping them rather than zeroing in place.
        for param in self.model.parameters():
            param.grad = None

        audio = features['audio']
        spectrogram = features['spectrogram']

        N, T = audio.shape
        device = audio.device
        self.noise_level = self.noise_level.to(device)

        with self.autocast:
            # Draw one random diffusion step per example in the batch.
            t = torch.randint(0, len(self.params.noise_schedule), [
                              N], device=audio.device)
            noise_scale = self.noise_level[t].unsqueeze(1)
            noise_scale_sqrt = noise_scale**0.5
            noise = torch.randn_like(audio)
            # Mix clean audio and Gaussian noise according to the sampled step.
            noisy_audio = noise_scale_sqrt * audio + \
                (1.0 - noise_scale)**0.5 * noise

            # The model is trained to predict the noise that was added.
            predicted = self.model(noisy_audio, spectrogram, t)
            loss = self.loss_fn(noise, predicted.squeeze(1))

        # Unscale before clipping so the norm is measured on true gradients.
        self.scaler.scale(loss).backward()
        self.scaler.unscale_(self.optimizer)
        # A falsy max_grad_norm falls back to 1e9 as the clipping threshold.
        self.grad_norm = nn.utils.clip_grad_norm_(
            self.model.parameters(), self.params.max_grad_norm or 1e9)
        self.scaler.step(self.optimizer)
        self.scaler.update()
        return loss

    def _write_summary(self, step, features, loss):
        """Log the first audio clip and spectrogram, the loss and the gradient norm."""
        # Reuse the writer if one exists; otherwise create it for this step.
        writer = self.summary_writer or SummaryWriter(
            self.model_dir, purge_step=step)
        writer.add_audio(
            'feature/audio', features['audio'][0], step, sample_rate=self.params.sample_rate)
        # The spectrogram is flipped along dim 1 before being logged as an image.
        writer.add_image('feature/spectrogram',
                         torch.flip(features['spectrogram'][:1], [1]), step)
        writer.add_scalar('train/loss', loss, step)
        writer.add_scalar('train/grad_norm', self.grad_norm, step)
        writer.flush()
        self.summary_writer = writer
164 | 
165 | 
def _train_impl(replica_id, model, dataset, args, params):
    """Create the optimizer and learner, resume from a checkpoint if any, and train."""
    torch.backends.cudnn.benchmark = True
    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)

    learner = DiffWaveLearner(
        args.model_dir, model, dataset, optimizer, params, fp16=args.fp16)
    # Only replica 0 acts as master inside the learner.
    learner.is_master = (replica_id == 0)
    learner.restore_from_checkpoint()
    learner.train(max_steps=args.max_steps)
175 | 
176 | 
def train(args, params):
    """Train in the current process on the default CUDA device."""
    loader = dataset_from_path(args.data_dirs, params)
    network = DiffWave(params).cuda()
    _train_impl(0, network, loader, args, params)
181 | 
182 | 
def train_distributed(replica_id, replica_count, port, args, params):
    """Worker entry point for one replica of multi-GPU training.

    Joins an NCCL process group at ``localhost:port``, binds the replica to
    CUDA device ``replica_id`` and trains a DistributedDataParallel model.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    torch.distributed.init_process_group(
        'nccl', rank=replica_id, world_size=replica_count)

    device = torch.device('cuda', replica_id)
    torch.cuda.set_device(device)
    network = DistributedDataParallel(
        DiffWave(params).to(device), device_ids=[replica_id])
    loader = dataset_from_path(args.data_dirs, params, is_distributed=True)
    _train_impl(replica_id, network, loader, args, params)
195 | 


--------------------------------------------------------------------------------
/src/diffwave/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import numpy as np
 17 | import torch
 18 | import torch.nn as nn
 19 | import torch.nn.functional as F
 20 | 
 21 | from math import sqrt
 22 | 
 23 | 
 24 | Linear = nn.Linear
 25 | ConvTranspose2d = nn.ConvTranspose2d
 26 | 
 27 | 
 28 | def Conv1d(*args, **kwargs):
 29 |   layer = nn.Conv1d(*args, **kwargs)
 30 |   nn.init.kaiming_normal_(layer.weight)
 31 |   return layer
 32 | 
 33 | 
 34 | @torch.jit.script
 35 | def silu(x):
 36 |   return x * torch.sigmoid(x)
 37 | 
 38 | 
 39 | class DiffusionEmbedding(nn.Module):
 40 |   def __init__(self, max_steps):
 41 |     super().__init__()
 42 |     self.register_buffer('embedding', self._build_embedding(max_steps), persistent=False)
 43 |     self.projection1 = Linear(128, 512)
 44 |     self.projection2 = Linear(512, 512)
 45 | 
 46 |   def forward(self, diffusion_step):
 47 |     if diffusion_step.dtype in [torch.int32, torch.int64]:
 48 |       x = self.embedding[diffusion_step]
 49 |     else:
 50 |       x = self._lerp_embedding(diffusion_step)
 51 |     x = self.projection1(x)
 52 |     x = silu(x)
 53 |     x = self.projection2(x)
 54 |     x = silu(x)
 55 |     return x
 56 | 
 57 |   def _lerp_embedding(self, t):
 58 |     low_idx = torch.floor(t).long()
 59 |     high_idx = torch.ceil(t).long()
 60 |     low = self.embedding[low_idx]
 61 |     high = self.embedding[high_idx]
 62 |     return low + (high - low) * (t - low_idx)
 63 | 
 64 |   def _build_embedding(self, max_steps):
 65 |     steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
 66 |     dims = torch.arange(64).unsqueeze(0)          # [1,64]
 67 |     table = steps * 10.0**(dims * 4.0 / 63.0)     # [T,64]
 68 |     table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
 69 |     return table
 70 | 
 71 | 
 72 | class SpectrogramUpsampler(nn.Module):
 73 |   def __init__(self, n_mels):
 74 |     super().__init__()
 75 |     self.conv1 = ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
 76 |     self.conv2 = ConvTranspose2d(1, 1,  [3, 32], stride=[1, 16], padding=[1, 8])
 77 | 
 78 |   def forward(self, x):
 79 |     x = torch.unsqueeze(x, 1)
 80 |     x = self.conv1(x)
 81 |     x = F.leaky_relu(x, 0.4)
 82 |     x = self.conv2(x)
 83 |     x = F.leaky_relu(x, 0.4)
 84 |     x = torch.squeeze(x, 1)
 85 |     return x
 86 | 
 87 | 
 88 | class ResidualBlock(nn.Module):
 89 |   def __init__(self, n_mels, residual_channels, dilation):
 90 |     super().__init__()
 91 |     self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
 92 |     self.diffusion_projection = Linear(512, residual_channels)
 93 |     self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
 94 |     self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
 95 | 
 96 |   def forward(self, x, conditioner, diffusion_step):
 97 |     diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
 98 |     conditioner = self.conditioner_projection(conditioner)
 99 | 
100 |     y = x + diffusion_step
101 |     y = self.dilated_conv(y) + conditioner
102 | 
103 |     gate, filter = torch.chunk(y, 2, dim=1)
104 |     y = torch.sigmoid(gate) * torch.tanh(filter)
105 | 
106 |     y = self.output_projection(y)
107 |     residual, skip = torch.chunk(y, 2, dim=1)
108 |     return (x + residual) / sqrt(2.0), skip
109 | 
110 | 
class DiffWave(nn.Module):
  """DiffWave network: a stack of residual blocks that maps noisy audio,
  a spectrogram and a diffusion step to a one-channel output.

  ``params`` must provide ``residual_channels``, ``noise_schedule``,
  ``n_mels``, ``dilation_cycle_length`` and ``residual_layers``.
  """

  def __init__(self, params):
    super().__init__()
    self.params = params
    channels = params.residual_channels
    self.input_projection = Conv1d(1, channels, 1)
    self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
    self.spectrogram_upsampler = SpectrogramUpsampler(params.n_mels)
    # Dilations cycle through 1, 2, 4, ... with period dilation_cycle_length.
    blocks = []
    for index in range(params.residual_layers):
      dilation = 2**(index % params.dilation_cycle_length)
      blocks.append(ResidualBlock(params.n_mels, channels, dilation))
    self.residual_layers = nn.ModuleList(blocks)
    self.skip_projection = Conv1d(channels, channels, 1)
    self.output_projection = Conv1d(channels, 1, 1)
    # The final projection's weight starts at zero.
    nn.init.zeros_(self.output_projection.weight)

  def forward(self, audio, spectrogram, diffusion_step):
    hidden = F.relu(self.input_projection(audio.unsqueeze(1)))

    step_embedding = self.diffusion_embedding(diffusion_step)
    conditioner = self.spectrogram_upsampler(spectrogram)

    skips = []
    for block in self.residual_layers:
      hidden, skip_connection = block(hidden, conditioner, step_embedding)
      skips.append(skip_connection)

    # Only the summed skip connections feed the output head.
    mixed = torch.stack(skips).sum(dim=0) / sqrt(len(self.residual_layers))
    mixed = F.relu(self.skip_projection(mixed))
    return self.output_projection(mixed)
144 | 


--------------------------------------------------------------------------------
/src/diffwave/params.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import numpy as np
17 | 
18 | 
class AttrDict(dict):
  """Dictionary whose items can also be read and written as attributes."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # Using the mapping itself as the instance namespace keeps keys and
    # attributes in sync.
    self.__dict__ = self

  def override(self, attrs):
    """Merge ``attrs`` into this mapping and return ``self``.

    ``attrs`` may be a dict, a list/tuple/set of such values (applied in
    iteration order), or None (no change). Anything else raises
    NotImplementedError.
    """
    if attrs is None:
      return self
    if isinstance(attrs, dict):
      self.__dict__.update(**attrs)
      return self
    if isinstance(attrs, (list, tuple, set)):
      for item in attrs:
        self.override(item)
      return self
    raise NotImplementedError
33 | 
34 | 
# Default hyperparameters, exposed as an attribute-accessible dict
# (e.g. ``params.sample_rate``). Use ``params.override(...)`` to change values.
params = AttrDict(
    # Training params
    batch_size=16,
    learning_rate=2e-4,
    max_grad_norm=None,

    # Data params
    # Preprocessing raises ValueError for audio whose sample rate differs.
    sample_rate=22050,
    n_mels=80,
    n_fft=1024,
    # STFT hop length; preprocessing uses 4x this value as the window length.
    hop_samples=256,
    crop_mel_frames=62,  # Probably an error in paper.

    # Model params
    residual_layers=30,
    residual_channels=64,
    dilation_cycle_length=10,
    # 50 linearly spaced values from 1e-4 to 0.05 (both endpoints included).
    noise_schedule=np.linspace(1e-4, 0.05, 50).tolist(),
    inference_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
)
55 | 


--------------------------------------------------------------------------------
/src/diffwave/preprocess.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 LMNT, Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import numpy as np
17 | import torch
18 | import torchaudio as T
19 | import torchaudio.transforms as TT
20 | 
21 | from argparse import ArgumentParser
22 | from concurrent.futures import ProcessPoolExecutor
23 | from glob import glob
24 | from tqdm import tqdm
25 | 
26 | from diffwave.params import params
27 | 
28 | 
def transform(filename):
  """Compute a normalized log-mel spectrogram for one wav file and save it.

  The result is written next to the input as ``<filename>.spec.npy``.

  Args:
    filename: path to a wav file whose sample rate equals
      ``params.sample_rate``.

  Raises:
    ValueError: if the file's sample rate differs from ``params.sample_rate``.
  """
  waveform, sr = T.load_wav(filename)
  if sr != params.sample_rate:
    raise ValueError(f'Invalid sample rate {sr}.')

  # Only the first channel is used; scale to [-1, 1] and clamp.
  waveform = torch.clamp(waveform[0] / 32767.5, -1.0, 1.0)

  to_mel = TT.MelSpectrogram(
      sample_rate=sr,
      win_length=params.hop_samples * 4,
      hop_length=params.hop_samples,
      n_fft=params.n_fft,
      f_min=20.0,
      f_max=sr / 2.0,
      n_mels=params.n_mels,
      power=1.0,
      normalized=True,
  )

  with torch.no_grad():
    mel = to_mel(waveform)
    # Magnitude -> dB with a 1e-5 floor, shifted down by 20 dB.
    mel_db = 20 * torch.log10(torch.clamp(mel, min=1e-5)) - 20
    # Map [-100, 0] dB onto [0, 1] and clamp anything outside.
    mel_unit = torch.clamp((mel_db + 100) / 100, 0.0, 1.0)
    np.save(f'{filename}.spec.npy', mel_unit.cpu().numpy())
53 | 
54 | 
def main(args):
  """Preprocess every ``.wav`` file found recursively under ``args.dir``.

  Files are handed to ``transform`` through a process pool; a progress bar
  tracks completion.
  """
  wav_paths = glob(f'{args.dir}/**/*.wav', recursive=True)
  with ProcessPoolExecutor() as pool:
    pending = pool.map(transform, wav_paths)
    # Draining the iterator drives the work and surfaces worker exceptions.
    for _ in tqdm(pending, desc='Preprocessing', total=len(wav_paths)):
      pass
59 | 
60 | 
if __name__ == '__main__':
  # Command-line entry point: one positional argument, the dataset directory.
  cli = ArgumentParser(description='prepares a dataset to train DiffWave')
  cli.add_argument(
      'dir',
      help='directory containing .wav files for training')
  main(cli.parse_args())
66 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | import torch
 4 | import torchaudio
 5 | 
 6 | 
 7 | import models as module_arch
 8 | from utils.utils import get_instance
 9 | from inference import *
10 | 
11 | 
def main(config, ckpt, infile, outfile, T, amp, deterministic):
    """Re-synthesize ``infile`` from its mel spectrogram and save to ``outfile``.

    Args:
        config: parsed training configuration (needs ``'trainer'`` and
            ``'arch'`` sections).
        ckpt: path to a checkpoint containing at least ``'ema_model'``.
        infile: audio file to read; channels are averaged to mono.
        outfile: destination path for the generated audio.
        T: number of reverse-process steps.
        amp: forwarded as ``with_amp`` to the sampler.
        deterministic: use ``reverse_process_ddim`` when true, otherwise
            ``reverse_process_new``.
    """
    device = torch.device('cuda')
    trainer_cfg = config['trainer']
    state = torch.load(ckpt, map_location=device)

    n_fft = trainer_cfg['n_fft']
    hop_length = trainer_cfg['hop_length']
    n_mels = trainer_cfg['n_mels']
    sr = trainer_cfg['sr']
    train_T = trainer_cfg['train_T']

    model = get_instance(module_arch, config['arch']).to(device)
    mel_spec = module_arch.MelSpec(
        sr, n_fft, hop_length=hop_length, f_min=20, f_max=8000, n_mels=n_mels
    ).to(device)
    model.load_state_dict(state['ema_model'])

    if 'noise_scheduler' not in state:
        # No learned schedule in the checkpoint: rebuild the cosine schedule
        # from the configured log-SNR bounds.
        max_log_snr = trainer_cfg['max_log_snr']
        min_log_snr = trainer_cfg['min_log_snr']
        noise_scheduler = module_arch.CosineScheduler(
            gamma0=-max_log_snr, gamma1=-min_log_snr).to(device)
    else:
        noise_scheduler = module_arch.NoiseScheduler().to(device)
        noise_scheduler.load_state_dict(
            state['noise_scheduler'], strict=False)
        noise_scheduler.eval()
    model.eval()

    # NOTE(review): ``sr`` is overwritten with the input file's rate here and
    # is not compared against the configured rate used to build ``mel_spec``.
    waveform, sr = torchaudio.load(infile)
    mono = waveform.mean(0, keepdim=True).to(device)
    mels = mel_spec(mono)

    z_1 = torch.randn_like(mono)

    if train_T:
        # Discrete-time model: snap the T + 1 grid points onto the training
        # grid before normalizing to [0, 1].
        grid = torch.linspace(0, train_T, T + 1,
                              device=device).round().long() / train_T
    else:
        grid = torch.linspace(0, 1, T + 1, device=device)
    gamma, steps = noise_scheduler(grid)

    with torch.no_grad():
        sampler = reverse_process_ddim if deterministic else reverse_process_new
        z_0 = sampler(z_1, mels, gamma, steps, model, with_amp=amp)

    generated = z_0.squeeze().clip(-0.99, 0.99)
    torchaudio.save(outfile, generated.unsqueeze(0).cpu(), sr)
60 | 
61 | 
if __name__ == '__main__':
    # Command-line entry point for inference.
    parser = argparse.ArgumentParser(description='Inferencer')
    parser.add_argument('config', type=str, help='config file')
    parser.add_argument('ckpt', type=str)
    parser.add_argument('infile', type=str)
    parser.add_argument('outfile', type=str)
    parser.add_argument('-T', type=int, default=20)
    parser.add_argument('--amp', action='store_true')
    parser.add_argument('--ddim', action='store_true')
    args = parser.parse_args()

    # Use a context manager so the config file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(args.config) as config_file:
        config = json.load(config_file)
    main(config, args.ckpt, args.infile, args.outfile, args.T, args.amp, args.ddim)
75 | 


--------------------------------------------------------------------------------
/train_continuous.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | from torch import optim, nn
  4 | from torch.cuda import amp
  5 | from torch.distributed.optim import ZeroRedundancyOptimizer
  6 | from contiguous_params import ContiguousParams
  7 | import torchaudio
  8 | from torchinfo import summary
  9 | import argparse
 10 | import json
 11 | from datetime import datetime
 12 | import os
 13 | from random import randrange, sample, uniform
 14 | from jsonschema import validate
 15 | from ignite.engine import Engine, Events
 16 | from ignite.handlers import Checkpoint, EMAHandler
 17 | from ignite.contrib.handlers.tensorboard_logger import *
 18 | from ignite.contrib.engines import common
 19 | from ignite import distributed as idist
 20 | import torch_optimizer
 21 | 
 22 | 
 23 | from utils.schema import CONFIG_SCHEMA
 24 | from utils.utils import gamma2logas, get_instance, gamma2snr, snr2as, gamma2as
 25 | import models as module_arch
 26 | from inference import reverse_process_new
 27 | 
 28 | from train_distributed import get_dataflow
 29 | 
 30 | 
 31 | def initialize(config: dict, device):
 32 |     model = get_instance(module_arch, config['arch']).to(device)
 33 | 
 34 |     parameters = model.parameters()
 35 |     parameters = ContiguousParams(parameters)
 36 | 
 37 |     model = idist.auto_model(model)
 38 | 
 39 |     optim_args = config['optimizer']['args']
 40 |     try:
 41 |         optim_type = getattr(optim, config['optimizer']['type'])
 42 |     except AttributeError:
 43 |         optim_type = getattr(torch_optimizer, config['optimizer']['type'])
 44 |     optimizer = ZeroRedundancyOptimizer(
 45 |         parameters.contiguous(), optim_type, parameters_as_bucket_view=False, **optim_args)
 46 | 
 47 |     scheduler = get_instance(
 48 |         optim.lr_scheduler, config['lr_scheduler'], optimizer)
 49 | 
 50 |     return model, optimizer, scheduler
 51 | 
 52 | 
def create_trainer(model, mel_spec, noise_scheduler, optimizer: ZeroRedundancyOptimizer, scheduler, device, trainer_config, train_sampler, checkpoint_path: str):
    """Build the ignite training engine for the fixed noise schedule.

    Args:
        model: network called as ``model(z_t, mels, t)`` to predict the noise.
        mel_spec: callable that turns a waveform batch into conditioning mels.
        noise_scheduler: object providing ``gamma0`` and ``gamma1``.
        optimizer: sharded optimizer; its state is consolidated before saving.
        scheduler: learning-rate scheduler.
        device: device batches are moved to.
        trainer_config: dict with ``save_dir``, ``eval_interval``, ``with_amp``.
        train_sampler: sampler forwarded to the common training handlers.
        checkpoint_path: optional checkpoint to resume from (falsy to skip).

    Returns:
        ``(trainer, ema_model)``; ``ema_model`` is ``None`` on ranks other
        than 0.
    """
    save_dir = trainer_config['save_dir']
    eval_interval = trainer_config['eval_interval']
    with_amp = trainer_config['with_amp']

    rank = idist.get_rank()

    scaler = amp.GradScaler(enabled=with_amp)

    def process_function(engine, batch):
        """Run one optimization step on ``batch`` and return ``{'loss': float}``."""
        model.train()
        optimizer.zero_grad()

        x = batch
        x = x.to(device)
        noise = torch.randn_like(x)
        mels = mel_spec(x)

        N = x.shape[0]

        # One shared random offset plus N evenly spaced points, wrapped into
        # [0, 1): the batch's time values are spread across the interval.
        t = torch.remainder(
            uniform(0, 1) + torch.arange(N, device=device) / N, 1.)
        # Linear interpolation between gamma0 (t = 0) and gamma1 (t = 1).
        gamma_t = t * noise_scheduler.gamma1 + \
            (1 - t) * noise_scheduler.gamma0

        with amp.autocast(enabled=with_amp):
            alpha, var = gamma2as(gamma_t)
            # Noised input: per-item scale on the signal plus scaled noise.
            z_t = alpha[:, None] * x + var.sqrt()[:, None] * noise

            noise_hat = model(z_t, mels, t)
            # Noise-prediction MSE weighted by the constant (gamma1 - gamma0).
            loss = 0.5 * F.mse_loss(noise_hat, noise) * \
                (noise_scheduler.gamma1 - noise_scheduler.gamma0)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        result = {'loss': loss.item()}
        return result

    trainer = Engine(process_function)

    # Only rank 0 tracks an exponential moving average of the weights and
    # includes it in the objects to checkpoint.
    ema_model = None
    if rank == 0:
        ema_handler = EMAHandler(model, momentum=0.0001)
        ema_model = ema_handler.ema_model
        ema_handler.attach(trainer, name="ema_momentum",
                           event=Events.ITERATION_COMPLETED)

        to_save = {
            'model': model,
            'ema_model': ema_model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'trainer': trainer,
            'scaler': scaler
        }
    else:
        to_save = {
            'model': model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'trainer': trainer,
            'scaler': scaler
        }

    # Registered before the common handlers below, so the sharded optimizer
    # state is gathered on every rank before the checkpoint handler runs.
    @trainer.on(Events.ITERATION_COMPLETED(every=eval_interval))
    def consolidate_state_dict():
        optimizer.consolidate_state_dict()
        idist.barrier()

    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save if rank == 0 else None,
        save_every_iters=eval_interval,
        output_path=save_dir,
        lr_scheduler=scheduler if not isinstance(
            scheduler, optim.lr_scheduler.ReduceLROnPlateau) else None,
        output_names=['loss'],
        with_pbars=True if rank == 0 else False,
        with_pbar_on_iters=True,
        n_saved=2,
        log_every_iters=1,
        clear_cuda_cache=False
    )

    # ReduceLROnPlateau needs the monitored metric, so it is stepped manually
    # rather than through the common handlers.
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED, lambda engine: scheduler.step(
                engine.state.metrics['loss'])
        )

    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        # Checkpoints without EMA weights seed the EMA model from 'model'.
        if 'ema_model' in to_save and 'ema_model' not in checkpoint:
            checkpoint['ema_model'] = checkpoint['model']
        Checkpoint.load_objects(
            to_load=to_save, checkpoint=checkpoint, strict=False)

    return trainer, ema_model
154 | 
155 | 
def get_logger(trainer, model, optimizer, log_dir, model_name, interval):
    """Create a TensorBoard logger for ``trainer``.

    Logs are written to ``<log_dir>/<model_name>/<MMDD_HHMMSS>``. In addition
    to the default per-iteration logging, weight histograms of ``model`` are
    recorded every ``interval`` iterations.

    Returns:
        The configured TensorBoard logger.
    """
    run_stamp = datetime.now().strftime('%m%d_%H%M%S')
    run_dir = os.path.join(log_dir, model_name, run_stamp)

    tb_logger = common.setup_tb_logging(
        output_path=run_dir,
        trainer=trainer,
        optimizers=optimizer,
        log_every_iters=1
    )

    histogram_event = Events.ITERATION_COMPLETED(every=interval)
    tb_logger.attach(
        trainer,
        event_name=histogram_event,
        log_handler=WeightsHistHandler(model)
    )
    return tb_logger
173 | 
174 | 
def training(local_rank, config: dict):
    """Per-process training entry point for the fixed noise schedule.

    Args:
        local_rank: local process index supplied by the launcher (unused here).
        config: validated configuration merged with the CLI arguments; must
            contain ``name``, ``checkpoint`` and a ``trainer`` section.
    """
    rank = idist.get_rank()
    device = idist.device()

    print(rank, ": run with config:", config, "- backend=", idist.backend())
    print(f'world size = {idist.get_world_size()}')

    model_name = config['name']
    checkpoint_path = config['checkpoint']
    trainer_config = config['trainer']

    log_dir = trainer_config['log_dir']
    eval_file = trainer_config['eval_file']
    n_fft = trainer_config['n_fft']
    hop_length = trainer_config['hop_length']
    n_mels = trainer_config['n_mels']
    sr = trainer_config['sr']
    eval_interval = trainer_config['eval_interval']
    eval_T = trainer_config['eval_T']
    with_amp = trainer_config['with_amp']
    max_log_snr = trainer_config['max_log_snr']
    min_log_snr = trainer_config['min_log_snr']

    train_loader = get_dataflow(config)
    model, optimizer, scheduler = initialize(config, device)

    # gamma is the negated log-SNR, so the bounds are the negated config values.
    noise_scheduler = module_arch.CosineScheduler(
        gamma0=-max_log_snr, gamma1=-min_log_snr).to(device)

    mel_spec = module_arch.MelSpec(sr, n_fft, hop_length=hop_length,
                                   f_min=20, f_max=8000, n_mels=n_mels).to(device)

    trainer, ema_model = create_trainer(model, mel_spec, noise_scheduler, optimizer,
                                        scheduler, device, trainer_config, train_loader.sampler,
                                        checkpoint_path)

    # Model summary, TensorBoard logging and audio previews run on rank 0 only.
    if rank == 0:
        # add model graph
        # use torchinfo
        # Pull the first batch from the loader and keep a single item from it.
        for test_input in train_loader:
            break
        test_input = test_input[:1].to(device)
        test_mels = mel_spec(test_input)
        t = torch.tensor([0.], device=device)
        summary(ema_model,
                input_data=(test_input, test_mels, t),
                device=device,
                col_names=("input_size", "output_size", "num_params", "kernel_size",
                           "mult_adds"),
                col_width=16,
                row_settings=("depth", "var_names"))

        tb_logger = get_logger(trainer, model, optimizer,
                               log_dir, model_name, eval_interval)

        # The evaluation clip must match the configured sample rate; it is
        # averaged to mono and given a leading batch axis.
        eval_x, eval_sr = torchaudio.load(os.path.expanduser(eval_file))
        assert sr == eval_sr
        eval_x = eval_x.mean(0).to(device).unsqueeze(0)
        eval_mels = mel_spec(eval_x)

        @torch.no_grad()
        def predict_samples(engine):
            """Sample audio from the EMA model and log it to TensorBoard."""
            z_1 = torch.randn_like(eval_x)
            steps = torch.linspace(0, 1, eval_T + 1, device=device)
            gamma, steps = noise_scheduler(steps)

            z_0 = reverse_process_new(z_1, eval_mels, gamma,
                                      steps, ema_model, with_amp=with_amp)

            predict = z_0.squeeze().clip(-0.99, 0.99)
            tb_logger.writer.add_audio(
                'predict', predict, engine.state.iteration, sample_rate=sr)

        trainer.add_event_handler(Events.ITERATION_COMPLETED(
            every=eval_interval), predict_samples)

    # A single pass over the loader; the run length is set by the data flow.
    e = trainer.run(train_loader, max_epochs=1)

    if rank == 0:
        tb_logger.close()
255 | 
256 | 
if __name__ == "__main__":
    # Command-line entry point: launch one training process per visible GPU.
    parser = argparse.ArgumentParser(
        description='DiffWave Fixed-Noise Training')
    parser.add_argument('config', type=str, help='config file')
    parser.add_argument('--checkpoint', type=str, default=None,
                        help='training checkpoint')

    args = parser.parse_args()

    # Use a context manager so the config file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(args.config) as config_file:
        config = json.load(config_file)
    validate(config, schema=CONFIG_SCHEMA)

    # Merge the CLI arguments in after validation; this adds the 'config'
    # path and the 'checkpoint' value read by ``training``.
    args_dict = vars(args)
    config.update(args_dict)

    backend = 'nccl'
    dist_configs = {
        'nproc_per_node': torch.cuda.device_count()
    }

    with idist.Parallel(backend=backend, **dist_configs) as parallel:
        parallel.run(training, config)
279 | 


--------------------------------------------------------------------------------
/train_distributed.py:
--------------------------------------------------------------------------------
  1 | from torch.autograd import grad
  2 | import math
  3 | import torch
  4 | import torch.nn.functional as F
  5 | from torch import optim, nn
  6 | from torch.cuda import amp
  7 | from torch.distributed.optim import ZeroRedundancyOptimizer
  8 | from contiguous_params import ContiguousParams
  9 | import torchaudio
 10 | from torchinfo import summary
 11 | import argparse
 12 | import json
 13 | from datetime import datetime
 14 | from itertools import chain
 15 | import matplotlib.pyplot as plt
 16 | import os
 17 | from random import randrange, sample, uniform
 18 | from jsonschema import validate
 19 | from ignite.engine import Engine, Events
 20 | from ignite.handlers import Checkpoint, EMAHandler
 21 | from ignite.contrib.handlers.tensorboard_logger import *
 22 | from ignite.contrib.engines import common
 23 | from ignite import distributed as idist
 24 | import torch_optimizer
 25 | 
 26 | 
 27 | from utils.schema import CONFIG_SCHEMA
 28 | from utils.utils import gamma2logas, get_instance, gamma2snr, snr2as, gamma2as
 29 | import models as module_arch
 30 | import datasets as module_data
 31 | import loss as module_loss
 32 | from inference import reverse_process, reverse_process_new
 33 | 
 34 | 
 35 | def get_dataflow(config: dict):
 36 |     train_data = get_instance(module_data, config['dataset'])
 37 |     train_loader = idist.auto_dataloader(train_data, **config['data_loader'])
 38 |     return train_loader
 39 | 
 40 | 
 41 | def initialize(config: dict, device):
 42 |     model = get_instance(module_arch, config['arch']).to(device)
 43 |     noise_scheduler = module_arch.NoiseScheduler().to(device)
 44 | 
 45 |     parameters = chain(model.parameters(), noise_scheduler.parameters())
 46 |     parameters = ContiguousParams(parameters)
 47 | 
 48 |     model = idist.auto_model(model)
 49 |     noise_scheduler = idist.auto_model(noise_scheduler)
 50 | 
 51 |     optim_args = config['optimizer']['args']
 52 |     try:
 53 |         optim_type = getattr(optim, config['optimizer']['type'])
 54 |     except AttributeError:
 55 |         optim_type = getattr(torch_optimizer, config['optimizer']['type'])
 56 |     optimizer = ZeroRedundancyOptimizer(
 57 |         parameters.contiguous(), optim_type, parameters_as_bucket_view=False, **optim_args)
 58 |     # optimizer = idist.auto_optim(optimizer)
 59 | 
 60 |     scheduler = get_instance(
 61 |         optim.lr_scheduler, config['lr_scheduler'], optimizer)
 62 | 
 63 |     return model, noise_scheduler, optimizer, scheduler
 64 | 
 65 | 
 66 | def get_logger(trainer, model, noise_scheduler, optimizer, log_dir, model_name, interval):
 67 |     # Create a logger
 68 |     start_time = datetime.now().strftime('%m%d_%H%M%S')
 69 |     tb_logger = common.setup_tb_logging(
 70 |         output_path=os.path.join(log_dir, model_name, start_time),
 71 |         trainer=trainer,
 72 |         optimizers=optimizer,
 73 |         log_every_iters=1
 74 |     )
 75 | 
 76 |     tb_logger.attach(
 77 |         trainer,
 78 |         event_name=Events.ITERATION_COMPLETED(every=interval),
 79 |         log_handler=WeightsHistHandler(model)
 80 |     )
 81 |     tb_logger.attach(
 82 |         trainer,
 83 |         event_name=Events.ITERATION_COMPLETED(every=interval),
 84 |         log_handler=WeightsHistHandler(noise_scheduler)
 85 |     )
 86 | 
 87 |     return tb_logger
 88 | 
 89 | 
def create_trainer(model, mel_spec, noise_scheduler, optimizer: ZeroRedundancyOptimizer, criterion, scheduler, device, trainer_config, train_sampler, model_name: str, checkpoint_path: str):
    """Build the ignite training engine for the learned noise schedule.

    Args:
        model: network called as ``model(z_t, mels, gamma_hat)``.
        mel_spec: callable that turns a waveform batch into conditioning mels.
        noise_scheduler: module mapping time values to ``(gamma, gamma_hat)``;
            may be wrapped in (Distributed)DataParallel.
        optimizer: sharded optimizer; its state is consolidated before saving.
        criterion: loss callable returning ``(loss, extra_dict)``.
        scheduler: learning-rate scheduler.
        device: device batches are moved to.
        trainer_config: dict with ``extra_monitor``, ``save_dir``,
            ``eval_interval``, ``train_T`` and ``with_amp``.
        train_sampler: sampler forwarded to the common training handlers.
        model_name: not used inside this function.
        checkpoint_path: optional checkpoint to resume from (falsy to skip).

    Returns:
        ``(trainer, ema_model)``; ``ema_model`` is ``None`` on ranks other
        than 0.
    """
    extra_monitor = trainer_config['extra_monitor']
    save_dir = trainer_config['save_dir']
    eval_interval = trainer_config['eval_interval']
    train_T = trainer_config['train_T']
    with_amp = trainer_config['with_amp']

    rank = idist.get_rank()

    scaler = amp.GradScaler(enabled=with_amp)

    # gamma0 / gamma1 live on the underlying module, so unwrap any parallel
    # wrapper before reading them.
    if isinstance(noise_scheduler, nn.parallel.DistributedDataParallel) or isinstance(noise_scheduler, nn.parallel.DataParallel):
        base_noise_scheduler = noise_scheduler.module
    else:
        base_noise_scheduler = noise_scheduler

    def process_function(engine, batch):
        """Run one optimization step; returns the loss plus the criterion's extras."""
        model.train()
        noise_scheduler.train()
        optimizer.zero_grad()

        x = batch
        x = x.to(device)
        noise = torch.randn_like(x)
        mels = mel_spec(x)

        N = x.shape[0]
        if train_T:
            # Discrete-time objective: pick a step index s and its successor
            # t = s + 1 on a grid of train_T steps.
            s = torch.remainder(
                uniform(0, 1) + torch.arange(N, device=device) / N, 1.)
            s_idx = (s * train_T).long()
            t_idx = s_idx + 1

            t, s = t_idx / train_T, s_idx / train_T
            with amp.autocast(enabled=with_amp):
                # Evaluate the schedule for t and s in one call, then split.
                gamma_ts, gamma_hat = noise_scheduler(torch.cat([t, s], dim=0))
                gamma_t, gamma_s = gamma_ts[:N], gamma_ts[N:]
                alpha_t, var_t = gamma2as(gamma_t)

                z_t = alpha_t[:, None] * x + var_t.sqrt()[:, None] * noise

                noise_hat = model(z_t, mels, gamma_hat[:N])

                loss, extra_dict = criterion(
                    base_noise_scheduler.gamma0,
                    base_noise_scheduler.gamma1,
                    torch.expm1(gamma_t - gamma_s) * train_T,
                    x, noise, noise_hat)
        else:
            # Continuous-time objective: t needs gradients so d(gamma)/dt can
            # be obtained through autograd.
            t = torch.remainder(
                uniform(0, 1) + torch.arange(N, device=device) / N, 1.)
            t = t.clone().detach().requires_grad_(True)

            with amp.autocast(enabled=with_amp):
                gamma_t, gamma_hat = noise_scheduler(t)
                gamma_hat.retain_grad()

                # alpha_t, var_t = gamma2as(gamma_t)
                log_alpha_t, log_var_t = gamma2logas(gamma_t)
                alpha_t, std_t = torch.exp(
                    log_alpha_t), torch.exp(log_var_t * 0.5)
                z_t = alpha_t[:, None] * x + std_t[:, None] * noise

                noise_hat = model(z_t, mels, gamma_hat)
                # create_graph=True keeps the derivative differentiable so the
                # loss can backpropagate into the scheduler through it.
                d_gamma_t, *_ = grad(gamma_t.sum(), t, create_graph=True)
                loss, extra_dict = criterion(
                    base_noise_scheduler.gamma0,
                    base_noise_scheduler.gamma1,
                    d_gamma_t,
                    x, noise, noise_hat)

                # Rescale the gradient flowing into gamma_hat by
                # 2 * loss_T_raw during the backward pass.
                # NOTE(review): presumably a variance-reduction term for the
                # learned schedule — confirm against the loss definition.
                loss_T_raw = extra_dict['loss_T_raw']
                handle = gamma_hat.register_hook(
                    lambda grad: 2 * grad * loss_T_raw.to(grad.dtype))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # The hook is only registered in the continuous-time branch.
        if not train_T:
            handle.remove()
        result = {'loss': loss.item()}
        result.update(extra_dict)
        return result

    trainer = Engine(process_function)

    # Only rank 0 tracks an exponential moving average of the weights and
    # includes it in the objects to checkpoint.
    ema_model = None
    if rank == 0:
        ema_handler = EMAHandler(model, momentum=0.0001)
        ema_model = ema_handler.ema_model
        ema_handler.attach(trainer, name="ema_momentum",
                           event=Events.ITERATION_COMPLETED)

        to_save = {
            'model': model,
            'ema_model': ema_model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'trainer': trainer,
            'noise_scheduler': noise_scheduler,
            'scaler': scaler
        }
    else:
        to_save = {
            'model': model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'trainer': trainer,
            'noise_scheduler': noise_scheduler,
            'scaler': scaler
        }

    # Registered before the common handlers below, so the sharded optimizer
    # state is gathered on every rank before the checkpoint handler runs.
    @trainer.on(Events.ITERATION_COMPLETED(every=eval_interval))
    def consolidate_state_dict():
        optimizer.consolidate_state_dict()
        idist.barrier()

    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save if rank == 0 else None,
        save_every_iters=eval_interval,
        output_path=save_dir,
        lr_scheduler=scheduler if not isinstance(
            scheduler, optim.lr_scheduler.ReduceLROnPlateau) else None,
        output_names=['loss'] + extra_monitor,
        with_pbars=True if rank == 0 else False,
        with_pbar_on_iters=True,
        n_saved=2,
        log_every_iters=1,
        clear_cuda_cache=False
    )

    # ReduceLROnPlateau needs the monitored metric, so it is stepped manually
    # rather than through the common handlers.
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED, lambda engine: scheduler.step(
                engine.state.metrics['loss'])
        )

    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        # Checkpoints without EMA weights seed the EMA model from 'model'.
        if 'ema_model' in to_save and 'ema_model' not in checkpoint:
            checkpoint['ema_model'] = checkpoint['model']
        Checkpoint.load_objects(
            to_load=to_save, checkpoint=checkpoint, strict=False)

    return trainer, ema_model
238 | 
239 | 
def training(local_rank, config: dict):
    """Per-process training entry point for the learned noise schedule.

    Args:
        local_rank: local process index supplied by the launcher (unused here).
        config: validated configuration merged with the CLI arguments; must
            contain ``name``, ``checkpoint`` and a ``trainer`` section.
    """
    rank = idist.get_rank()
    device = idist.device()

    print(rank, ": run with config:", config, "- backend=", idist.backend())
    print(f'world size = {idist.get_world_size()}')

    model_name = config['name']
    checkpoint_path = config['checkpoint']
    trainer_config = config['trainer']

    log_dir = trainer_config['log_dir']
    eval_file = trainer_config['eval_file']
    n_fft = trainer_config['n_fft']
    hop_length = trainer_config['hop_length']
    n_mels = trainer_config['n_mels']
    sr = trainer_config['sr']
    eval_interval = trainer_config['eval_interval']
    train_T = trainer_config['train_T']
    eval_T = trainer_config['eval_T']
    with_amp = trainer_config['with_amp']

    train_loader = get_dataflow(config)
    model, noise_scheduler, optimizer, scheduler = initialize(
        config, device)

    criterion = module_loss.diffusion_elbo

    mel_spec = module_arch.MelSpec(sr, n_fft, hop_length=hop_length,
                                   f_min=20, f_max=8000, n_mels=n_mels).to(device)

    trainer, ema_model = create_trainer(model, mel_spec, noise_scheduler, optimizer,
                                        criterion, scheduler, device, trainer_config, train_loader.sampler,
                                        model_name, checkpoint_path)

    # Model summary, TensorBoard logging and previews run on rank 0 only.
    if rank == 0:
        # add model graph
        # use torchinfo
        # Pull the first batch from the loader and keep a single item from it.
        for test_input in train_loader:
            break
        test_input = test_input[:1].to(device)
        test_mels = mel_spec(test_input)
        t = torch.tensor([0], device=device)
        summary(ema_model,
                input_data=(test_input, test_mels, t),
                device=device,
                col_names=("input_size", "output_size", "num_params", "kernel_size",
                           "mult_adds"),
                col_width=16,
                row_settings=("depth", "var_names"))

        tb_logger = get_logger(trainer, model, noise_scheduler, optimizer,
                               log_dir, model_name, eval_interval)

        # The evaluation clip must match the configured sample rate; it is
        # averaged to mono and given a leading batch axis.
        eval_x, eval_sr = torchaudio.load(os.path.expanduser(eval_file))
        assert sr == eval_sr
        eval_x = eval_x.mean(0).to(device).unsqueeze(0)
        eval_mels = mel_spec(eval_x)

        @torch.no_grad()
        def predict_samples(engine):
            """Sample audio from the EMA model and log it to TensorBoard."""
            # model.eval()
            noise_scheduler.eval()

            z_1 = torch.randn_like(eval_x)

            if train_T:
                # Discrete-time training: snap the eval grid onto the
                # training grid before normalizing to [0, 1].
                steps = torch.linspace(0, train_T, eval_T + 1,
                                       device=device).round().long()
                gamma, steps = noise_scheduler(steps / train_T)
            else:
                steps = torch.linspace(0, 1, eval_T + 1, device=device)
                gamma, steps = noise_scheduler(steps)

            z_0 = reverse_process_new(z_1, eval_mels, gamma,
                                      steps, ema_model, with_amp=with_amp)

            predict = z_0.squeeze().clip(-0.99, 0.99)
            tb_logger.writer.add_audio(
                'predict', predict, engine.state.iteration, sample_rate=sr)

        @torch.no_grad()
        def plot_noise_curve(engine):
            """Plot the current log-SNR curve over [0, 1] and log the figure."""
            figure = plt.figure()
            steps = torch.linspace(0, 1, 100, device=device)
            # The scheduler's first output is gamma; log-SNR is its negation.
            log_snr = -noise_scheduler(steps)[0].detach().cpu().numpy()
            steps = steps.cpu().numpy()
            plt.plot(steps, log_snr)
            tb_logger.writer.add_figure(
                'log_snr', figure, engine.state.iteration)

        trainer.add_event_handler(Events.ITERATION_COMPLETED(
            every=eval_interval), predict_samples)
        trainer.add_event_handler(Events.ITERATION_COMPLETED(
            every=eval_interval), plot_noise_curve)

    # A single pass over the loader; the run length is set by the data flow.
    e = trainer.run(train_loader, max_epochs=1)

    if rank == 0:
        tb_logger.close()
340 | 
341 | 
if __name__ == "__main__":
    # Command line: a mandatory JSON config path plus an optional checkpoint.
    parser = argparse.ArgumentParser(description='DiffWave')
    parser.add_argument('config', type=str, help='config file')
    parser.add_argument('--checkpoint', type=str, default=None,
                        help='training checkpoint')

    args = parser.parse_args()

    # Read the config inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(args.config) as config_file:
        config = json.load(config_file)
    validate(config, schema=CONFIG_SCHEMA)

    # Merge the parsed CLI arguments ('config' and 'checkpoint') into the
    # config dict; this happens after validation, so the schema only sees the
    # contents of the JSON file.
    config.update(vars(args))

    backend = 'nccl'
    dist_configs = {
        # One worker process per CUDA device visible on this node.
        'nproc_per_node': torch.cuda.device_count(),
    }

    with idist.Parallel(backend=backend, **dist_configs) as parallel:
        parallel.run(training, config)
363 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yoyolicoris/variational-diffwave/b3edc2f1c3dc13fb72c068fb1dd0b24a2b9b423a/utils/__init__.py


--------------------------------------------------------------------------------
/utils/schema.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "$schema": "http://json-schema.org/draft-07/schema#",
  3 |     "definitions": {
  4 |         "instance": {
  5 |             "type": "object",
  6 |             "properties": {
  7 |                 "type": {
  8 |                     "type": "string"
  9 |                 },
 10 |                 "args": {
 11 |                     "type": "object"
 12 |                 }
 13 |             },
 14 |             "required": [
 15 |                 "type",
 16 |                 "args"
 17 |             ]
 18 |         }
 19 |     },
 20 |     "type": "object",
 21 |     "properties": {
 22 |         "name": {
 23 |             "type": "string"
 24 |         },
 25 |         "arch": {
 26 |             "$ref": "#/definitions/instance"
 27 |         },
 28 |         "dataset": {
 29 |             "$ref": "#/definitions/instance"
 30 |         },
 31 |         "optimizer": {
 32 |             "$ref": "#/definitions/instance"
 33 |         },
 34 |         "lr_scheduler": {
 35 |             "$ref": "#/definitions/instance"
 36 |         },
 37 |         "data_loader": {
 38 |             "type": "object",
 39 |             "properties": {
 40 |                 "batch_size": {
 41 |                     "type": "integer"
 42 |                 },
 43 |                 "shuffle": {
 44 |                     "type": "boolean"
 45 |                 }
 46 |             },
 47 |             "required": [
 48 |                 "batch_size",
 49 |                 "shuffle"
 50 |             ]
 51 |         },
 52 |         "trainer": {
 53 |             "type": "object",
 54 |             "properties": {
 55 |                 "save_dir": {
 56 |                     "type": "string"
 57 |                 },
 58 |                 "log_dir": {
 59 |                     "type": "string"
 60 |                 },
 61 |                 "eval_file": {
 62 |                     "type": "string"
 63 |                 },
 64 |                 "cum_steps": {
 65 |                     "type": "integer"
 66 |                 },
 67 |                 "amp_enabled": {
 68 |                     "type": "boolean"
 69 |                 },
 70 |                 "n_fft": {
 71 |                     "type": "integer"
 72 |                 },
 73 |                 "hop_length": {
 74 |                     "type": "integer"
 75 |                 },
 76 |                 "n_mels": {
 77 |                     "type": "integer"
 78 |                 },
 79 |                 "sr": {
 80 |                     "type": "integer"
 81 |                 },
 82 |                 "eval_interval": {
 83 |                     "type": "integer"
 84 |                 },
 85 |                 "train_T": {
 86 |                     "type": "integer"
 87 |                 },
 88 |                 "eval_T": {
 89 |                     "type": "integer"
 90 |                 },
 91 |                 "extra_monitor": {
 92 |                     "type": "array",
 93 |                     "items": {
 94 |                         "type": "string"
 95 |                     }
 96 |                 },
 97 |                 "minimize_var": {
 98 |                     "type": "boolean"
 99 |                 },
100 |                 "with_amp": {
101 |                     "type": "boolean"
102 |                 }
103 |             },
104 |             "required": [
105 |                 "save_dir",
106 |                 "log_dir",
107 |                 "eval_file",
108 |                 "cum_steps",
109 |                 "eval_interval",
110 |                 "n_fft",
111 |                 "hop_length",
112 |                 "n_mels",
113 |                 "sr",
114 |                 "train_T",
115 |                 "eval_T",
116 |                 "extra_monitor",
117 |                 "with_amp"
118 |             ]
119 |         }
120 |     },
121 |     "required": [
122 |         "name",
123 |         "arch",
124 |         "dataset",
125 |         "optimizer",
126 |         "lr_scheduler",
127 |         "data_loader",
128 |         "trainer"
129 |     ]
130 | }


--------------------------------------------------------------------------------
/utils/schema.py:
--------------------------------------------------------------------------------
1 | from typing import Mapping, Any
2 | import json
3 | import os
4 | 
# NOTE: `dir` shadows the builtin of the same name. The name is kept so that
# any existing import of it from this module keeps working.
dir = os.path.dirname(__file__)

# Load the JSON schema that sits next to this module once, at import time.
# The context manager closes the file handle as soon as parsing is done.
with open(os.path.join(dir, "schema.json")) as _schema_file:
    CONFIG_SCHEMA: Mapping[str, Any] = json.load(_schema_file)
8 | 


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn, Tensor
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | def remove_weight_norms(m):
 7 |     if hasattr(m, 'weight_g'):
 8 |         nn.utils.remove_weight_norm(m)
 9 | 
10 | 
def add_weight_norms(m):
    """Apply weight normalization to ``m`` when it has a usable ``weight``.

    The module is skipped when:

    * it has no ``weight`` attribute, or its ``weight`` is ``None`` (there is
      nothing to normalize, and ``weight_norm`` would fail on it);
    * it already exposes ``weight_g``, i.e. weight normalization is already
      applied, since registering a second hook raises ``RuntimeError``.

    Skipping already-normalized modules makes the function safe to call more
    than once on the same module. Always returns ``None``.
    """
    if getattr(m, 'weight', None) is None:
        return
    if hasattr(m, 'weight_g'):
        return
    nn.utils.weight_norm(m)
14 | 
15 | 
def get_instance(module, config, *args, **kwargs):
    """Build an object from a ``{'type': ..., 'args': {...}}`` config entry.

    ``config['type']`` names an attribute of ``module``; that attribute is
    called with ``*args``, then the keyword arguments in ``config['args']``,
    then ``**kwargs``. The result of the call is returned.
    """
    factory = getattr(module, config['type'])
    return factory(*args, **config['args'], **kwargs)
18 | 
19 | 
def gamma2snr(g: Tensor) -> Tensor:
    """Return ``exp(-g)`` element-wise."""
    return g.neg().exp()
22 | 
23 | 
def snr2as(snr: Tensor):
    """Return the pair ``(sqrt(snr / (snr + 1)), 1 / (snr + 1))``.

    Presumably these are the signal scale and the noise variance that
    correspond to ``snr``; confirm against the callers.
    """
    denom = snr + 1
    scale = (snr / denom).sqrt()
    inv_denom = torch.reciprocal(denom)
    return scale, inv_denom
27 | 
28 | 
def gamma2as(g: Tensor):
    """Return ``(sqrt(1 - sigmoid(g)), sigmoid(g))`` element-wise.

    The first element is computed as ``sqrt(sigmoid(-g))``, which equals
    ``sqrt(1 - sigmoid(g))`` mathematically. The subtraction form cancels to
    exactly 0 once ``sigmoid(g)`` rounds to 1 (large positive ``g`` in
    float32), which also yields an infinite gradient through ``sqrt``. The
    ``sigmoid(-g)`` form stays strictly positive there.
    """
    var = g.sigmoid()
    # 1 - sigmoid(g) == sigmoid(-g), evaluated without the cancellation.
    alpha = (-g).sigmoid().sqrt()
    return alpha, var
32 | 
33 | 
def gamma2logas(g: Tensor):
    """Return ``(0.5 * (-g + log_var), log_var)`` with ``log_var = -softplus(-g)``.

    ``-softplus(-g)`` equals ``log(sigmoid(g))``, so the second element is the
    logarithm of the second value returned by ``gamma2as``. The first element
    is the matching log-domain counterpart of its first value.
    """
    neg_g = -g
    log_var = -F.softplus(neg_g)
    log_alpha = 0.5 * (neg_g + log_var)
    return log_alpha, log_var
37 | 


--------------------------------------------------------------------------------