├── scripts ├── __init__.py ├── compress.py ├── utils.py ├── test.py ├── metrics.py ├── trainer_no_adv.py └── trainer_adv.py ├── esc ├── modules │ ├── loss │ │ ├── __init__.py │ │ ├── gan_loss.py │ │ └── generator_loss.py │ ├── vq │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── codebook.py │ ├── transformer │ │ ├── __init__.py │ │ ├── scale.py │ │ └── attention.py │ ├── __init__.py │ └── convolution │ │ └── layers.py ├── __init__.py └── models │ ├── __init__.py │ ├── utils.py │ ├── discriminator.py │ ├── csrvq.py │ ├── codecs.py │ └── base.py ├── baselines └── descript │ ├── dac │ ├── compare │ │ ├── __init__.py │ │ └── encodec.py │ ├── nn │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── quantize.py │ │ └── loss.py │ ├── model │ │ ├── __init__.py │ │ ├── discriminator.py │ │ ├── base.py │ │ └── dac.py │ ├── __init__.py │ ├── __main__.py │ └── utils │ │ ├── decode.py │ │ ├── encode.py │ │ └── __init__.py │ ├── README.md │ ├── conf │ ├── descript_6k_final.yml │ ├── 16khz_dns_9k.yml │ └── 16khz_dns_9k_tiny.yml │ └── scripts │ └── train_customize_no_adv.py ├── assets ├── results.png └── architecture.png ├── requirements.txt ├── configs ├── 9kbps_esc_base.yaml ├── ablations │ ├── 9kbps_csvq_conv.yaml │ ├── 9kbps_rvq_conv.yaml │ ├── 9kbps_csvq_swinT.yaml │ └── 9kbps_rvq_swinT.yaml ├── 9kbps_esc_large.yaml └── 9kbps_esc_base_adv.yaml ├── LICENSE ├── main.py ├── .gitignore ├── scripts_all.sh └── README.md /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /esc/modules/loss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /esc/modules/vq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /esc/modules/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/descript/dac/compare/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /esc/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import ESC, RVQCodecs -------------------------------------------------------------------------------- /assets/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzGuu830/efficient-speech-codec/HEAD/assets/results.png -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzGuu830/efficient-speech-codec/HEAD/assets/architecture.png -------------------------------------------------------------------------------- /baselines/descript/dac/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layers 2 | from . import loss 3 | from . 
import quantize 4 | -------------------------------------------------------------------------------- /esc/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .codecs import ESC, RVQCodecs, make_model 2 | from .discriminator import Discriminator -------------------------------------------------------------------------------- /baselines/descript/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CodecMixin 2 | from .base import DACFile 3 | from .dac import DAC 4 | from .discriminator import Discriminator 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.0 2 | torchaudio==2.0.0 3 | transformers==4.36.2 4 | accelerate==0.26.1 5 | timm==0.9.12 6 | einops==0.7.0 7 | pesq==0.0.4 8 | wandb>=0.16.2 9 | git+https://github.com/descriptinc/audiotools -------------------------------------------------------------------------------- /baselines/descript/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . import model 14 | from . import utils 15 | from .model import DAC 16 | from .model import DACFile 17 | -------------------------------------------------------------------------------- /esc/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer.attention import TransformerLayer 2 | from .transformer.scale import PatchEmbed, PatchDeEmbed 3 | 4 | from .vq.quantization import ProductVectorQuantize, ResidualVectorQuantize, ProductResidualVectorQuantize 5 | from .loss.generator_loss import MelSpectrogramLoss, ComplexSTFTLoss 6 | from .loss.gan_loss import GANLoss 7 | from .convolution.layers import ConvolutionLayer, Convolution2D -------------------------------------------------------------------------------- /esc/models/utils.py: -------------------------------------------------------------------------------- 1 | from ..modules import TransformerLayer, ConvolutionLayer, Convolution2D 2 | 3 | def blk_func(blk, feat, feat_shape): 4 | Wh, Ww = feat_shape 5 | if isinstance(blk, TransformerLayer): 6 | feat_next, Wh, Ww = blk(feat, Wh, Ww) 7 | elif isinstance(blk, ConvolutionLayer): 8 | feat_next = blk(feat) 9 | Wh, Ww = Wh//2, Ww 10 | elif isinstance(blk, Convolution2D): 11 | feat_next = blk(feat) 12 | 13 | return feat_next, (Wh, Ww) -------------------------------------------------------------------------------- /baselines/descript/README.md: -------------------------------------------------------------------------------- 1 | ## Descript's Audio Codec (DAC) Experimental Reproduction 2 | 3 | This folder is mostly borrowed from [Descript's Github Repository](https://github.com/descriptinc/descript-audio-codec). 4 | 5 | We adapt a few features for customized reproduction. For developmental setups, refer to the original repository. 6 | 7 | 8 | ## Reproduce DAC Baselines 9 | 10 | ```ruby 11 | torchrun --nproc_per_node gpu train_customize.py --config 16kHz_dns_9k.yml 12 | ``` 13 | This reproduces 16kHz (0.5kbps ~ 9.0kbps) DAC with adversarial setups. 
14 | 15 | ```ruby 16 | torchrun --nproc_per_node gpu train_customize_no_adv.py --config 16kHz_dns_9k_tiny.yml 17 | ``` 18 | This reproduces 16kHz (0.5kbps ~ 9.0kbps) DAC in non-adversarial setups. -------------------------------------------------------------------------------- /configs/9kbps_esc_base.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/train 3 | val_data_path: ../data/val 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 4 7 | 8 | model_name: csvq+swinT 9 | model: 10 | backbone: transformer 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | win_len: 20 16 | hop_len: 5 17 | sr: 16000 18 | patch_size: [3,2] 19 | swin_heads: [3,6,12,24,24] 20 | swin_depth: 2 21 | window_size: 4 22 | mlp_ratio: 4. 23 | overlap: 2 24 | group_size: 3 25 | codebook_size: 1024 26 | codebook_dims: [32,32,16,12,8,6] 27 | l2norm: True 28 | 29 | loss: 30 | stft_weight: 1.0 31 | cm_weight: .25 32 | cb_weight: 1.0 33 | mel_weight: .25 -------------------------------------------------------------------------------- /configs/ablations/9kbps_csvq_conv.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/dnscustom/processed_wav/train 3 | val_data_path: ../data/dnscustom/processed_wav/test 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 8 7 | 8 | model_name: csvq+conv 9 | model: 10 | backbone: convolution 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | kernel_size: [5,2] 16 | patch_size: [3,2] 17 | conv_depth: 1 18 | overlap: 2 19 | group_size: 3 20 | codebook_size: 1024 21 | codebook_dim: [8,8,8,8,8,8] 22 | l2norm: True 23 | win_len: 20 24 | hop_len: 5 25 | sr: 16000 26 | 27 | loss: 28 | stft_weight: 1.0 29 | cm_weight: .25 30 | cb_weight: 1.0 31 | mel_weight: .25 -------------------------------------------------------------------------------- /configs/ablations/9kbps_rvq_conv.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/dnscustom/processed_wav/train 3 | val_data_path: ../data/dnscustom/processed_wav/test 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 8 7 | 8 | model_name: rvq+conv 9 | model: 10 | backbone: convolution 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | kernel_size: [5,2] 16 | patch_size: [3,2] 17 | conv_depth: 1 18 | overlap: 2 19 | num_rvqs: 6 20 | group_size: 3 21 | codebook_size: 1024 22 | codebook_dim: [8,8,8,8,8,8] 23 | l2norm: True 24 | win_len: 20 25 | hop_len: 5 26 | sr: 16000 27 | 28 | loss: 29 | stft_weight: 1.0 30 | cm_weight: .25 31 | cb_weight: 1.0 32 | mel_weight: .25 -------------------------------------------------------------------------------- /configs/9kbps_esc_large.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/dnscustom/processed_wav/train 3 | val_data_path: ../data/dnscustom/processed_wav/test 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 6 7 | 8 | model_name: csvq+swinT 9 | model: 10 | backbone: transformer 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | patch_size: [3,2] 16 | swin_heads: [3,6,12,24,24] 17 | swin_depth: 4 18 | window_size: 4 19 | mlp_ratio: 4. 
20 | overlap: 2 21 | group_size: 3 22 | codebook_size: 1024 23 | codebook_dims: [8,8,8,8,8,8] 24 | l2norm: True 25 | win_len: 20 26 | hop_len: 5 27 | sr: 16000 28 | 29 | loss: 30 | stft_weight: 1.0 31 | cm_weight: .25 32 | cb_weight: 1.0 33 | mel_weight: .25 -------------------------------------------------------------------------------- /baselines/descript/dac/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import argbind 4 | 5 | from dac.utils import download 6 | from dac.utils.decode import decode 7 | from dac.utils.encode import encode 8 | 9 | STAGES = ["encode", "decode", "download"] 10 | 11 | 12 | def run(stage: str): 13 | """Run stages. 14 | 15 | Parameters 16 | ---------- 17 | stage : str 18 | Stage to run 19 | """ 20 | if stage not in STAGES: 21 | raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}") 22 | stage_fn = globals()[stage] 23 | 24 | if stage == "download": 25 | stage_fn() 26 | return 27 | 28 | stage_fn() 29 | 30 | 31 | if __name__ == "__main__": 32 | group = sys.argv.pop(1) 33 | args = argbind.parse_args(group=group) 34 | 35 | with argbind.scope(args): 36 | run(group) 37 | -------------------------------------------------------------------------------- /configs/ablations/9kbps_csvq_swinT.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/dnscustom/processed_wav/train 3 | val_data_path: ../data/dnscustom/processed_wav/test 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 6 7 | 8 | model_name: csvq+swinT 9 | model: 10 | backbone: transformer 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | patch_size: [3,2] 16 | swin_heads: [3,6,12,24,24] 17 | swin_depth: 2 18 | window_size: 4 19 | mlp_ratio: 4. 20 | overlap: 2 21 | group_size: 3 22 | codebook_size: 1024 23 | codebook_dims: [8,8,8,8,8,8] 24 | l2norm: True 25 | win_len: 20 26 | hop_len: 5 27 | sr: 16000 28 | 29 | loss: 30 | stft_weight: 1.0 31 | cm_weight: .25 32 | cb_weight: 1.0 33 | mel_weight: .25 -------------------------------------------------------------------------------- /configs/ablations/9kbps_rvq_swinT.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/dnscustom/processed_wav/train 3 | val_data_path: ../data/dnscustom/processed_wav/test 4 | num_workers: 36 5 | train_bs_per_device: 18 6 | val_bs_per_device: 6 7 | 8 | model_name: rvq+swinT 9 | model: 10 | backbone: transformer 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | patch_size: [3,2] 16 | swin_heads: [3,6,12,24,24] 17 | swin_depth: 2 18 | window_size: 4 19 | mlp_ratio: 4. 
20 | overlap: 2 21 | num_rvqs: 6 22 | group_size: 3 23 | codebook_size: 1024 24 | codebook_dim: 8 25 | l2norm: True 26 | win_len: 20 27 | hop_len: 5 28 | sr: 16000 29 | 30 | loss: 31 | stft_weight: 1.0 32 | cm_weight: .25 33 | cb_weight: 1.0 34 | mel_weight: .25 -------------------------------------------------------------------------------- /baselines/descript/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from einops import rearrange 6 | from torch.nn.utils import weight_norm 7 | 8 | 9 | def WNConv1d(*args, **kwargs): 10 | return weight_norm(nn.Conv1d(*args, **kwargs)) 11 | 12 | 13 | def WNConvTranspose1d(*args, **kwargs): 14 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 15 | 16 | 17 | # Scripting this brings model speed up 1.4x 18 | @torch.jit.script 19 | def snake(x, alpha): 20 | shape = x.shape 21 | x = x.reshape(shape[0], shape[1], -1) 22 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 23 | x = x.reshape(shape) 24 | return x 25 | 26 | 27 | class Snake1d(nn.Module): 28 | def __init__(self, channels): 29 | super().__init__() 30 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 31 | 32 | def forward(self, x): 33 | return snake(x, self.alpha) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2024] [Yuzhe Gu] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /configs/9kbps_esc_base_adv.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train_data_path: ../data/train 3 | val_data_path: ../data/val 4 | num_workers: 36 5 | train_bs_per_device: 9 6 | val_bs_per_device: 4 7 | 8 | model_name: csvq+swinT 9 | model: 10 | backbone: transformer 11 | in_dim: 2 12 | in_freq: 192 13 | h_dims: [45,72,96,144,192,384] 14 | max_streams: 6 15 | win_len: 20 16 | hop_len: 5 17 | sr: 16000 18 | patch_size: [3,2] 19 | swin_heads: [3,6,12,24,24] 20 | swin_depth: 2 21 | window_size: 4 22 | mlp_ratio: 4. 
23 | overlap: 2 24 | group_size: 3 25 | codebook_size: 1024 26 | codebook_dims: [8,8,8,8,8,8] 27 | l2norm: True 28 | 29 | discriminator: 30 | sample_rate: 16000 31 | rates: [] 32 | periods: [2, 3, 5, 7, 11] 33 | fft_sizes: [2048, 1024, 512] 34 | bands: 35 | - [0.0, 0.1] 36 | - [0.1, 0.25] 37 | - [0.25, 0.5] 38 | - [0.5, 0.75] 39 | - [0.75, 1.0] 40 | 41 | loss: 42 | stft_weight: 0.0 43 | cm_weight: .25 44 | cb_weight: 1.0 45 | mel_weight: 15.0 46 | gen_weight: 1.0 47 | feat_weight: 2.0 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from scripts.trainer_no_adv import main as train_no_adv 4 | from scripts.trainer_adv import main as train_adv 5 | from scripts.utils import read_yaml, dict2namespace 6 | 7 | def parse_args_config(): 8 | parser = argparse.ArgumentParser() 9 | 10 | # Experimental Setups 11 | parser.add_argument("--exp_name", default="esc9kbps", type=str) 12 | parser.add_argument("--wandb_project", default=None, type=str) 13 | parser.add_argument("--lr", default=1.e-4, type=float) 14 | parser.add_argument("--num_epochs", default=80, type=int) 15 | parser.add_argument("--num_pretraining_epochs", default=10, type=int) 16 | parser.add_argument("--num_devices", default=4, type=int) 17 | parser.add_argument("--num_warmup_steps", default=0, type=int) 18 | parser.add_argument("--val_metric", default="PESQ", type=str) 19 | parser.add_argument("--scheduler_type", default="constant", type=str) 20 | parser.add_argument("--dropout_rate", type=float, default=1.0) 21 | parser.add_argument("--adv_training", default=False, action="store_true") 22 | parser.add_argument("--pretrain_ckp", type=str, default=None) 23 | 24 | parser.add_argument("--log_steps", default=5, type=int) 25 | parser.add_argument("--save_path", default="./output", type=str) 26 | parser.add_argument("--config_path", default="./configs/9kbps_esc_base.yaml") 27 | parser.add_argument("--seed", default=1234, type=int) 28 | 29 | args = parser.parse_args() 30 | config = dict2namespace(read_yaml(args.config_path)) 31 | return args, config 32 | 33 | 34 | if __name__ == "__main__": 35 | args, config = parse_args_config() 36 | if args.adv_training: 37 | train_adv(args, config) 38 | else: 39 | train_no_adv(args, config) 40 | -------------------------------------------------------------------------------- /baselines/descript/dac/compare/encodec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from audiotools import AudioSignal 3 | from audiotools.ml import BaseModel 4 | from encodec import EncodecModel 5 | 6 | 7 | class Encodec(BaseModel): 8 | def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0): 9 | super().__init__() 10 | 11 | if sample_rate == 24000: 12 | self.model = EncodecModel.encodec_model_24khz() 13 | else: 14 | self.model = EncodecModel.encodec_model_48khz() 15 | self.model.set_target_bandwidth(bandwidth) 16 | self.sample_rate = 44100 17 | 18 | def forward( 19 | self, 20 | audio_data: torch.Tensor, 21 | sample_rate: int = 44100, 22 | n_quantizers: int = None, 23 | ): 24 | signal = AudioSignal(audio_data, sample_rate) 25 | signal.resample(self.model.sample_rate) 26 | recons = self.model(signal.audio_data) 27 | recons = AudioSignal(recons, self.model.sample_rate) 28 | recons.resample(sample_rate) 29 | return {"audio": recons.audio_data} 30 | 31 | 32 | if __name__ == "__main__": 33 | import numpy as np 34 | from 
functools import partial 35 | 36 | model = Encodec() 37 | 38 | for n, m in model.named_modules(): 39 | o = m.extra_repr() 40 | p = sum([np.prod(p.size()) for p in m.parameters()]) 41 | fn = lambda o, p: o + f" {p/1e6:<.3f}M params." 42 | setattr(m, "extra_repr", partial(fn, o=o, p=p)) 43 | print(model) 44 | print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) 45 | 46 | length = 88200 * 2 47 | x = torch.randn(1, 1, length).to(model.device) 48 | x.requires_grad_(True) 49 | x.retain_grad() 50 | 51 | # Make a forward pass 52 | out = model(x)["audio"] 53 | 54 | print(x.shape, out.shape) 55 | -------------------------------------------------------------------------------- /baselines/descript/conf/descript_6k_final.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC: 3 | sample_rate: 16000 4 | encoder_dim: 64 5 | encoder_rates: [2, 4, 5, 8] 6 | decoder_dim: 1536 7 | decoder_rates: [8, 5, 4, 2] 8 | 9 | # Quantization 10 | n_codebooks: 12 11 | codebook_size: 1024 12 | codebook_dim: 8 13 | quantizer_dropout: 0.5 14 | 15 | # Discriminator 16 | Discriminator: 17 | sample_rate: 16000 18 | rates: [] 19 | periods: [2, 3, 5, 7, 11] 20 | fft_sizes: [2048, 1024, 512] 21 | bands: 22 | - [0.0, 0.1] 23 | - [0.1, 0.25] 24 | - [0.25, 0.5] 25 | - [0.5, 0.75] 26 | - [0.75, 1.0] 27 | 28 | # Optimization 29 | AdamW: 30 | betas: [0.8, 0.99] 31 | lr: 0.0001 32 | ExponentialLR: 33 | gamma: 0.999996 34 | 35 | amp: false 36 | val_batch_size: 16 37 | batch_size: 12 38 | device: cuda 39 | num_iters: 400000 40 | save_iters: [10000, 50000, 100000, 200000] 41 | valid_freq: 4000 42 | sample_freq: 10000 43 | num_workers: 8 44 | log_every: 5 45 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 46 | seed: 53 47 | lambdas: 48 | mel/loss: 15.0 49 | adv/feat_loss: 2.0 50 | adv/gen_loss: 1.0 51 | vq/commitment_loss: 0.25 52 | vq/codebook_loss: 1.0 53 | 54 | # Transforms 55 | build_transform: 56 | preprocess: 57 | - Identity 58 | augment_prob: 0.0 59 | augment: 60 | - Identity 61 | postprocess: 62 | - VolumeNorm 63 | - RescaleAudio 64 | - ShiftPhase 65 | # - Identity 66 | 67 | # Loss setup 68 | MultiScaleSTFTLoss: 69 | window_lengths: [2048, 512] 70 | MelSpectrogramLoss: 71 | n_mels: [5, 10, 20, 40, 80, 160, 320] 72 | window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 73 | mel_fmin: [0, 0, 0, 0, 0, 0, 0] 74 | mel_fmax: [null, null, null, null, null, null, null] 75 | pow: 1.0 76 | clamp_eps: 1.0e-5 77 | mag_weight: 0.0 78 | 79 | save_path: /scratch/eys9/descript-audio-codec/runs/compare_study_dns/ 80 | wb_project_name: Neural_Speech_Coding 81 | wb_exp_name: DAC16k-Original -------------------------------------------------------------------------------- /baselines/descript/conf/16khz_dns_9k.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC: 3 | sample_rate: 16000 4 | encoder_dim: 64 5 | encoder_rates: [2, 4, 5, 8] 6 | decoder_dim: 1536 7 | decoder_rates: [8, 5, 4, 2] 8 | 9 | # Quantization 10 | n_codebooks: 18 11 | codebook_size: 1024 12 | codebook_dim: 8 13 | quantizer_dropout: 0.5 14 | 15 | # Discriminator 16 | Discriminator: 17 | sample_rate: 16000 18 | rates: [] 19 | periods: [2, 3, 5, 7, 11] 20 | fft_sizes: [2048, 1024, 512] 21 | bands: 22 | - [0.0, 0.1] 23 | - [0.1, 0.25] 24 | - [0.25, 0.5] 25 | - [0.5, 0.75] 26 | - [0.75, 1.0] 27 | 28 | # Optimization 29 | AdamW: 30 | betas: [0.8, 0.99] 31 | lr: 0.0001 32 | ExponentialLR: 33 | gamma: 0.999996 34 | 35 | amp: false 36 | val_batch_size: 24 37 | 
batch_size: 16 38 | device: cuda 39 | num_iters: 400000 40 | save_iters: [10000, 50000, 100000, 200000] 41 | valid_freq: 4000 42 | sample_freq: 10000 43 | num_workers: 32 44 | log_every: 5 45 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 46 | seed: 53 47 | lambdas: 48 | mel/loss: 15.0 49 | adv/feat_loss: 2.0 50 | adv/gen_loss: 1.0 51 | vq/commitment_loss: 0.25 52 | vq/codebook_loss: 1.0 53 | 54 | # Transforms 55 | build_transform: 56 | preprocess: 57 | - Identity 58 | augment_prob: 0.0 59 | augment: 60 | - Identity 61 | postprocess: 62 | - VolumeNorm 63 | - RescaleAudio 64 | - ShiftPhase 65 | 66 | # Loss setup 67 | MultiScaleSTFTLoss: 68 | window_lengths: [2048, 512] 69 | MelSpectrogramLoss: 70 | n_mels: [5, 10, 20, 40, 80, 160, 320] 71 | window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 72 | mel_fmin: [0, 0, 0, 0, 0, 0, 0] 73 | mel_fmax: [null, null, null, null, null, null, null] 74 | pow: 1.0 75 | clamp_eps: 1.0e-5 76 | mag_weight: 0.0 77 | 78 | 79 | data_path: ../DNS_CHALLENGE/processed_wav 80 | save_path: ../dac_output/DAC16kHz_9kbps_base/ 81 | wb_project_name: Neural_Speech_Coding 82 | wb_exp_name: DAC16kHz_9kbps_base -------------------------------------------------------------------------------- /baselines/descript/conf/16khz_dns_9k_tiny.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC: 3 | sample_rate: 16000 4 | encoder_dim: 32 5 | encoder_rates: [2, 4, 5, 8] 6 | decoder_dim: 288 7 | decoder_rates: [8, 5, 4, 2] 8 | 9 | # Quantization 10 | n_codebooks: 18 11 | codebook_size: 1024 12 | codebook_dim: 8 13 | quantizer_dropout: 0.5 14 | 15 | # Discriminator 16 | Discriminator: 17 | sample_rate: 16000 18 | rates: [] 19 | periods: [2, 3, 5, 7, 11] 20 | fft_sizes: [2048, 1024, 512] 21 | bands: 22 | - [0.0, 0.1] 23 | - [0.1, 0.25] 24 | - [0.25, 0.5] 25 | - [0.5, 0.75] 26 | - [0.75, 1.0] 27 | 28 | # Optimization 29 | AdamW: 30 | betas: [0.8, 0.99] 31 | lr: 0.0001 32 | ExponentialLR: 33 | gamma: 0.999996 34 | 35 | amp: false 36 | val_batch_size: 32 37 | batch_size: 16 38 | device: cuda 39 | num_iters: 400000 40 | save_iters: [10000, 50000, 100000, 200000] 41 | valid_freq: 4000 42 | sample_freq: 10000 43 | num_workers: 32 44 | log_every: 5 45 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 46 | seed: 53 47 | lambdas: 48 | mel/loss: 15.0 49 | adv/feat_loss: 2.0 50 | adv/gen_loss: 1.0 51 | vq/commitment_loss: 0.25 52 | vq/codebook_loss: 1.0 53 | 54 | # Transforms 55 | build_transform: 56 | preprocess: 57 | - Identity 58 | augment_prob: 0.0 59 | augment: 60 | - Identity 61 | postprocess: 62 | - VolumeNorm 63 | - RescaleAudio 64 | - ShiftPhase 65 | 66 | # Loss setup 67 | MultiScaleSTFTLoss: 68 | window_lengths: [2048, 512] 69 | MelSpectrogramLoss: 70 | n_mels: [5, 10, 20, 40, 80, 160, 320] 71 | window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 72 | mel_fmin: [0, 0, 0, 0, 0, 0, 0] 73 | mel_fmax: [null, null, null, null, null, null, null] 74 | pow: 1.0 75 | clamp_eps: 1.0e-5 76 | mag_weight: 0.0 77 | 78 | 79 | data_path: ../DNS_CHALLENGE/processed_wav 80 | save_path: ../dac_output/DAC16kHz_9kbps_tiny/ 81 | wb_project_name: Neural_Speech_Coding 82 | wb_exp_name: DAC16kHz_9kbps_tiny -------------------------------------------------------------------------------- /esc/modules/loss/gan_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class GANLoss(nn.Module): 6 | """ 7 | Computes a discriminator loss, given a discriminator on 8 | 
generated waveforms/spectrograms compared to ground truth 9 | waveforms/spectrograms. Computes the loss for both the 10 | discriminator and the generator in separate functions. 11 | 12 | Adapted from DAC https://github.com/descriptinc/descript-audio-codec/blob/main/ 13 | """ 14 | 15 | def __init__(self, discriminator): 16 | super().__init__() 17 | self.discriminator = discriminator 18 | 19 | def forward(self, fake, real): 20 | """ 21 | fake/real: audio tensor of shape [batchsize, channel, len] 22 | """ 23 | if fake.dim() == 2: fake = fake.unsqueeze(1) 24 | if real.dim() == 2: real = real.unsqueeze(1) 25 | 26 | d_fake = self.discriminator(**dict(x=fake)) 27 | d_real = self.discriminator(**dict(x=real)) 28 | return d_fake, d_real 29 | 30 | def discriminator_loss(self, fake, real): 31 | d_fake, d_real = self.forward(fake.clone().detach(), real) 32 | 33 | loss_d = 0 34 | for x_fake, x_real in zip(d_fake, d_real): 35 | loss_d += torch.mean(x_fake[-1] ** 2, dim=[1,2,3]) 36 | loss_d += torch.mean((1 - x_real[-1]) ** 2, dim=[1,2,3]) 37 | return loss_d 38 | 39 | def generator_loss(self, fake, real): 40 | d_fake, d_real = self.forward(fake, real) 41 | 42 | loss_g = 0 43 | for x_fake in d_fake: 44 | loss_g += torch.mean((1 - x_fake[-1]) ** 2, dim=[1,2,3]) 45 | 46 | loss_feature = 0 47 | 48 | for i in range(len(d_fake)): 49 | for j in range(len(d_fake[i]) - 1): 50 | loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach(), reduction="none").mean([1,2,3]) 51 | return loss_g, loss_feature -------------------------------------------------------------------------------- /scripts/compress.py: -------------------------------------------------------------------------------- 1 | from esc.models import make_model 2 | from .utils import read_yaml 3 | import torch, os, torchaudio, argparse, warnings 4 | warnings.filterwarnings("ignore") 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--input", type=str, required=True, help="input 16kHz mono audio file to encode") 9 | parser.add_argument("--save_path", type=str, default="./output", help="folder to save codes and reconstructed audio") 10 | 11 | parser.add_argument("--model_path", type=str, required=True, help="folder contains model configuration and checkpoint") 12 | parser.add_argument("--num_streams", type=int, default=6, help="number of transmitted streams in encoding") 13 | 14 | parser.add_argument("--device", type=str, default="cpu") 15 | return parser.parse_args() 16 | 17 | def main(args): 18 | 19 | x, sr = torchaudio.load(f"{args.input}") 20 | x = x.to(args.device) 21 | 22 | model = make_model(read_yaml(f"{args.model_path}/config.yaml")['model']) 23 | model.load_state_dict( 24 | torch.load(f"{args.model_path}/model.pth", map_location="cpu")["model_state_dict"], 25 | ) 26 | model = model.to(args.device) 27 | 28 | codes, size = model.encode(x, num_streams=args.num_streams) 29 | recon_x = model.decode(codes, size) 30 | 31 | fname = args.input.split("/")[-1] 32 | if not os.path.exists(args.save_path): 33 | os.makedirs(args.save_path) 34 | torchaudio.save(f"{args.save_path}/decoded_{args.num_streams*1.5}kbps_{fname}", recon_x, sr) 35 | torch.save(codes, f"{args.save_path}/encoded_{args.num_streams*1.5}kbps_{fname.split('.')[0]}.pth") 36 | print(f"compression outputs saved into {args.save_path}") 37 | 38 | if __name__ == "__main__": 39 | args = parse_args() 40 | main(args) 41 | 42 | """ 43 | python -m scripts.compress \ 44 | --input ./audio.wav \ 45 | --save_path ./output \ 46 | --model_path ./esc9kbps \ 47 | 
--num_streams 6 \ 48 | --device cpu 49 | 50 | """ -------------------------------------------------------------------------------- /esc/modules/convolution/layers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class Convolution2D(nn.Module): 4 | """2D Convolution Layer""" 5 | def __init__(self, 6 | in_channels, 7 | out_channels, 8 | kernel_size=(5,2), 9 | scale=True, 10 | transpose=False): 11 | super().__init__() 12 | 13 | stride = (2,1) if scale else (1,1) 14 | conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=(2,1)) if not transpose \ 15 | else nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride, padding=(1,0)) 16 | self.conv = conv 17 | self.transpose, self.scale = transpose, scale 18 | 19 | def forward(self, x): 20 | F, T = x.size(-2), x.size(-1) 21 | y = self.conv(x) 22 | 23 | if self.scale: 24 | y = y[..., :F*2, :T] if self.transpose else y[..., :F//2, :T] 25 | else: 26 | y = y[..., :F, :T] 27 | 28 | return y 29 | 30 | class ResidualUnit(nn.Module): 31 | def __init__(self, dim: int) -> None: 32 | super().__init__() 33 | 34 | self.block = nn.Sequential(*[ 35 | Convolution2D(dim, dim, kernel_size=(5,2), scale=False), 36 | nn.BatchNorm2d(dim), 37 | nn.PReLU(), 38 | Convolution2D(dim, dim, kernel_size=(5,2), scale=False), 39 | nn.BatchNorm2d(dim), 40 | nn.PReLU(), 41 | ]) 42 | 43 | def forward(self, x): 44 | y = self.block(x) 45 | 46 | return x + y 47 | 48 | 49 | class ConvolutionLayer(nn.Module): 50 | def __init__(self, in_dim, out_dim, depth=1, 51 | kernel_size=(5,2), transpose=False) -> None: 52 | super().__init__() 53 | 54 | blocks = [ResidualUnit(in_dim) for _ in range(depth)] 55 | blocks += [Convolution2D(in_dim, out_dim, kernel_size, scale=True, transpose=transpose), 56 | nn.BatchNorm2d(out_dim), 57 | nn.PReLU(),] 58 | 59 | self.blocks = nn.Sequential(*blocks) 60 | 61 | def forward(self, x): 62 | 63 | y = self.blocks(x) 64 | return y -------------------------------------------------------------------------------- /esc/modules/vq/initialize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import warnings 4 | 5 | 6 | @torch.no_grad() 7 | def codebook_init_forward_hook_pvq(self, input, output): 8 | """ initializes codebook from data """ 9 | 10 | if (not self.training) or (self.codebook_initialized.item() == 1): 11 | return # no initialization during inference 12 | 13 | if self.verbose_init is True: 14 | if self.kmeans_init is None: 15 | print("Initializing Product VQs with KaimingNormal") 16 | elif self.kmeans_init is True: 17 | print('Initializing Product VQs with k-means++') 18 | elif self.kmeans_init is False: 19 | print('Initializing Product VQs by randomly choosing from z_e') 20 | 21 | outputs, _ = output 22 | _, z_e_downs, _ = outputs 23 | # z_e_downs [B, group_size, T, codebook_dim] 24 | for i in range(self.num_vqs): 25 | if self.kmeans_init is not None: 26 | z_e_i = z_e_downs[:,i] # [B, T, codebook_dim] 27 | init_codebook = sample_centroids(z_e_i, self.codebook_size, self.kmeans_init) 28 | self.vqs[i].embedding.weight.data = init_codebook 29 | else: 30 | nn.init.kaiming_normal_(self.vqs[i].embedding.weight) 31 | 32 | self.codebook_initialized.fill_(1) # set boolean flag 33 | return 34 | 35 | @torch.no_grad() 36 | def sample_centroids(z_e, codebook_size, use_kmeans=False): 37 | """ create an initialize codebook one-time from z_e 38 | Args: 39 | z_e: encoded embedding Tensor of size 
[bs,T,d] 40 | codebook_size: number of codewords 41 | 42 | returns: 43 | new_codebook: Tensor of size [codebook_size, d] 44 | """ 45 | 46 | z_e = z_e.reshape(-1, z_e.size(-1)) # bs*T, d 47 | if codebook_size >= z_e.size(0): 48 | e_msg = f'\ncodebook size > warmup samples: {codebook_size} vs {z_e.size(0)}. ' + \ 49 | 'recommended to decrease the codebook size or increase batch size.' 50 | warnings.warn(e_msg) 51 | # repeat until it fits and add noise 52 | repeat = 1 + codebook_size // z_e.shape[0] 53 | new_codes = z_e.data.tile([repeat, 1])[:codebook_size] 54 | new_codes += 1e-3 * torch.randn_like(new_codes.data) 55 | else: 56 | # you have more warmup samples than codebook. subsample data 57 | if use_kmeans: 58 | from torchpq.clustering import KMeans 59 | kmeans = KMeans(n_clusters=codebook_size, distance='euclidean', init_mode="kmeans++") 60 | kmeans.fit(z_e.data.T.contiguous()) 61 | new_codes = kmeans.centroids.T 62 | else: 63 | indices = torch.randint(low=0, high=codebook_size, size=(codebook_size,)) 64 | indices = indices.to(z_e.device) 65 | new_codes = torch.index_select(z_e, 0, indices).to(z_e.device).data 66 | 67 | return new_codes -------------------------------------------------------------------------------- /esc/modules/loss/generator_loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | import torchaudio.transforms as T 5 | import torch 6 | 7 | MEL_WINDOWS = [32,64,128,256,512,1024,2048] 8 | MEL_BINS = [5,10,20,40,80,160,320] 9 | SR = 16000 10 | POWER = 0.3 11 | 12 | class ComplexSTFTLoss(nn.Module): 13 | """L2 Loss on Complex STFTs (Power Law Compressed https://arxiv.org/pdf/1811.07030)""" 14 | def __init__(self, weight=1.0, power_law=True): 15 | super().__init__() 16 | self.power_law = power_law 17 | self.weight = weight 18 | 19 | def forward(self, raw_feat, recon_feat): 20 | """ 21 | Args: 22 | raw_feat/recon_feat: (B,2,F,T) 23 | returns: (B,) 24 | """ 25 | if self.power_law: 26 | raw_feat = power_law(raw_feat, power=POWER) 27 | recon_feat = power_law(recon_feat, power=POWER) 28 | 29 | return self.weight * F.mse_loss(raw_feat,recon_feat,reduction="none").mean([1,2,3]) 30 | 31 | def power_law(stft, power=POWER, eps=1e-10): 32 | mask = torch.sign(stft) 33 | power_law_compressed = (torch.abs(stft) + eps) ** power 34 | power_law_compressed = power_law_compressed * mask 35 | return power_law_compressed 36 | 37 | class MelSpectrogramLoss(nn.Module): 38 | """ 39 | L1 MelSpectrogram Loss 40 | Implementation adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py 41 | """ 42 | def __init__(self, weight=1.0, 43 | win_lengths=MEL_WINDOWS, n_mels=MEL_BINS, clamp_eps=1e-5,): 44 | super().__init__() 45 | 46 | self.n_mels = n_mels 47 | self.mel_transf = nn.ModuleList( [ 48 | T.MelSpectrogram( 49 | sample_rate=SR, n_fft=w, win_length=w, 50 | hop_length=w//4, n_mels=n_mels[i], power=1) 51 | for i, w in enumerate(win_lengths) 52 | ] ) 53 | self.clamp_eps = clamp_eps 54 | self.weight = weight 55 | 56 | def forward(self, raw_audio, recon_audio): 57 | """ 58 | Args: 59 | raw_audio/recon_audio: (B,L) 60 | returns: (B,) 61 | """ 62 | mel_loss = 0.0 63 | for mel_trans in self.mel_transf: 64 | x_mels, y_mels = mel_trans(raw_audio), mel_trans(recon_audio) 65 | 66 | # magnitude loss 67 | mel_loss += F.l1_loss(x_mels, y_mels, reduction="none").mean([1,2]) 68 | # log magnitude loss 69 | mel_loss += F.l1_loss( 70 | x_mels.clamp(self.clamp_eps).pow(2).log10(), 71 | 
y_mels.clamp(self.clamp_eps).pow(2).log10(), 72 | reduction="none" 73 | ).mean([1,2]) 74 | 75 | return self.weight * mel_loss -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | celerybeat.pid 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # Environments 97 | .env 98 | .venv 99 | env/ 100 | venv/ 101 | ENV/ 102 | env.bak/ 103 | venv.bak/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | .dmypy.json 118 | dmypy.json 119 | 120 | # Pyre type checker 121 | .pyre/ 122 | 123 | # pytype static type analyzer 124 | .pytype/ 125 | 126 | # profiling data 127 | .prof 128 | 129 | # Editors and IDEs 130 | # See https://help.github.com/articles/ignoring-files for more about ignoring files. 
131 | # Visual Studio Code 132 | .vscode/ 133 | # Intellij 134 | .idea/ 135 | # Sublime Text 136 | *.sublime-workspace 137 | 138 | # Windows image file caches 139 | Thumbs.db 140 | ehthumbs.db 141 | 142 | # Folder config file 143 | Desktop.ini 144 | 145 | # Recycle Bin used on file shares 146 | $RECYCLE.BIN/ 147 | 148 | # macOS files 149 | .DS_Store 150 | .AppleDouble 151 | .LSOverride 152 | 153 | # Icon must end with two \r 154 | Icon 155 | 156 | # Thumbnails 157 | ._* 158 | 159 | # Files that might appear in the root of a volume 160 | .DocumentRevisions-V100 161 | .fseventsd 162 | .Spotlight-V100 163 | .TemporaryItems 164 | .Trashes 165 | .VolumeIcon.icns 166 | .com.apple.timemachine.dontbackup 167 | .PKInstallSandboxManager 168 | .PKInstallSandboxManager-SystemSoftware 169 | 170 | 171 | # old dev repo 172 | dev-deep-audio-signal-coding/ -------------------------------------------------------------------------------- /baselines/descript/dac/utils/decode.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | 4 | import argbind 5 | import numpy as np 6 | import torch 7 | from audiotools import AudioSignal 8 | from tqdm import tqdm 9 | 10 | from dac import DACFile 11 | from dac.utils import load_model 12 | 13 | warnings.filterwarnings("ignore", category=UserWarning) 14 | 15 | 16 | @argbind.bind(group="decode", positional=True, without_prefix=True) 17 | @torch.inference_mode() 18 | @torch.no_grad() 19 | def decode( 20 | input: str, 21 | output: str = "", 22 | weights_path: str = "", 23 | model_tag: str = "latest", 24 | model_bitrate: str = "8kbps", 25 | device: str = "cuda", 26 | model_type: str = "44khz", 27 | verbose: bool = False, 28 | ): 29 | """Decode audio from codes. 30 | 31 | Parameters 32 | ---------- 33 | input : str 34 | Path to input directory or file 35 | output : str, optional 36 | Path to output directory, by default "". 37 | If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`. 38 | weights_path : str, optional 39 | Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the 40 | model_tag and model_type. 41 | model_tag : str, optional 42 | Tag of the model to use, by default "latest". Ignored if `weights_path` is specified. 43 | model_bitrate: str 44 | Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps". 45 | device : str, optional 46 | Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU. 47 | model_type : str, optional 48 | The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified. 
49 | """ 50 | generator = load_model( 51 | model_type=model_type, 52 | model_bitrate=model_bitrate, 53 | tag=model_tag, 54 | load_path=weights_path, 55 | ) 56 | generator.to(device) 57 | generator.eval() 58 | 59 | # Find all .dac files in input directory 60 | _input = Path(input) 61 | input_files = list(_input.glob("**/*.dac")) 62 | 63 | # If input is a .dac file, add it to the list 64 | if _input.suffix == ".dac": 65 | input_files.append(_input) 66 | 67 | # Create output directory 68 | output = Path(output) 69 | output.mkdir(parents=True, exist_ok=True) 70 | 71 | for i in tqdm(range(len(input_files)), desc=f"Decoding files"): 72 | # Load file 73 | artifact = DACFile.load(input_files[i]) 74 | 75 | # Reconstruct audio from codes 76 | recons = generator.decompress(artifact, verbose=verbose) 77 | 78 | # Compute output path 79 | relative_path = input_files[i].relative_to(input) 80 | output_dir = output / relative_path.parent 81 | if not relative_path.name: 82 | output_dir = output 83 | relative_path = input_files[i] 84 | output_name = relative_path.with_suffix(".wav").name 85 | output_path = output_dir / output_name 86 | output_path.parent.mkdir(parents=True, exist_ok=True) 87 | 88 | # Write to file 89 | recons.write(output_path) 90 | 91 | 92 | if __name__ == "__main__": 93 | args = argbind.parse_args() 94 | with argbind.scope(args): 95 | decode() 96 | -------------------------------------------------------------------------------- /baselines/descript/dac/utils/encode.py: -------------------------------------------------------------------------------- 1 | import math 2 | import warnings 3 | from pathlib import Path 4 | 5 | import argbind 6 | import numpy as np 7 | import torch 8 | from audiotools import AudioSignal 9 | from audiotools.core import util 10 | from tqdm import tqdm 11 | 12 | from dac.utils import load_model 13 | 14 | warnings.filterwarnings("ignore", category=UserWarning) 15 | 16 | 17 | @argbind.bind(group="encode", positional=True, without_prefix=True) 18 | @torch.inference_mode() 19 | @torch.no_grad() 20 | def encode( 21 | input: str, 22 | output: str = "", 23 | weights_path: str = "", 24 | model_tag: str = "latest", 25 | model_bitrate: str = "8kbps", 26 | n_quantizers: int = None, 27 | device: str = "cuda", 28 | model_type: str = "44khz", 29 | win_duration: float = 5.0, 30 | verbose: bool = False, 31 | ): 32 | """Encode audio files in input path to .dac format. 33 | 34 | Parameters 35 | ---------- 36 | input : str 37 | Path to input audio file or directory 38 | output : str, optional 39 | Path to output directory, by default "". If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`. 40 | weights_path : str, optional 41 | Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the 42 | model_tag and model_type. 43 | model_tag : str, optional 44 | Tag of the model to use, by default "latest". Ignored if `weights_path` is specified. 45 | model_bitrate: str 46 | Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps". 47 | n_quantizers : int, optional 48 | Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate. 49 | device : str, optional 50 | Device to use, by default "cuda" 51 | model_type : str, optional 52 | The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified. 
53 | """ 54 | generator = load_model( 55 | model_type=model_type, 56 | model_bitrate=model_bitrate, 57 | tag=model_tag, 58 | load_path=weights_path, 59 | ) 60 | generator.to(device) 61 | generator.eval() 62 | kwargs = {"n_quantizers": n_quantizers} 63 | 64 | # Find all audio files in input path 65 | input = Path(input) 66 | audio_files = util.find_audio(input) 67 | 68 | output = Path(output) 69 | output.mkdir(parents=True, exist_ok=True) 70 | 71 | for i in tqdm(range(len(audio_files)), desc="Encoding files"): 72 | # Load file 73 | signal = AudioSignal(audio_files[i]) 74 | 75 | # Encode audio to .dac format 76 | artifact = generator.compress(signal, win_duration, verbose=verbose, **kwargs) 77 | 78 | # Compute output path 79 | relative_path = audio_files[i].relative_to(input) 80 | output_dir = output / relative_path.parent 81 | if not relative_path.name: 82 | output_dir = output 83 | relative_path = audio_files[i] 84 | output_name = relative_path.with_suffix(".dac").name 85 | output_path = output_dir / output_name 86 | output_path.parent.mkdir(parents=True, exist_ok=True) 87 | 88 | artifact.save(output_path) 89 | 90 | 91 | if __name__ == "__main__": 92 | args = argbind.parse_args() 93 | with argbind.scope(args): 94 | encode() 95 | -------------------------------------------------------------------------------- /scripts_all.sh: -------------------------------------------------------------------------------- 1 | ## Training Final Models 2 | accelerate launch main.py \ 3 | --exp_name esc-base-non-adv \ 4 | --config_path ./configs/9kbps_esc_base.yaml \ 5 | --wandb_project efficient-speech-codec \ 6 | --lr 1.0e-4 \ 7 | --num_epochs 80 \ 8 | --num_pretraining_epochs 15 \ 9 | --num_devices 4 \ 10 | --dropout_rate 0.75 \ 11 | --save_path ../output \ 12 | --seed 53 13 | 14 | accelerate launch main.py \ 15 | --exp_name esc-base-adv \ 16 | --adv_training \ 17 | --config_path ./configs/9kbps_esc_base_adv.yaml \ 18 | --wandb_project efficient-speech-codec \ 19 | --lr 1.0e-4 \ 20 | --num_epochs 80 \ 21 | --num_pretraining_epochs 15 \ 22 | --num_devices 4 \ 23 | --dropout_rate 0.75 \ 24 | --save_path ../output \ 25 | --seed 53 26 | 27 | # accelerate launch main.py \ 28 | # --exp_name esc-base-post-adv \ 29 | # --adv_training \ 30 | # --pretrain_ckp ../esc9kbps_base_non_adversarial/model.pth \ 31 | # --config_path ./configs/9kbps_esc_base_adv.yaml \ 32 | # --wandb_project efficient-speech-codec \ 33 | # --lr 1.0e-4 \ 34 | # --num_epochs 20 \ 35 | # --num_pretraining_epochs 0 \ 36 | # --num_devices 4 \ 37 | # --dropout_rate 0.75 \ 38 | # --save_path ../output \ 39 | # --seed 53 40 | 41 | accelerate launch main.py \ 42 | --exp_name esc-large-non-adv \ 43 | --config_path ./configs/9kbps_esc_large.yaml \ 44 | --wandb_project efficient-speech-codec \ 45 | --lr 1.0e-4 \ 46 | --num_epochs 80 \ 47 | --num_pretraining_epochs 15 \ 48 | --num_devices 4 \ 49 | --dropout_rate 0.75 \ 50 | --save_path ../output \ 51 | --seed 53 52 | 53 | 54 | ## Method Ablations 55 | accelerate launch main.py \ 56 | --exp_name csvq+swinT \ 57 | --config_path ./configs/ablations/9kbps_csvq_swinT.yaml \ 58 | --wandb_project efficient-speech-codec \ 59 | --lr 1.0e-4 \ 60 | --num_epochs 50 \ 61 | --num_pretraining_epochs 5 \ 62 | --num_devices 4 \ 63 | --dropout_rate 0.75 \ 64 | --save_path ../output \ 65 | --seed 53 66 | 67 | accelerate launch main.py \ 68 | --exp_name csvq+conv_9kbps \ 69 | --config_path ./configs/ablations/9kbps_csvq_conv.yaml \ 70 | --wandb_project efficient-speech-codec \ 71 | --lr 1.0e-4 \ 72 | --num_epochs 50 \ 73 | 
--num_pretraining_epochs 5 \ 74 | --num_devices 4 \ 75 | --dropout_rate 0.75 \ 76 | --save_path ../output \ 77 | --seed 53 78 | 79 | accelerate launch main.py \ 80 | --exp_name rvq+swinT \ 81 | --config_path ./configs/ablations/9kbps_rvq_swinT.yaml \ 82 | --wandb_project efficient-speech-codec \ 83 | --lr 1.0e-4 \ 84 | --num_epochs 50 \ 85 | --num_pretraining_epochs 5 \ 86 | --num_devices 2 \ 87 | --dropout_rate 0.75 \ 88 | --save_path ../output \ 89 | --seed 53 90 | 91 | accelerate launch main.py \ 92 | --exp_name rvq+conv \ 93 | --config_path ./configs/ablations/9kbps_rvq_conv.yaml \ 94 | --wandb_project efficient-speech-codec \ 95 | --lr 1.0e-4 \ 96 | --num_epochs 50 \ 97 | --num_pretraining_epochs 5 \ 98 | --num_devices 4 \ 99 | --dropout_rate 0.75 \ 100 | --save_path ../output \ 101 | --seed 53 102 | 103 | accelerate launch main.py \ 104 | --exp_name csvq+swinT_w/o_pretraining \ 105 | --config_path ./configs/ablations/9kbps_csvq_swinT.yaml \ 106 | --wandb_project efficient-speech-codec \ 107 | --lr 1.0e-4 \ 108 | --num_epochs 50 \ 109 | --num_pretraining_epochs 0 \ 110 | --num_devices 2 \ 111 | --dropout_rate 0.75 \ 112 | --save_path ../output \ 113 | --seed 53 -------------------------------------------------------------------------------- /esc/modules/vq/codebook.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from einops import rearrange 4 | 5 | class Codebook(nn.Module): 6 | def __init__(self, 7 | embedding_dim: int=256, 8 | num_embeddings: int=1024, 9 | l2norm: bool=False, 10 | ): 11 | super().__init__() 12 | 13 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 14 | nn.init.kaiming_normal_(self.embedding.weight) 15 | 16 | self.embedding_dim = embedding_dim 17 | self.num_embeddings = num_embeddings 18 | self.l2norm = l2norm 19 | 20 | def quantize_to_code(self, z_e): 21 | """ Quantize input vector to codebook indices. 22 | Args: 23 | z_e (Tensor): input vector with shape (bs, *, embedding_dim) 24 | Returns: 25 | Tensor of indices with shape (bs, *) 26 | """ 27 | 28 | codebook = self.embedding.weight # [num_embeddings, embedding_dim] 29 | z_flat = rearrange(z_e, "b t d -> (b t) d") # [*, embedding_dim] 30 | 31 | if self.l2norm: 32 | codebook = F.normalize(codebook, dim=-1) 33 | z_flat = F.normalize(z_flat, dim=-1) 34 | 35 | dist = ( 36 | z_flat.pow(2).sum(1, keepdim=True) 37 | - 2 * z_flat @ codebook.t() 38 | + codebook.pow(2).sum(1, keepdim=True).t() 39 | ) 40 | indices = dist.min(1).indices 41 | indices = rearrange(indices, "(b t) -> b t", b=z_e.size(0)) 42 | 43 | return indices 44 | 45 | def dequantize_code(self, code): 46 | """ De-quantize code indices to vectors 47 | Args: 48 | code (Tensor): code with shape (bs, *) 49 | Returns: 50 | Tensor of quantized vector with shape (bs, *, embedding_dim) 51 | """ 52 | codebook = self.embedding.weight 53 | z_q = F.embedding(code, codebook) 54 | 55 | return z_q 56 | 57 | def forward(self, z_e): 58 | """ Vector Quantization Forward Function. 
59 | Args: 60 | z_e (Tensor): input vector with shape (bs, T, embedding_dim) 61 | z_q (Tensor): quantized vector with shape (bs, T, embedding_dim) 62 | """ 63 | 64 | code = self.quantize_to_code(z_e) 65 | z_q = self.dequantize_code(code) 66 | 67 | if self.training: # Straight-Through Estimator 68 | commitment_loss = F.mse_loss(z_q.detach(), z_e, reduction="none").mean([1,2]) 69 | codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1,2]) 70 | z_q = z_e + (z_q - z_e).detach() 71 | else: 72 | commitment_loss = F.mse_loss(z_q, z_e, reduction="none").mean([1,2]) 73 | codebook_loss = commitment_loss 74 | 75 | return z_q, code, codebook_loss, commitment_loss 76 | 77 | def encode(self, z_e): 78 | code = self.quantize_to_code(z_e) 79 | return code 80 | 81 | def decode(self, code): 82 | z_q = self.dequantize_code(code) 83 | return z_q 84 | 85 | def count_posterior(code, codebook_size): 86 | """ Compute the posterior codebook distribution P(q|e) on a total batch of encoded features 87 | Args: 88 | code: quantized discrete code of size [B, T] 89 | codebook_size: total number of entries 90 | returns: posterior distribution with size [B, codebook_size] 91 | """ 92 | one_hot = F.one_hot(code, num_classes=codebook_size) # B T codebook_size 93 | counts = one_hot.sum(dim=1) # B codebook_size 94 | posterior = counts / code.size(1) 95 | 96 | return posterior 97 | -------------------------------------------------------------------------------- /baselines/descript/dac/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import argbind 4 | from audiotools import ml 5 | 6 | import dac 7 | 8 | DAC = dac.model.DAC 9 | Accelerator = ml.Accelerator 10 | 11 | __MODEL_LATEST_TAGS__ = { 12 | ("44khz", "8kbps"): "0.0.1", 13 | ("24khz", "8kbps"): "0.0.4", 14 | ("16khz", "8kbps"): "0.0.5", 15 | ("44khz", "16kbps"): "1.0.0", 16 | } 17 | 18 | __MODEL_URLS__ = { 19 | ( 20 | "44khz", 21 | "0.0.1", 22 | "8kbps", 23 | ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.1/weights.pth", 24 | ( 25 | "24khz", 26 | "0.0.4", 27 | "8kbps", 28 | ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.4/weights_24khz.pth", 29 | ( 30 | "16khz", 31 | "0.0.5", 32 | "8kbps", 33 | ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.5/weights_16khz.pth", 34 | ( 35 | "44khz", 36 | "1.0.0", 37 | "16kbps", 38 | ): "https://github.com/descriptinc/descript-audio-codec/releases/download/1.0.0/weights_44khz_16kbps.pth", 39 | } 40 | 41 | 42 | @argbind.bind(group="download", positional=True, without_prefix=True) 43 | def download( 44 | model_type: str = "44khz", model_bitrate: str = "8kbps", tag: str = "latest" 45 | ): 46 | """ 47 | Function that downloads the weights file from URL if a local cache is not found. 48 | 49 | Parameters 50 | ---------- 51 | model_type : str 52 | The type of model to download. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". 53 | model_bitrate: str 54 | Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps". 55 | Only 44khz model supports 16kbps. 56 | tag : str 57 | The tag of the model to download. Defaults to "latest". 58 | 59 | Returns 60 | ------- 61 | Path 62 | Directory path required to load model via audiotools. 
63 | """ 64 | model_type = model_type.lower() 65 | tag = tag.lower() 66 | 67 | assert model_type in [ 68 | "44khz", 69 | "24khz", 70 | "16khz", 71 | ], "model_type must be one of '44khz', '24khz', or '16khz'" 72 | 73 | assert model_bitrate in [ 74 | "8kbps", 75 | "16kbps", 76 | ], "model_bitrate must be one of '8kbps', or '16kbps'" 77 | 78 | if tag == "latest": 79 | tag = __MODEL_LATEST_TAGS__[(model_type, model_bitrate)] 80 | 81 | download_link = __MODEL_URLS__.get((model_type, tag, model_bitrate), None) 82 | 83 | if download_link is None: 84 | raise ValueError( 85 | f"Could not find model with tag {tag} and model type {model_type}" 86 | ) 87 | 88 | local_path = ( 89 | Path.home() 90 | / ".cache" 91 | / "descript" 92 | / "dac" 93 | / f"weights_{model_type}_{model_bitrate}_{tag}.pth" 94 | ) 95 | if not local_path.exists(): 96 | local_path.parent.mkdir(parents=True, exist_ok=True) 97 | 98 | # Download the model 99 | import requests 100 | 101 | response = requests.get(download_link) 102 | 103 | if response.status_code != 200: 104 | raise ValueError( 105 | f"Could not download model. Received response code {response.status_code}" 106 | ) 107 | local_path.write_bytes(response.content) 108 | 109 | return local_path 110 | 111 | 112 | def load_model( 113 | model_type: str = "44khz", 114 | model_bitrate: str = "8kbps", 115 | tag: str = "latest", 116 | load_path: str = None, 117 | ): 118 | if not load_path: 119 | load_path = download( 120 | model_type=model_type, model_bitrate=model_bitrate, tag=tag 121 | ) 122 | generator = DAC.load(load_path) 123 | return generator 124 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | import torch, torchaudio 2 | import transformers 3 | import numpy as np 4 | import argparse, yaml, glob 5 | from huggingface_hub import hf_hub_download 6 | from torch.utils.data import Dataset, DataLoader, default_collate 7 | 8 | from esc.modules import ComplexSTFTLoss, MelSpectrogramLoss 9 | 10 | 11 | def quantization_dropout(dropout_rate: float, max_streams: int): 12 | """ 13 | Args: 14 | dropout_rate: probability that applies quantization dropout 15 | max_streams: maximum number of streams codec can take 16 | returns: sampled number of streams for current batch 17 | """ 18 | assert dropout_rate >=0 and dropout_rate <=1, "dropout_rate must be within [0, 1]" 19 | # Do Random Sample N w prob dropout_rate 20 | do_sample = np.random.choice([0, 1], p=[1-dropout_rate, dropout_rate]) 21 | if do_sample: 22 | streams = np.random.randint(1, max_streams+1) 23 | else: 24 | streams = max_streams 25 | return streams 26 | 27 | class EvalSet(Dataset): 28 | def __init__(self, eval_folder_path) -> None: 29 | super().__init__() 30 | self.testset_files = glob.glob(f"{eval_folder_path}/*.wav") 31 | if not self.testset_files: 32 | self.testset_files = glob.glob(f"{eval_folder_path}/*/*.wav") 33 | self.testset_files = self.testset_files[:180000] 34 | 35 | def __len__(self): 36 | return len(self.testset_files) 37 | 38 | def __getitem__(self, i): 39 | x, _ = torchaudio.load(self.testset_files[i]) 40 | return x[0, :-80] 41 | 42 | def make_dataloader(data_path, batch_size, shuffle, num_workers=0): 43 | ds = EvalSet(data_path) 44 | dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle, 45 | collate_fn=default_collate, num_workers=num_workers) 46 | return dl 47 | 48 | def make_optimizer(params, lr): 49 | return torch.optim.AdamW(params, lr) 50 | 51 | GAMMAR = 0.999996 
52 | def make_scheduler(optimizer, scheduler_type, total_steps=250000, warmup_steps=0): 53 | if scheduler_type == "constant": 54 | scheduler = transformers.get_constant_schedule(optimizer) 55 | elif scheduler_type == "constant_warmup": 56 | scheduler = transformers.get_constant_schedule_with_warmup( 57 | optimizer, num_warmup_steps=warmup_steps) 58 | elif scheduler_type == "cosine_warmup": 59 | scheduler = transformers.get_cosine_schedule_with_warmup( 60 | optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) 61 | elif scheduler_type == "exponential_decay": 62 | scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMAR) 63 | else: 64 | raise ValueError(f"{scheduler_type} must be in ('constant', 'constant_warmup', 'cosine_warmup', 'exponential_decay')") 65 | return scheduler 66 | 67 | def make_losses(name="mel_loss"): 68 | if name == "mel_loss": 69 | return MelSpectrogramLoss() 70 | elif name == "stft_loss": 71 | return ComplexSTFTLoss(power_law=True) 72 | else: 73 | raise ValueError("Supported losses are (mel_loss, stft_loss)") 74 | 75 | def dict2namespace(config): 76 | namespace = argparse.Namespace() 77 | for key, value in config.items(): 78 | if isinstance(value, dict): 79 | new_value = dict2namespace(value) 80 | else: 81 | new_value = value 82 | setattr(namespace, key, new_value) 83 | return namespace 84 | 85 | def namespace2dict(config): 86 | return vars(config) 87 | 88 | def read_yaml(pth): 89 | with open(pth, 'r') as f: 90 | config = yaml.safe_load(f) 91 | return config 92 | 93 | def download_data_hf(repo_id="../dnscustom", 94 | filename="testset.tar.gz", 95 | local_dir="./data"): 96 | 97 | file_path = hf_hub_download(repo_id=repo_id, 98 | filename=filename, 99 | repo_type="dataset", 100 | local_dir=local_dir) 101 | print(f"File has been downloaded and is located at {file_path}") 102 | return file_path 103 | -------------------------------------------------------------------------------- /scripts/test.py: -------------------------------------------------------------------------------- 1 | from .metrics import EntropyCounter, PESQ, MelSpectrogramDistance, SISDR 2 | from .utils import read_yaml, EvalSet 3 | from esc.models import make_model 4 | 5 | from torch.utils.data import DataLoader, default_collate 6 | from tqdm import tqdm 7 | import numpy as np 8 | 9 | import argparse, torch, json 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--eval_folder_path", type=str, required=True) 14 | parser.add_argument("--batch_size", type=int, default=1) 15 | 16 | parser.add_argument("--model_path", type=str, required=True, help="folder containing the model configuration and checkpoint") 17 | parser.add_argument("--save_path", type=str, default=None, help="folder to save test statistics") 18 | 19 | parser.add_argument("--device", type=str, default="cpu") 20 | return parser.parse_args() 21 | 22 | @torch.no_grad() 23 | def eval_epoch(model, eval_loader:DataLoader, 24 | metric_funcs:dict, e_counter:EntropyCounter, device: str, bps_per_stream: float, 25 | num_streams=None, verbose: bool=True): 26 | model.eval() 27 | 28 | all_perf = {k:[] for k in metric_funcs.keys()} 29 | all_perf["utilization"] = [] 30 | eval_range = range(num_streams,num_streams+1) if num_streams is not None \ 31 | else range(1, model.max_streams+1) # 1.5kbps -> 9kbps 32 | for s in eval_range: 33 | perf = {k:[] for k in metric_funcs.keys()} 34 | e_counter.reset_stats(num_streams=s) 35 | for _, x in tqdm(enumerate(eval_loader), total=len(eval_loader),
desc=f"Evaluating Codec at {s*bps_per_stream:.2f}kbps"): 36 | x = x.to(device) 37 | outputs = model(**dict(x=x, x_feat=None, num_streams=s)) 38 | recon_x, codes = outputs["recon_audio"], outputs["codes"] 39 | 40 | for k, func in metric_funcs.items(): 41 | perf[k].extend(func(x, recon_x).tolist()) 42 | e_counter.update(codes) 43 | 44 | for k, v in perf.items(): 45 | all_perf[k].append(round(np.mean(v),4)) 46 | rate, _ = e_counter.compute_utilization() 47 | perf["utilization"] = [rate] 48 | all_perf["utilization"].append(rate) 49 | 50 | if verbose: 51 | print(f"Test Metrics at {s*1.5:.2f}kbps: ", end="") 52 | print(" | ".join(f"{k}: {np.mean(v):.4f}" for k, v in perf.items())) 53 | 54 | model.train() 55 | return all_perf 56 | 57 | def run(args): 58 | # Data 59 | eval_set = EvalSet(args.eval_folder_path) 60 | eval_loader = DataLoader(eval_set, batch_size=args.batch_size, shuffle=False, collate_fn=default_collate) 61 | 62 | # Metrics 63 | metric_funcs = {"PESQ": PESQ(), "MelDistance": MelSpectrogramDistance().to(args.device), "SISDR": SISDR().to(args.device)} 64 | 65 | # Model 66 | cfg = read_yaml(f"{args.model_path}/config.yaml") 67 | model = make_model(cfg['model'], cfg['model_name']) 68 | model.load_state_dict( 69 | torch.load(f"{args.model_path}/model.pth", map_location="cpu")["model_state_dict"], 70 | ) 71 | model = model.to(args.device) 72 | e_counter = EntropyCounter(cfg['model']['codebook_size'], num_streams=cfg['model']['max_streams'], 73 | num_groups=cfg['model']['group_size'], device=args.device) 74 | 75 | performances = eval_epoch( 76 | model, eval_loader, metric_funcs, e_counter, args.device, 77 | num_streams=None, verbose=True, bps_per_stream=1.5, # evaluate across all bitrates 78 | ) 79 | 80 | save_path = args.model_path if args.save_path is None else args.save_path 81 | json.dump(performances, open(f"{save_path}/perf_stats.json", "w"), indent=2) 82 | print(f"Test statistics saved into {save_path}/perf_stats.json") 83 | 84 | 85 | if __name__ == "__main__": 86 | args = parse_args() 87 | run(args) 88 | 89 | 90 | """ 91 | python -m scripts.test \ 92 | --eval_folder_path ../evaluation_set/test \ 93 | --batch_size 12 \ 94 | --model_path ./esc9kbps \ 95 | --device cuda 96 | 97 | 98 | python -m scripts.test \ 99 | --eval_folder_path ../data/ESC_evaluation/test \ 100 | --batch_size 6 \ 101 | --model_path ../output/csvq_conv_9kbps \ 102 | --device cuda 103 | 104 | export CUDA_VISIBLE_DEVICES=1 105 | python -m scripts.test \ 106 | --eval_folder_path ../data/ESC_evaluation/test \ 107 | --batch_size 6 \ 108 | --model_path ../output/rvq_conv_9kbps \ 109 | --device cuda 110 | 111 | export CUDA_VISIBLE_DEVICES=2 112 | python -m scripts.test \ 113 | --eval_folder_path ../data/ESC_evaluation/test \ 114 | --batch_size 6 \ 115 | --model_path ../output/rvq_swinT_9kbps \ 116 | --device cuda 117 | 118 | """ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient Speech Coding with Cross-Scale Residual Vector Quantized Transformers 2 | 3 | This is the code repository for the neural speech codec presented in the EMNLP 2024 paper **ESC: Efficient Speech Coding with Cross-Scale Residual Vector Quantized Transformers** [[paper](https://arxiv.org/abs/2404.19441)] 4 | - Our neural speech codec ESC, within only 30MB, efficiently compresses 16kHz speech at bitrates of 1.5, 3, 4.5, 6, 7.5, and 9kbps, while maintaining comparative reconstruction quality to Descript's audio codec. 
5 | - We provide pretrained model checkpoints [[download](#model-checkpoints)] for different ESC variants and DAC models, as well as a demo webpage [[link](https://efficient-speech-codec.notion.site/)] including multilingual speech samples. 6 | 7 | ![An illustration of ESC Architecture](assets/architecture.png) 8 | ## Usage 9 | 10 | ### Environment Setup 11 | ```bash 12 | conda create -n esc python=3.8 13 | conda activate esc 14 | 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### Compress and decompress audio 19 | ```ruby 20 | python -m scripts.compress --input /path/to/input.wav --save_path /path/to/output --model_path /path/to/model --num_streams 6 --device cpu 21 | ``` 22 | This will create `.pth` (code) and `.wav` (reconstructed audio) files under the specified `save_path`. Our codec supports `num_streams` from 1 to 6, corresponding to bitrates 1.5 ~ 9.0 kbps. For programmatic usage, you can compress audio tensors using `torchaudio` as follows: 23 | 24 | ```python 25 | import torchaudio, torch 26 | from esc import ESC 27 | model = ESC(**config) 28 | model.load_state_dict(torch.load("model.pth", map_location="cpu"),) 29 | x, _ = torchaudio.load("input.wav") 30 | # Enc. (@ num_streams*1.5 kbps) 31 | codes, f_shape = model.encode(x, num_streams=6) 32 | # Dec. 33 | recon_x = model.decode(codes, f_shape) 34 | ``` 35 | For more details, see the `example.ipynb` notebook. 36 | 37 | ### Training 38 | 39 | We provide development training and evaluation datasets on [Hugging Face](https://huggingface.co/datasets/Tracygu/dnscustom/tree/main). For custom training, set the `train_data_path` in `exp.yaml` to the parent directory containing `.wav` audio segments. Run the following to start training: 40 | 41 | ```ruby 42 | WANDB_API_KEY=your_API_key 43 | accelerate launch main.py --exp_name esc9kbps --config_path ./configs/9kbps_esc_base.yaml --wandb_project efficient-speech-codec --lr 1.0e-4 --num_epochs 80 --num_pretraining_epochs 15 --num_devices 4 --dropout_rate 0.75 --save_path /path/to/output --seed 53 44 | ``` 45 | 46 | We use the `accelerate` library to handle distributed training and the `wandb` library for monitoring. To enable adversarial training with the same discriminator as in DAC, include the `--adv_training` flag. 47 | 48 | Training a base ESC model on 4 RTX4090 GPUs takes ~16 hours for 250k steps on 3-second speech clips with a batch size of 36. Detailed experiment configurations can be found in the `configs/` folder. For the complete set of experiments presented in the paper, refer to `scripts_all.sh`. 49 | 50 | ### Evaluation 51 | 52 | ```ruby 53 | CUDA_VISIBLE_DEVICES=0 54 | python -m scripts.test --eval_folder_path path/to/data --batch_size 12 --model_path /path/to/model --device cuda 55 | ``` 56 | This will run codec evaluation across all available bandwidths on the specified test set folder. We report four metrics: `PESQ`, `Mel-Distance`, `SI-SDR`, and `Bitrate-Utilization-Rate`. Evaluation statistics will be saved under `model_path` by default. 57 | 58 | ### Model Checkpoints 59 | You can download the pre-trained model checkpoints below: 60 | 61 | | Codec | Checkpoint | #Param.
| 62 | |--------|-------------------------------------------------|----------| 63 | | ESC-Base | [Download](https://drive.google.com/file/d/1OF1ab3az6nKOY8owSUhUH0ksYHFmR1bc/view?usp=sharing) | 8.39M | 64 | | ESC-Base(adv) | [Download](https://drive.google.com/file/d/1_g1dFYhY7qXKWkcq8_Q6I-kv8tQW_SF7/view?usp=sharing) | 8.39M | 65 | | ESC-Large | [Download](https://drive.google.com/file/d/180Q4zctqeNnDmRvoMsVQ-3iCB5FriJbN/view?usp=sharing) | 15.58M | 66 | | DAC-Tiny(adv) | [Download](https://drive.google.com/file/d/1ED-B_S7ftsb8CqoFGTNkWUIrMKrk-iiu/view?usp=sharing) | 8.17M | 67 | | DAC-Tiny | [Download](https://drive.google.com/file/d/1jk8zPYBYmxgsiSzrgoQynF6hnzoiIuX8/view?usp=sharing) | 8.17M | 68 | | DAC-Base(adv) | [Download](https://drive.google.com/file/d/1moy0FX-aPlx54MajBRuE-zjYeNlJUjI6/view?usp=sharing) | 74.31M | 69 | 70 | ## Results 71 | 72 | ![Performance Evaluation](assets/results.png) 73 | We provide a comprehensive performance comparison of ESC with Descript's audio codec (DAC) at different scales of model sizes (w/ and w/o adversarial trainings). 74 | 75 | ## Reference 76 | If you find our work useful or relevant to your research, please kindly cite our paper: 77 | ```bibtex 78 | @article{gu2024esc, 79 | title={ESC: Efficient Speech Coding with Cross-Scale Residual Vector Quantized Transformers}, 80 | author={Gu, Yuzhe and Diao, Enmao}, 81 | journal={arXiv preprint arXiv:2404.19441}, 82 | year={2024} 83 | } 84 | ``` -------------------------------------------------------------------------------- /esc/modules/transformer/scale.py: -------------------------------------------------------------------------------- 1 | from einops import rearrange 2 | from typing import Literal 3 | 4 | import torch.nn as nn 5 | 6 | 7 | def pixel_unshuffle(input, downscale_factor:tuple=(2,1)): 8 | s1, s2 = downscale_factor 9 | B, H, W, C = input.size() 10 | C_, H_, W_ = C*(s1*s2), H//s1, W//s2 11 | 12 | unshuffle_out = input.reshape(B, H_, s1, W_, s2, C).\ 13 | permute(0,1,3,2,4,5).reshape(B, H_, W_, C_) 14 | return unshuffle_out 15 | 16 | def pixel_shuffle(input, upscale_factor:tuple=(2,1)): 17 | s1, s2 = upscale_factor 18 | B, H, W, C = input.size() 19 | C_, H_, W_ = C//(s1*s2), H*s1, W*s2 20 | 21 | shuffle_out = input.reshape(B, H, W, s1, s2, C_).\ 22 | permute(0,1,3,2,4,5).reshape(B, H_, W_, C_) 23 | return shuffle_out 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 2D Linear Patchify """ 28 | def __init__(self, 29 | freq: int=192, 30 | in_chans: int=2, 31 | patch_size: tuple=(3,2), 32 | embed_dim: int=48, 33 | norm_layer=nn.LayerNorm, 34 | backbone: Literal['transformer', 'convolution']='transformer',): 35 | super().__init__() 36 | 37 | self.H = freq // patch_size[0] 38 | self.proj = nn.Conv2d(in_chans, embed_dim, patch_size, patch_size) 39 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 40 | self.backbone = backbone 41 | 42 | def forward(self, x): 43 | 44 | x = self.proj(x) # B2FT -> BCHW 45 | if self.backbone == "convolution": 46 | return x # for convolution backbones, no normalization 47 | 48 | x = rearrange(x, "b c h w -> b (h w) c") # BCHW -> BCL -> BLC 49 | x = self.norm(x) 50 | return x 51 | 52 | class PatchDeEmbed(nn.Module): 53 | """ 2D Linear De-Patchify """ 54 | def __init__(self, 55 | freq: int=192, 56 | in_chans: int=2, 57 | patch_size: tuple=(3,2), 58 | embed_dim: int=48, 59 | backbone: Literal['transformer', 'convolution']='transformer',): 60 | super().__init__() 61 | 62 | self.patch_size = patch_size 63 | self.H = freq // patch_size[0] 64 | 
self.backbone = backbone 65 | 66 | self.de_proj1 = nn.Conv2d(embed_dim, 67 | embed_dim*patch_size[0]*patch_size[1], 68 | kernel_size=5, stride=1, padding=2) 69 | self.de_proj2 = nn.Conv2d(embed_dim, 70 | in_chans, 71 | kernel_size=3, stride=1, padding=1) 72 | 73 | def forward(self, x): 74 | if self.backbone == "transformer": 75 | x = rearrange(x, "b (h w) c -> b c h w", h=self.H) 76 | 77 | x = self.de_proj1(x) # B C*scale H W 78 | x = pixel_shuffle(x.permute(0,2,3,1), self.patch_size) # B F T C 79 | x = self.de_proj2(x.permute(0,3,1,2)) # BCFT -> B2FT 80 | 81 | return x 82 | 83 | class PatchMerge(nn.Module): 84 | """Patch Merging Layer: Perform Pixel Unshuffle and Downscale""" 85 | def __init__(self, 86 | in_dim: int, 87 | out_dim: int, 88 | scale_factor: tuple=(2,1), 89 | norm_layer=nn.LayerNorm): 90 | super().__init__() 91 | s1, s2 = scale_factor 92 | 93 | self.norm = norm_layer(s1*s2*in_dim) 94 | self.down = nn.Linear(s1*s2*in_dim, out_dim, bias=False) 95 | self.scale_factor = scale_factor 96 | 97 | def forward(self, x, H): 98 | """ Forward function. 99 | Args: 100 | x: Input feature, tensor size (B, H*W, in_dim) 101 | H: num_patches along Freq Domain 102 | returns: downscaled feature x, tensor size (B, H*W//2, out_dim) 103 | """ 104 | 105 | x = rearrange(x, "b (h w) c -> b h w c", h=H) 106 | pad_input = (H%2 == 1) 107 | if pad_input: 108 | x = nn.functional.pad(x, (0,0,0,0,0,H%2)) 109 | 110 | x = pixel_unshuffle(x, self.scale_factor) 111 | x = rearrange(x, "b h w c -> b (h w) c") 112 | x = self.norm(x) 113 | x = self.down(x) 114 | 115 | return x 116 | 117 | class PatchSplit(nn.Module): 118 | """Patch Splitting Layer: Perform Pixel Shuffle and Upscale""" 119 | def __init__(self, 120 | in_dim: int, 121 | out_dim: int, 122 | scale_factor: tuple=(2,1), 123 | norm_layer=nn.LayerNorm): 124 | super().__init__() 125 | s1, s2 = scale_factor 126 | 127 | self.norm = norm_layer(in_dim) 128 | self.up = nn.Linear(in_dim, out_dim*s1*s2, bias=False) 129 | self.scale_factor = scale_factor 130 | 131 | def forward(self, x, H): 132 | """ Forward function. 
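(Editorial note, illustrative shape trace assuming scale_factor=(2,1): the (B, H*W, in_dim) input is projected to out_dim*2 channels and pixel-shuffled along the frequency axis, yielding (B, (2*H)*W, out_dim).)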
133 | Args: 134 | x: Input feature, tensor size (B, H*W, in_dim) 135 | H: num_patches along Freq Domain 136 | returns: upscaled feature x, tensor size (B, H*W*2, out_dim) 137 | """ 138 | 139 | x = self.norm(x) 140 | x = self.up(x) 141 | 142 | x = rearrange(x, "b (h w) c -> b h w c", h=H) 143 | x = pixel_shuffle(x, self.scale_factor) 144 | x = rearrange(x, "b h w c -> b (h w) c") 145 | return x 146 | -------------------------------------------------------------------------------- /scripts/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchaudio.transforms as T 5 | import numpy as np 6 | from pesq import pesq 7 | 8 | MEL_WINDOWS = [32,64,128,256,512,1024,2048] 9 | MEL_BINS = [5,10,20,40,80,160,320] 10 | SR = 16000 11 | 12 | class EntropyCounter: 13 | """Counter maintaining codebook utilization rate on a held-out validation set""" 14 | def __init__(self, codebook_size=1024, 15 | num_streams=6, num_groups=3, 16 | device="cuda"): 17 | 18 | self.num_groups = num_groups 19 | self.codebook_size = codebook_size 20 | self.device = device 21 | 22 | self.reset_stats(num_streams) 23 | 24 | def reset_stats(self, num_streams): 25 | self.codebook_counts = { 26 | f"stream_{S}_group_{G+1}": torch.zeros(self.codebook_size, device=self.device) \ 27 | for S in range(num_streams) for G in range(self.num_groups) 28 | } # counts codeword stats for each codebook 29 | self.total_counts = 0 30 | self.dist = None # posterior distribution for each codebook 31 | self.entropy = None # entropy stats for each codebook 32 | 33 | self.max_entropy_per_book = np.log2(self.codebook_size) 34 | self.max_total_entropy = num_streams * self.num_groups * self.max_entropy_per_book 35 | self.num_streams = num_streams 36 | 37 | def update(self, codes): 38 | """ Update codebook counts and total counts from a batch of codes 39 | Args: 40 | codes: (B, num_streams, group_size, *) 41 | """ 42 | assert codes.size(1) == self.num_streams and codes.size(2) == self.num_groups, "code indices size not match" 43 | num_codes = codes.size(0) * codes.size(-1) 44 | self.total_counts += num_codes 45 | 46 | for s in range(self.num_streams): 47 | stream_s_code = codes[:, s] # (B, group_size, *) 48 | for g in range(self.num_groups): 49 | stream_s_group_g_code = stream_s_code[:,g] # (B, *) 50 | one_hot = F.one_hot(stream_s_group_g_code, num_classes=self.codebook_size) # (B, *, codebook_size) 51 | self.codebook_counts[f"stream_{s}_group_{g+1}"] += one_hot.view(-1, self.codebook_size).sum(0) # (*, codebook_size) 52 | 53 | def _form_distribution(self): 54 | """After iterating over a held-out set, compute posterior distribution for each codebook""" 55 | assert self.total_counts > 0, "No data collected, please update on a specific dataset" 56 | self.dist = {} 57 | for k, _counts in self.codebook_counts.items(): 58 | self.dist[k] = _counts / torch.tensor(self.total_counts, device=_counts.device) 59 | 60 | def _form_entropy(self): 61 | """After forming codebook posterior distributions, compute entropy for each distribution""" 62 | assert self.dist is not None, "Please compute posterior distribution first using self._form_distribution()" 63 | 64 | self.entropy = {} 65 | for k, dist in self.dist.items(): 66 | self.entropy[k] = (-torch.sum(dist * torch.log2(dist+1e-10))).item() 67 | 68 | def compute_utilization(self): 69 | """After forming entropy statistics for each codebook, compute utilization ratio (bitrate efficiency)""" 70 | 
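# Editorial note (worked example with illustrative numbers): utilization is the empirical codebook
# entropy divided by the maximum log2(codebook_size). With codebook_size=1024 the per-book
# maximum is 10 bits, so a measured entropy of 9.2 bits gives a per-book utilization of 0.92;
# the first returned value aggregates this ratio over all num_streams * num_groups codebooks.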
if self.dist is None: self._form_distribution() 71 | if self.entropy is None: self._form_entropy() 72 | 73 | utilization = {} 74 | for k, e in self.entropy.items(): 75 | utilization[k] = round(e/self.max_entropy_per_book, 4) 76 | 77 | return round(sum(self.entropy.values())/self.max_total_entropy, 4), utilization 78 | 79 | class PESQ: 80 | """Batch-wise computing of PESQ scores""" 81 | def __call__(self, x, y): 82 | """ 83 | Args: 84 | x: source audio Tensor (B, L) 85 | y: recon audio Tensor (B, L) 86 | returns: (B,) 87 | """ 88 | batch_pesq = [] 89 | for b in range(x.size(0)): 90 | ref = x[b].cpu().numpy() 91 | deg = y[b].cpu().numpy() 92 | batch_pesq.append(pesq(SR, ref, deg, 'wb')) 93 | 94 | return torch.tensor(batch_pesq) 95 | 96 | class MelSpectrogramDistance(nn.Module): 97 | """ 98 | L1 Log MelSpectrogram Distance 99 | Implementation adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py 100 | """ 101 | def __init__(self, win_lengths=MEL_WINDOWS, 102 | n_mels=MEL_BINS, clamp_eps=1e-5,): 103 | super().__init__() 104 | self.mel_transf = nn.ModuleList([ 105 | T.MelSpectrogram(sample_rate=SR, 106 | n_fft=w, win_length=w, hop_length=w//4, 107 | n_mels=n_mels[i], power=1) 108 | for i, w in enumerate(win_lengths) 109 | ]) 110 | self.clamp_eps = clamp_eps 111 | 112 | def forward(self, raw_audio, recon_audio): 113 | mel_loss = 0.0 114 | for mel_trans in self.mel_transf: 115 | x_mels, y_mels = mel_trans(raw_audio), mel_trans(recon_audio) 116 | mel_loss += F.l1_loss( # log mel loss 117 | x_mels.clamp(self.clamp_eps).pow(2).log10(), 118 | y_mels.clamp(self.clamp_eps).pow(2).log10(), 119 | reduction="none" 120 | ).mean(dim=[1,2]) 121 | return mel_loss 122 | 123 | class SISDR(nn.Module): 124 | """ 125 | Scale-Invariant Source-to-Distortion Ratio 126 | Implementation adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py 127 | """ 128 | def __init__(self, scaling: int = True, 129 | reduction: str = "none", zero_mean: int = True): 130 | self.scaling = scaling 131 | self.reduction = reduction 132 | self.zero_mean = zero_mean 133 | super().__init__() 134 | 135 | def forward(self, x, y): 136 | eps = 1e-8 137 | 138 | references = x.unsqueeze(1) if x.dim() == 2 else x # add channel dim 139 | estimates = y.unsqueeze(1) if y.dim() == 2 else y # add channel dim 140 | 141 | nb = references.shape[0] 142 | references = references.reshape(nb, 1, -1).permute(0, 2, 1) 143 | estimates = estimates.reshape(nb, 1, -1).permute(0, 2, 1) 144 | 145 | # samples now on axis 1 146 | if self.zero_mean: 147 | mean_reference = references.mean(dim=1, keepdim=True) 148 | mean_estimate = estimates.mean(dim=1, keepdim=True) 149 | else: 150 | mean_reference = 0 151 | mean_estimate = 0 152 | 153 | _references = references - mean_reference 154 | _estimates = estimates - mean_estimate 155 | 156 | references_projection = (_references**2).sum(dim=-2) + eps 157 | references_on_estimates = (_estimates * _references).sum(dim=-2) + eps 158 | 159 | scale = ( 160 | (references_on_estimates / references_projection).unsqueeze(1) 161 | if self.scaling 162 | else 1 163 | ) 164 | e_true = scale * _references 165 | e_res = _estimates - e_true 166 | 167 | signal = (e_true**2).sum(dim=1) 168 | noise = (e_res**2).sum(dim=1) 169 | sdr = 10 * torch.log10(signal/noise + eps) 170 | 171 | return sdr.squeeze(1) # (B,) 172 | -------------------------------------------------------------------------------- /baselines/descript/dac/model/discriminator.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from audiotools import AudioSignal 5 | from audiotools import ml 6 | from audiotools import STFTParams 7 | from einops import rearrange 8 | from torch.nn.utils import weight_norm 9 | 10 | 11 | def WNConv1d(*args, **kwargs): 12 | act = kwargs.pop("act", True) 13 | conv = weight_norm(nn.Conv1d(*args, **kwargs)) 14 | if not act: 15 | return conv 16 | return nn.Sequential(conv, nn.LeakyReLU(0.1)) 17 | 18 | 19 | def WNConv2d(*args, **kwargs): 20 | act = kwargs.pop("act", True) 21 | conv = weight_norm(nn.Conv2d(*args, **kwargs)) 22 | if not act: 23 | return conv 24 | return nn.Sequential(conv, nn.LeakyReLU(0.1)) 25 | 26 | 27 | class MPD(nn.Module): 28 | def __init__(self, period): 29 | super().__init__() 30 | self.period = period 31 | self.convs = nn.ModuleList( 32 | [ 33 | WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)), 34 | WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)), 35 | WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)), 36 | WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)), 37 | WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)), 38 | ] 39 | ) 40 | self.conv_post = WNConv2d( 41 | 1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False 42 | ) 43 | 44 | def pad_to_period(self, x): 45 | t = x.shape[-1] 46 | x = F.pad(x, (0, self.period - t % self.period), mode="reflect") 47 | return x 48 | 49 | def forward(self, x): 50 | fmap = [] 51 | 52 | x = self.pad_to_period(x) 53 | x = rearrange(x, "b c (l p) -> b c l p", p=self.period) 54 | 55 | for layer in self.convs: 56 | x = layer(x) 57 | fmap.append(x) 58 | 59 | x = self.conv_post(x) 60 | fmap.append(x) 61 | 62 | return fmap 63 | 64 | 65 | class MSD(nn.Module): 66 | def __init__(self, rate: int = 1, sample_rate: int = 44100): 67 | super().__init__() 68 | self.convs = nn.ModuleList( 69 | [ 70 | WNConv1d(1, 16, 15, 1, padding=7), 71 | WNConv1d(16, 64, 41, 4, groups=4, padding=20), 72 | WNConv1d(64, 256, 41, 4, groups=16, padding=20), 73 | WNConv1d(256, 1024, 41, 4, groups=64, padding=20), 74 | WNConv1d(1024, 1024, 41, 4, groups=256, padding=20), 75 | WNConv1d(1024, 1024, 5, 1, padding=2), 76 | ] 77 | ) 78 | self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False) 79 | self.sample_rate = sample_rate 80 | self.rate = rate 81 | 82 | def forward(self, x): 83 | x = AudioSignal(x, self.sample_rate) 84 | x.resample(self.sample_rate // self.rate) 85 | x = x.audio_data 86 | 87 | fmap = [] 88 | 89 | for l in self.convs: 90 | x = l(x) 91 | fmap.append(x) 92 | x = self.conv_post(x) 93 | fmap.append(x) 94 | 95 | return fmap 96 | 97 | 98 | BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] 99 | 100 | 101 | class MRD(nn.Module): 102 | def __init__( 103 | self, 104 | window_length: int, 105 | hop_factor: float = 0.25, 106 | sample_rate: int = 44100, 107 | bands: list = BANDS, 108 | ): 109 | """Complex multi-band spectrogram discriminator. 110 | Parameters 111 | ---------- 112 | window_length : int 113 | Window length of STFT. 114 | hop_factor : float, optional 115 | Hop factor of the STFT, defaults to ``0.25 * window_length``. 116 | sample_rate : int, optional 117 | Sampling rate of audio in Hz, by default 44100 118 | bands : list, optional 119 | Bands to run discriminator over. 
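Notes
-----
(Editorial, illustrative): with the default ``bands`` and ``window_length=2048``, ``n_fft`` is
1025 bins, so the band boundaries become [0, 102), [102, 256), [256, 512), [512, 768),
[768, 1025); each band is processed by its own convolutional stack and the outputs are
concatenated before the final convolution.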
120 | """ 121 | super().__init__() 122 | 123 | self.window_length = window_length 124 | self.hop_factor = hop_factor 125 | self.sample_rate = sample_rate 126 | self.stft_params = STFTParams( 127 | window_length=window_length, 128 | hop_length=int(window_length * hop_factor), 129 | match_stride=True, 130 | ) 131 | 132 | n_fft = window_length // 2 + 1 133 | bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands] 134 | self.bands = bands 135 | 136 | ch = 32 137 | convs = lambda: nn.ModuleList( 138 | [ 139 | WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)), 140 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 141 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 142 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 143 | WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)), 144 | ] 145 | ) 146 | self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) 147 | self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False) 148 | 149 | def spectrogram(self, x): 150 | x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params) 151 | x = torch.view_as_real(x.stft()) 152 | x = rearrange(x, "b 1 f t c -> (b 1) c t f") 153 | # Split into bands 154 | x_bands = [x[..., b[0] : b[1]] for b in self.bands] 155 | return x_bands 156 | 157 | def forward(self, x): 158 | x_bands = self.spectrogram(x) 159 | fmap = [] 160 | 161 | x = [] 162 | for band, stack in zip(x_bands, self.band_convs): 163 | for layer in stack: 164 | band = layer(band) 165 | fmap.append(band) 166 | x.append(band) 167 | 168 | x = torch.cat(x, dim=-1) 169 | x = self.conv_post(x) 170 | fmap.append(x) 171 | 172 | return fmap 173 | 174 | 175 | class Discriminator(ml.BaseModel): 176 | def __init__( 177 | self, 178 | rates: list = [], 179 | periods: list = [2, 3, 5, 7, 11], 180 | fft_sizes: list = [2048, 1024, 512], 181 | sample_rate: int = 44100, 182 | bands: list = BANDS, 183 | ): 184 | """Discriminator that combines multiple discriminators. 185 | 186 | Parameters 187 | ---------- 188 | rates : list, optional 189 | sampling rates (in Hz) to run MSD at, by default [] 190 | If empty, MSD is not used. 
191 | periods : list, optional 192 | periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11] 193 | fft_sizes : list, optional 194 | Window sizes of the FFT to run MRD at, by default [2048, 1024, 512] 195 | sample_rate : int, optional 196 | Sampling rate of audio in Hz, by default 44100 197 | bands : list, optional 198 | Bands to run MRD at, by default `BANDS` 199 | """ 200 | super().__init__() 201 | discs = [] 202 | discs += [MPD(p) for p in periods] 203 | discs += [MSD(r, sample_rate=sample_rate) for r in rates] 204 | discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes] 205 | self.discriminators = nn.ModuleList(discs) 206 | 207 | def preprocess(self, y): 208 | # Remove DC offset 209 | y = y - y.mean(dim=-1, keepdims=True) 210 | # Peak normalize the volume of input audio 211 | y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9) 212 | return y 213 | 214 | def forward(self, x): 215 | x = self.preprocess(x) 216 | fmaps = [d(x) for d in self.discriminators] 217 | return fmaps 218 | 219 | 220 | if __name__ == "__main__": 221 | disc = Discriminator() 222 | x = torch.zeros(1, 1, 44100) 223 | results = disc(x) 224 | for i, result in enumerate(results): 225 | print(f"disc{i}") 226 | for i, r in enumerate(result): 227 | print(r.shape, r.mean(), r.min(), r.max()) 228 | print() 229 | -------------------------------------------------------------------------------- /esc/models/discriminator.py: -------------------------------------------------------------------------------- 1 | """Same discriminator from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/model/discriminator.py 2 | Requires audiotools package to be installed. 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from audiotools import AudioSignal 9 | from audiotools import ml 10 | from audiotools import STFTParams 11 | from einops import rearrange 12 | from torch.nn.utils import weight_norm 13 | 14 | 15 | def WNConv1d(*args, **kwargs): 16 | act = kwargs.pop("act", True) 17 | conv = weight_norm(nn.Conv1d(*args, **kwargs)) 18 | if not act: 19 | return conv 20 | return nn.Sequential(conv, nn.LeakyReLU(0.1)) 21 | 22 | 23 | def WNConv2d(*args, **kwargs): 24 | act = kwargs.pop("act", True) 25 | conv = weight_norm(nn.Conv2d(*args, **kwargs)) 26 | if not act: 27 | return conv 28 | return nn.Sequential(conv, nn.LeakyReLU(0.1)) 29 | 30 | 31 | class MPD(nn.Module): 32 | def __init__(self, period): 33 | super().__init__() 34 | self.period = period 35 | self.convs = nn.ModuleList( 36 | [ 37 | WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)), 38 | WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)), 39 | WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)), 40 | WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)), 41 | WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)), 42 | ] 43 | ) 44 | self.conv_post = WNConv2d( 45 | 1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False 46 | ) 47 | 48 | def pad_to_period(self, x): 49 | t = x.shape[-1] 50 | x = F.pad(x, (0, self.period - t % self.period), mode="reflect") 51 | return x 52 | 53 | def forward(self, x): 54 | fmap = [] 55 | 56 | x = self.pad_to_period(x) 57 | x = rearrange(x, "b c (l p) -> b c l p", p=self.period) 58 | 59 | for layer in self.convs: 60 | x = layer(x) 61 | fmap.append(x) 62 | 63 | x = self.conv_post(x) 64 | fmap.append(x) 65 | 66 | return fmap 67 | 68 | 69 | class MSD(nn.Module): 70 | def __init__(self, rate: int = 1, sample_rate: int = 44100): 71 | super().__init__() 72 | self.convs = nn.ModuleList( 73 
| [ 74 | WNConv1d(1, 16, 15, 1, padding=7), 75 | WNConv1d(16, 64, 41, 4, groups=4, padding=20), 76 | WNConv1d(64, 256, 41, 4, groups=16, padding=20), 77 | WNConv1d(256, 1024, 41, 4, groups=64, padding=20), 78 | WNConv1d(1024, 1024, 41, 4, groups=256, padding=20), 79 | WNConv1d(1024, 1024, 5, 1, padding=2), 80 | ] 81 | ) 82 | self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False) 83 | self.sample_rate = sample_rate 84 | self.rate = rate 85 | 86 | def forward(self, x): 87 | x = AudioSignal(x, self.sample_rate) 88 | x.resample(self.sample_rate // self.rate) 89 | x = x.audio_data 90 | 91 | fmap = [] 92 | 93 | for l in self.convs: 94 | x = l(x) 95 | fmap.append(x) 96 | x = self.conv_post(x) 97 | fmap.append(x) 98 | 99 | return fmap 100 | 101 | 102 | BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] 103 | 104 | 105 | class MRD(nn.Module): 106 | def __init__( 107 | self, 108 | window_length: int, 109 | hop_factor: float = 0.25, 110 | sample_rate: int = 44100, 111 | bands: list = BANDS, 112 | ): 113 | """Complex multi-band spectrogram discriminator. 114 | Parameters 115 | ---------- 116 | window_length : int 117 | Window length of STFT. 118 | hop_factor : float, optional 119 | Hop factor of the STFT, defaults to ``0.25 * window_length``. 120 | sample_rate : int, optional 121 | Sampling rate of audio in Hz, by default 44100 122 | bands : list, optional 123 | Bands to run discriminator over. 124 | """ 125 | super().__init__() 126 | 127 | self.window_length = window_length 128 | self.hop_factor = hop_factor 129 | self.sample_rate = sample_rate 130 | self.stft_params = STFTParams( 131 | window_length=window_length, 132 | hop_length=int(window_length * hop_factor), 133 | match_stride=True, 134 | ) 135 | 136 | n_fft = window_length // 2 + 1 137 | bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands] 138 | self.bands = bands 139 | 140 | ch = 32 141 | convs = lambda: nn.ModuleList( 142 | [ 143 | WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)), 144 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 145 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 146 | WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), 147 | WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)), 148 | ] 149 | ) 150 | self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) 151 | self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False) 152 | 153 | def spectrogram(self, x): 154 | x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params) 155 | x = torch.view_as_real(x.stft()) 156 | x = rearrange(x, "b 1 f t c -> (b 1) c t f") 157 | # Split into bands 158 | x_bands = [x[..., b[0] : b[1]] for b in self.bands] 159 | return x_bands 160 | 161 | def forward(self, x): 162 | x_bands = self.spectrogram(x) 163 | fmap = [] 164 | 165 | x = [] 166 | for band, stack in zip(x_bands, self.band_convs): 167 | for layer in stack: 168 | band = layer(band) 169 | fmap.append(band) 170 | x.append(band) 171 | 172 | x = torch.cat(x, dim=-1) 173 | x = self.conv_post(x) 174 | fmap.append(x) 175 | 176 | return fmap 177 | 178 | 179 | class Discriminator(ml.BaseModel): 180 | def __init__( 181 | self, 182 | rates: list = [], 183 | periods: list = [2, 3, 5, 7, 11], 184 | fft_sizes: list = [2048, 1024, 512], 185 | sample_rate: int = 44100, 186 | bands: list = BANDS, 187 | ): 188 | """Discriminator that combines multiple discriminators. 
189 | 190 | Parameters 191 | ---------- 192 | rates : list, optional 193 | sampling rates (in Hz) to run MSD at, by default [] 194 | If empty, MSD is not used. 195 | periods : list, optional 196 | periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11] 197 | fft_sizes : list, optional 198 | Window sizes of the FFT to run MRD at, by default [2048, 1024, 512] 199 | sample_rate : int, optional 200 | Sampling rate of audio in Hz, by default 44100 201 | bands : list, optional 202 | Bands to run MRD at, by default `BANDS` 203 | """ 204 | super().__init__() 205 | discs = [] 206 | discs += [MPD(p) for p in periods] 207 | discs += [MSD(r, sample_rate=sample_rate) for r in rates] 208 | discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes] 209 | self.discriminators = nn.ModuleList(discs) 210 | 211 | def preprocess(self, y): 212 | # Remove DC offset 213 | y = y - y.mean(dim=-1, keepdims=True) 214 | # Peak normalize the volume of input audio 215 | y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9) 216 | return y 217 | 218 | def forward(self, x): 219 | x = self.preprocess(x) 220 | fmaps = [d(x) for d in self.discriminators] 221 | return fmaps 222 | 223 | 224 | if __name__ == "__main__": 225 | disc = Discriminator() 226 | x = torch.zeros(1, 1, 44100) 227 | results = disc(x) 228 | for i, result in enumerate(results): 229 | print(f"disc{i}") 230 | for i, r in enumerate(result): 231 | print(r.shape, r.mean(), r.min(), r.max()) 232 | print() -------------------------------------------------------------------------------- /esc/models/csrvq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Literal 4 | 5 | from ..modules import ProductVectorQuantize, TransformerLayer, PatchDeEmbed, ConvolutionLayer, Convolution2D 6 | from .utils import blk_func 7 | 8 | class CrossScaleRVQ(nn.Module): 9 | """Cross-Scale Residual Vector Quantization Framework""" 10 | def __init__(self, backbone: Literal['transformer', 'convolution']="transformer") -> None: 11 | super().__init__() 12 | if backbone == "transformer": self.dims = 3 13 | elif backbone == "convolution": self.dims = 4 14 | 15 | def pre_fuse(self, enc, dec): 16 | """Compute residuals to quantize""" 17 | return enc - dec 18 | 19 | def post_fuse(self, residual_q, dec): 20 | """Add back quantized residuals""" 21 | return residual_q + dec 22 | 23 | def csrvq(self, enc: torch.tensor, dec: torch.tensor, vq: ProductVectorQuantize, 24 | transmit: bool=True, freeze_vq: bool=False): 25 | """ Forward Function combining encoding and decoding at a single bitstream/resolution scale 26 | Args: 27 | enc (Tensor): Tensor of encoded feature with shape (B, H*W, C) / (B, C, H, W) 28 | dec (Tensor): Tensor of decoded feature with shape (B, H*W, C) / (B, C, H, W) 29 | vq (ProductVectorQuantize): product quantizer at this stream level 30 | transmit (Boolean): whether this stream is transmitted (perform quantization or not) 31 | freeze_vq (Boolean): whether freeze the codebook (in a pre-training stage) 32 | Returns: 33 | Tensor of dec_refine (decoded feature conditioned on quantized encodings) 34 | """ 35 | if not self.training and not transmit: 36 | return dec, 0., 0., None 37 | 38 | residual = self.pre_fuse(enc, dec) 39 | outputs = vq(residual, freeze_vq) 40 | residual_q, code = outputs["z_q"], outputs["codes"] 41 | cm_loss, cb_loss = outputs["cm_loss"], outputs["cb_loss"] 42 | 43 | if not transmit: # masking non-transmitted streams 44 | cm_loss, cb_loss = 
cm_loss * 0., cb_loss * 0. 45 | residual_q *= 0. 46 | 47 | dec_refine = self.post_fuse(residual_q, dec) 48 | return dec_refine, cm_loss, cb_loss, code 49 | 50 | def csrvq_encode(self, enc, dec, vq): 51 | 52 | residual = self.pre_fuse(enc, dec) 53 | code = vq.encode(residual) 54 | return code 55 | 56 | def csrvq_decode(self, codes, dec, vq): 57 | 58 | residual_q = vq.decode(codes, self.dims) 59 | dec_refine = self.post_fuse(residual_q, dec) 60 | return dec_refine 61 | 62 | 63 | class CrossScaleRVQDecoder(CrossScaleRVQ): 64 | def __init__(self, 65 | backbone: Literal['transformer', 'convolution'], 66 | in_freq: int, 67 | in_dim: int, 68 | h_dims: list, 69 | patch_size: tuple, 70 | kernel_size: list=[], 71 | conv_depth: int=1, 72 | swin_heads: list=[], 73 | swin_depth: int=2, 74 | window_size: int=4, 75 | mlp_ratio: float=4.,) -> None: 76 | super().__init__(backbone) 77 | 78 | in_dims, out_dims = h_dims[:-1], h_dims[1:] 79 | 80 | blocks = nn.ModuleList() 81 | for i in range(len(in_dims)): 82 | layer = ConvolutionLayer(in_dims[i], out_dims[i], conv_depth, kernel_size, transpose=True) if backbone == "convolution" \ 83 | else TransformerLayer( 84 | in_dims[i], out_dims[i], swin_heads[i], swin_depth, window_size, mlp_ratio, 85 | activation=nn.GELU, norm_layer=nn.LayerNorm, scale="up", scale_factor=(2,1) 86 | ) 87 | blocks.append(layer) 88 | 89 | self.patch_deembed = PatchDeEmbed(in_freq, in_dim, patch_size, h_dims[-1], backbone) 90 | self.post_nn = Convolution2D(h_dims[-1], h_dims[-1], kernel_size, scale=False) if backbone == "convolution" \ 91 | else TransformerLayer( 92 | h_dims[-1], h_dims[-1], swin_heads[-1], swin_depth, window_size, mlp_ratio, 93 | activation=nn.GELU, norm_layer=nn.LayerNorm, scale=None 94 | ) 95 | self.blocks = blocks 96 | 97 | def forward(self, enc_hs: list, num_streams: int, quantizers: nn.ModuleList, feat_shape: tuple, freeze_vq: bool=False): 98 | """Forward Function: step-wise cross-scale decoding 99 | Args: 100 | enc_hs (List[Tensor, ...]): a list of encoded features at all scales 101 | num_streams (int): number of bitstreams to use (max_streams when freeze_vq is True) 102 | quantizers (ModuleList): a modulelist of multi-scale quantizers 103 | feat_shape (Tuple): (Wh, Ww) feature shape at bottom level 104 | freeze_vq (Boolean): freeze vq layers during pre-training 105 | Returns: 106 | recon_feat: reconstructed complex spectrum (Bs, 2, F, T) 107 | codes: discrete indices (Bs, num_streams, group_size, T//overlap) 108 | num_streams is always max_stream in training mode 109 | cm_loss, cb_loss: VQ losses (Bs, ) 110 | """ 111 | z0, cm_loss, cb_loss, code = self.csrvq(enc=enc_hs[-1], dec=0.0, vq=quantizers[0], 112 | transmit=True, freeze_vq=freeze_vq) 113 | codes, dec_hs = [code], [z0] 114 | for i, blk in enumerate(self.blocks): 115 | dec_i_refine, cm_loss_i, cb_loss_i, code_i = self.csrvq( 116 | enc=enc_hs[-1-i], dec=dec_hs[i], vq=quantizers[i+1], transmit=(i