├── .gitignore ├── LICENSE ├── README.md ├── configs ├── lvcgan.v1.yaml ├── parallel_wavegan.v1.yaml └── pwg.v1.yaml ├── samples ├── 0001_lvc.wav ├── 0001_pwg.wav ├── 0001_real.wav ├── 0002_lvc.wav ├── 0002_pwg.wav ├── 0002_real.wav ├── 0003_lvc.wav ├── 0003_pwg.wav ├── 0003_real.wav ├── 0004_lvc.wav ├── 0004_pwg.wav ├── 0004_real.wav ├── 0005_gpwg.wav ├── 0005_lvc.wav ├── 0005_real.wav ├── 0006_lvc.wav ├── 0006_pwg.wav ├── 0006_real.wav ├── 0007_lvc.wav ├── 0007_pwg.wav ├── 0007_real.wav ├── 0008_lvc.wav ├── 0008_pwg.wav ├── 0008_real.wav ├── 0009_lvc.wav ├── 0009_pwg.wav ├── 0009_real.wav ├── 0010_lvc.wav ├── 0010_pwg.wav ├── 0010_real.wav ├── evaluate-loss.png └── train-loss.png ├── scripts └── preprocess.sh ├── test ├── __init__.py ├── test_dataset.py ├── test_log.py └── test_others.ipynb └── vocoder ├── __init__.py ├── audio ├── __init__.py ├── mel.py ├── stft.py └── util.py ├── datasets ├── __init__.py ├── audio_mel.py └── utils.py ├── hparams.py ├── inference.py ├── layers ├── __init__.py ├── causal_conv.py ├── location_variable_conv.py ├── pqmf.py ├── residual_block.py ├── residual_stack.py └── upsample.py ├── losses ├── __init__.py ├── pwg_loss.py └── stft_loss.py ├── models ├── __init__.py ├── lvcgan.py ├── lvcnet.py ├── melgan.py └── parallel_wavegan.py ├── optimizers ├── __init__.py ├── pwg_opt.py └── radam.py ├── preprocess.py ├── strategy ├── __init__.py ├── base.py └── pwg_strategy.py ├── test.py ├── train.py └── utils ├── __init__.py └── log.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | exps/ 3 | temp/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LVCNet: Efficient Condition-Dependent Modeling Network for Waveform Generation 3 | 4 | Using LVCNet to design the generator of Parallel WaveGAN and the *same strategy* to train it, 5 | the inference speed of the new vocoder is **more than 5x faster** than the original vocoder 6 | *without any degradation in audio quality*. 
7 | 8 | Our current work [[Paper](https://arxiv.org/abs/2102.10815)] has been accepted by ICASSP 2021, and our previous work is described in [MelGlow](https://arxiv.org/abs/2012.01684). 9 | 10 | ## Training and Test 11 | 12 | 1. Prepare the data: download the `LJSpeech` dataset from https://keithito.com/LJ-Speech-Dataset/, 13 | and save it in `data/LJSpeech-1.1`. Then run 14 | ```python 15 | python -m vocoder.preprocess --data-dir ./data/LJSpeech-1.1 --config configs/lvcgan.v1.yaml 16 | ``` 17 | The mel-spectrums are calculated and saved in the folder `temp/`. 18 | 19 | 2. Train LVCNet 20 | ```python 21 | python -m vocoder.train --config configs/lvcgan.v1.yaml --exp-dir exps/exp.lvcgan.v1 22 | ``` 23 | 24 | 3. Test LVCNet 25 | ```python 26 | python -m vocoder.test --config configs/lvcgan.v1.yaml --exp-dir exps/exp.lvcgan.v1 27 | ``` 28 | 29 | 4. The experimental results, including training logs, model checkpoints, and synthesized audio, are stored in the folder `exps/exp.lvcgan.v1/`. 30 | Similarly, you can also use the config file `configs/pwg.v1.yaml` to train a Parallel WaveGAN model. 31 | ```python 32 | # training 33 | python -m vocoder.train --config configs/pwg.v1.yaml --exp-dir exps/exp.pwg.v1 34 | # test 35 | python -m vocoder.test --config configs/pwg.v1.yaml --exp-dir exps/exp.pwg.v1 36 | ``` 37 | 38 | ## Results 39 | 40 | ### TensorBoard 41 | 42 | Use TensorBoard to view the training process: 43 | 44 | ``` 45 | tensorboard --logdir exps 46 | ``` 47 | 48 | ### Training Loss 49 | ![image](samples/train-loss.png) 50 | 51 | ### Evaluation Loss 52 | ![image](samples/evaluate-loss.png) 53 | 54 | 55 | ### Audio Samples 56 | 57 | Audio samples are saved in `samples/`, where 58 | - `samples/*_lvc.wav` are generated by LVCNet, 59 | - `samples/*_pwg.wav` are generated by Parallel WaveGAN, 60 | - `samples/*_real.wav` are the real recordings. 61 | 62 | 63 | ## Reference 64 | > LVCNet: Efficient Condition-Dependent Modeling Network for Waveform Generation, https://arxiv.org/abs/2102.10815 65 | > MelGlow: Efficient Waveform Generative Network Based on Location-Variable Convolution, https://arxiv.org/abs/2012.01684 66 | > https://github.com/kan-bayashi/ParallelWaveGAN 67 | > https://github.com/lmnt-com/diffwave 68 | -------------------------------------------------------------------------------- /configs/lvcgan.v1.yaml: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # AUDIO & MEL-SPECTRUM # 3 | ########################################################### 4 | sample_rate: 22050 5 | hop_length: 256 6 | win_length: 1024 7 | n_fft: 1024 8 | n_mels: 80 9 | mel_fmin: 70 10 | mel_fmax: 8000 11 | 12 | 13 | ########################################################### 14 | # MODEL SETTING # 15 | ########################################################### 16 | model_name: "LVCNetWaveGAN" 17 | model_params: 18 | generator_params: 19 | in_channels: 1 # Number of input channels. 20 | out_channels: 1 # Number of output channels. 21 | inner_channels: 8 22 | cond_channels: 80 23 | cond_hop_length: 256 24 | lvc_block_nums: 3 25 | lvc_layers_each_block: 10 26 | lvc_kernel_size: 3 27 | kpnet_hidden_channels: 64 28 | kpnet_conv_size: 1 29 | dropout: 0.0 30 | use_weight_norm: true # Whether to use weight norm. 31 | # If set to true, it will be applied to all of the conv layers. 32 | 33 | discriminator_params: 34 | in_channels: 1 # Number of input channels. 35 | out_channels: 1 # Number of output channels.
36 | kernel_size: 3 # Number of output channels. 37 | layers: 10 # Number of conv layers. 38 | conv_channels: 64 # Number of chnn layers. 39 | bias: true # Whether to use bias parameter in conv. 40 | use_weight_norm: true # Whether to use weight norm. 41 | # If set to true, it will be applied to all of the conv layers. 42 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 43 | nonlinear_activation_params: # Nonlinear function parameters 44 | negative_slope: 0.2 # Alpha in LeakyReLU. 45 | 46 | 47 | ########################################################### 48 | # LOSS SETTING # 49 | ########################################################### 50 | loss_name: "PWGLoss" 51 | loss_params: 52 | stft_loss_params: 53 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 54 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 55 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 56 | window: "hann_window" # Window function for STFT-based loss 57 | 58 | 59 | ########################################################### 60 | # OPTIMIZER & SCHEDULER SETTING # 61 | ########################################################### 62 | opt_name: "PWGOptimizer" 63 | opt_params: 64 | generator_optimizer_params: 65 | lr: 0.0001 # Generator's learning rate. 66 | eps: 1.0e-6 # Generator's epsilon. 67 | weight_decay: 0.0 # Generator's weight decay coefficient. 68 | generator_scheduler_params: 69 | step_size: 200000 # Generator's scheduler step size. 70 | gamma: 0.5 # Generator's scheduler gamma. 71 | # At each step size, lr will be multiplied by this parameter. 72 | discriminator_optimizer_params: 73 | lr: 0.00005 # Discriminator's learning rate. 74 | eps: 1.0e-6 # Discriminator's epsilon. 75 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 76 | discriminator_scheduler_params: 77 | step_size: 200000 # Discriminator's scheduler step size. 78 | gamma: 0.5 # Discriminator's scheduler gamma. 79 | # At each step size, lr will be multiplied by this parameter. 80 | 81 | 82 | ########################################################### 83 | # STRATEGY SETTING # 84 | ########################################################### 85 | strategy_name: "PWGStrategy" 86 | strategy_params: 87 | lambda_adv: 4.0 88 | discriminator_start_steps: 100000 89 | generator_grad_norm: 10 # Generator's gradient norm. 90 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 91 | 92 | 93 | ########################################################### 94 | # TRANINING SETTING # 95 | ########################################################### 96 | dataset_classname: "PWGAudioMelNoiseDataset" 97 | dataset_num_workers: 5 98 | batch_mel_length: 48 99 | train_batch_size: 8 100 | max_train_steps: 800000 101 | log_interval_steps: 100 102 | 103 | 104 | ########################################################### 105 | # EVALUATE & TEST # 106 | ########################################################### 107 | eval_interval_steps: 1000 108 | 109 | 110 | -------------------------------------------------------------------------------- /configs/parallel_wavegan.v1.yaml: -------------------------------------------------------------------------------- 1 | # This is the hyperparameter configuration file for Parallel WaveGAN. 2 | # Please make sure this is adjusted for the LJSpeech dataset. If you want to 3 | # apply to the other dataset, you might need to carefully change some parameters. 
4 | # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V. 5 | 6 | ########################################################### 7 | # FEATURE EXTRACTION SETTING # 8 | ########################################################### 9 | sampling_rate: 22050 # Sampling rate. 10 | fft_size: 1024 # FFT size. 11 | hop_size: 256 # Hop size. 12 | win_length: null # Window length. 13 | # If set to null, it will be the same as fft_size. 14 | window: "hann" # Window function. 15 | num_mels: 80 # Number of mel basis. 16 | fmin: 80 # Minimum freq in mel basis calculation. 17 | fmax: 7600 # Maximum frequency in mel basis calculation. 18 | global_gain_scale: 1.0 # Will be multiplied to all of waveform. 19 | trim_silence: true # Whether to trim the start and end of silence. 20 | trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good. 21 | trim_frame_size: 2048 # Frame size in trimming. 22 | trim_hop_size: 512 # Hop size in trimming. 23 | format: "hdf5" # Feature file format. "npy" or "hdf5" is supported. 24 | 25 | ########################################################### 26 | # GENERATOR NETWORK ARCHITECTURE SETTING # 27 | ########################################################### 28 | generator_params: 29 | in_channels: 1 # Number of input channels. 30 | out_channels: 1 # Number of output channels. 31 | kernel_size: 3 # Kernel size of dilated convolution. 32 | layers: 30 # Number of residual block layers. 33 | stacks: 3 # Number of stacks i.e., dilation cycles. 34 | residual_channels: 64 # Number of channels in residual conv. 35 | gate_channels: 128 # Number of channels in gated conv. 36 | skip_channels: 64 # Number of channels in skip conv. 37 | aux_channels: 80 # Number of channels for auxiliary feature conv. 38 | # Must be the same as num_mels. 39 | aux_context_window: 2 # Context window size for auxiliary feature. 40 | # If set to 2, previous 2 and future 2 frames will be considered. 41 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 42 | use_weight_norm: true # Whether to use weight norm. 43 | # If set to true, it will be applied to all of the conv layers. 44 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 45 | upsample_params: # Upsampling network parameters. 46 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size. 47 | 48 | ########################################################### 49 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 50 | ########################################################### 51 | discriminator_params: 52 | in_channels: 1 # Number of input channels. 53 | out_channels: 1 # Number of output channels. 54 | kernel_size: 3 # Number of output channels. 55 | layers: 10 # Number of conv layers. 56 | conv_channels: 64 # Number of chnn layers. 57 | bias: true # Whether to use bias parameter in conv. 58 | use_weight_norm: true # Whether to use weight norm. 59 | # If set to true, it will be applied to all of the conv layers. 60 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 61 | nonlinear_activation_params: # Nonlinear function parameters 62 | negative_slope: 0.2 # Alpha in LeakyReLU. 63 | 64 | ########################################################### 65 | # STFT LOSS SETTING # 66 | ########################################################### 67 | stft_loss_params: 68 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
69 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 70 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 71 | window: "hann_window" # Window function for STFT-based loss 72 | 73 | ########################################################### 74 | # ADVERSARIAL LOSS SETTING # 75 | ########################################################### 76 | lambda_adv: 4.0 # Loss balancing coefficient. 77 | 78 | ########################################################### 79 | # DATA LOADER SETTING # 80 | ########################################################### 81 | batch_size: 6 # Batch size. 82 | batch_max_steps: 25600 # Length of each audio in batch. Make sure dividable by hop_size. 83 | pin_memory: true # Whether to pin memory in Pytorch DataLoader. 84 | num_workers: 2 # Number of workers in Pytorch DataLoader. 85 | remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. 86 | allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. 87 | 88 | ########################################################### 89 | # OPTIMIZER & SCHEDULER SETTING # 90 | ########################################################### 91 | generator_optimizer_params: 92 | lr: 0.0001 # Generator's learning rate. 93 | eps: 1.0e-6 # Generator's epsilon. 94 | weight_decay: 0.0 # Generator's weight decay coefficient. 95 | generator_scheduler_params: 96 | step_size: 200000 # Generator's scheduler step size. 97 | gamma: 0.5 # Generator's scheduler gamma. 98 | # At each step size, lr will be multiplied by this parameter. 99 | generator_grad_norm: 10 # Generator's gradient norm. 100 | discriminator_optimizer_params: 101 | lr: 0.00005 # Discriminator's learning rate. 102 | eps: 1.0e-6 # Discriminator's epsilon. 103 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 104 | discriminator_scheduler_params: 105 | step_size: 200000 # Discriminator's scheduler step size. 106 | gamma: 0.5 # Discriminator's scheduler gamma. 107 | # At each step size, lr will be multiplied by this parameter. 108 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 109 | 110 | ########################################################### 111 | # INTERVAL SETTING # 112 | ########################################################### 113 | discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. 114 | train_max_steps: 400000 # Number of training steps. 115 | save_interval_steps: 5000 # Interval steps to save checkpoint. 116 | eval_interval_steps: 1000 # Interval steps to evaluate the network. 117 | log_interval_steps: 100 # Interval steps to record the training log. 118 | 119 | ########################################################### 120 | # OTHER SETTING # 121 | ########################################################### 122 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
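The configuration above is plain YAML, so it can be inspected or reused directly from Python. Below is a minimal sketch of loading it; `yaml.safe_load` is standard PyYAML, while the commented-out generator import and constructor are only an assumption about how `generator_params` might be consumed, not code taken from this repository.

```python
# Minimal sketch: load a vocoder config and read a few settings.
# Assumes PyYAML is installed; the generator class at the end is hypothetical
# and only illustrates how `generator_params` could be passed to a model.
import yaml

with open("configs/parallel_wavegan.v1.yaml") as f:
    config = yaml.safe_load(f)  # plain dict of the settings above

print(config["sampling_rate"], config["hop_size"], config["num_mels"])

# Hypothetical use of the generator parameters:
# from vocoder.models.parallel_wavegan import ParallelWaveGANGenerator
# generator = ParallelWaveGANGenerator(**config["generator_params"])
```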
123 | -------------------------------------------------------------------------------- /configs/pwg.v1.yaml: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # AUDIO & MEL-SPECTRUM # 3 | ########################################################### 4 | sample_rate: 22050 5 | hop_length: 256 6 | win_length: 1024 7 | n_fft: 1024 8 | n_mels: 80 9 | mel_fmin: 70 10 | mel_fmax: 8000 11 | 12 | 13 | ########################################################### 14 | # MODEL SETTING # 15 | ########################################################### 16 | model_name: "ParallelWaveGAN" 17 | model_params: 18 | generator_params: 19 | in_channels: 1 # Number of input channels. 20 | out_channels: 1 # Number of output channels. 21 | kernel_size: 3 # Kernel size of dilated convolution. 22 | layers: 30 # Number of residual block layers. 23 | stacks: 3 # Number of stacks i.e., dilation cycles. 24 | residual_channels: 64 # Number of channels in residual conv. 25 | gate_channels: 128 # Number of channels in gated conv. 26 | skip_channels: 64 # Number of channels in skip conv. 27 | aux_channels: 80 # Number of channels for auxiliary feature conv. 28 | # Must be the same as num_mels. 29 | aux_context_window: 2 # Context window size for auxiliary feature. 30 | # If set to 2, previous 2 and future 2 frames will be considered. 31 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 32 | use_weight_norm: true # Whether to use weight norm. 33 | # If set to true, it will be applied to all of the conv layers. 34 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 35 | upsample_params: # Upsampling network parameters. 36 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size. 37 | 38 | discriminator_params: 39 | in_channels: 1 # Number of input channels. 40 | out_channels: 1 # Number of output channels. 41 | kernel_size: 3 # Kernel size of conv layers. 42 | layers: 10 # Number of conv layers. 43 | conv_channels: 64 # Number of channels in conv layers. 44 | bias: true # Whether to use bias parameter in conv. 45 | use_weight_norm: true # Whether to use weight norm. 46 | # If set to true, it will be applied to all of the conv layers. 47 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 48 | nonlinear_activation_params: # Nonlinear function parameters 49 | negative_slope: 0.2 # Alpha in LeakyReLU. 50 | 51 | 52 | ########################################################### 53 | # LOSS SETTING # 54 | ########################################################### 55 | loss_name: "PWGLoss" 56 | loss_params: 57 | stft_loss_params: 58 | fft_sizes: [1024, 2048, 512] # List of FFT sizes for STFT-based loss. 59 | hop_sizes: [120, 240, 50] # List of hop sizes for STFT-based loss. 60 | win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss. 61 | window: "hann_window" # Window function for STFT-based loss. 62 | 63 | 64 | ########################################################### 65 | # OPTIMIZER & SCHEDULER SETTING # 66 | ########################################################### 67 | opt_name: "PWGOptimizer" 68 | opt_params: 69 | generator_optimizer_params: 70 | lr: 0.0001 # Generator's learning rate. 71 | eps: 1.0e-6 # Generator's epsilon. 72 | weight_decay: 0.0 # Generator's weight decay coefficient. 73 | generator_scheduler_params: 74 | step_size: 200000 # Generator's scheduler step size. 75 | gamma: 0.5 # Generator's scheduler gamma.
76 | # At each step size, lr will be multiplied by this parameter. 77 | discriminator_optimizer_params: 78 | lr: 0.00005 # Discriminator's learning rate. 79 | eps: 1.0e-6 # Discriminator's epsilon. 80 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 81 | discriminator_scheduler_params: 82 | step_size: 200000 # Discriminator's scheduler step size. 83 | gamma: 0.5 # Discriminator's scheduler gamma. 84 | # At each step size, lr will be multiplied by this parameter. 85 | 86 | 87 | ########################################################### 88 | # STRATEGY SETTING # 89 | ########################################################### 90 | strategy_name: "PWGStrategy" 91 | strategy_params: 92 | lambda_adv: 4.0 93 | discriminator_start_steps: 100000 94 | generator_grad_norm: 10 # Generator's gradient norm. 95 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 96 | 97 | 98 | ########################################################### 99 | # TRANINING SETTING # 100 | ########################################################### 101 | dataset_classname: "PWGAudioMelNoiseDataset" 102 | dataset_num_workers: 5 103 | batch_mel_length: 48 104 | train_batch_size: 8 105 | max_train_steps: 400000 106 | log_interval_steps: 100 107 | 108 | 109 | ########################################################### 110 | # EVALUATE & TEST # 111 | ########################################################### 112 | eval_interval_steps: 1000 113 | 114 | 115 | -------------------------------------------------------------------------------- /samples/0001_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_lvc.wav -------------------------------------------------------------------------------- /samples/0001_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_pwg.wav -------------------------------------------------------------------------------- /samples/0001_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_real.wav -------------------------------------------------------------------------------- /samples/0002_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_lvc.wav -------------------------------------------------------------------------------- /samples/0002_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_pwg.wav -------------------------------------------------------------------------------- /samples/0002_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_real.wav -------------------------------------------------------------------------------- /samples/0003_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_lvc.wav 
-------------------------------------------------------------------------------- /samples/0003_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_pwg.wav -------------------------------------------------------------------------------- /samples/0003_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_real.wav -------------------------------------------------------------------------------- /samples/0004_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_lvc.wav -------------------------------------------------------------------------------- /samples/0004_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_pwg.wav -------------------------------------------------------------------------------- /samples/0004_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_real.wav -------------------------------------------------------------------------------- /samples/0005_gpwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_gpwg.wav -------------------------------------------------------------------------------- /samples/0005_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_lvc.wav -------------------------------------------------------------------------------- /samples/0005_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_real.wav -------------------------------------------------------------------------------- /samples/0006_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_lvc.wav -------------------------------------------------------------------------------- /samples/0006_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_pwg.wav -------------------------------------------------------------------------------- /samples/0006_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_real.wav -------------------------------------------------------------------------------- /samples/0007_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_lvc.wav 
-------------------------------------------------------------------------------- /samples/0007_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_pwg.wav -------------------------------------------------------------------------------- /samples/0007_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_real.wav -------------------------------------------------------------------------------- /samples/0008_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_lvc.wav -------------------------------------------------------------------------------- /samples/0008_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_pwg.wav -------------------------------------------------------------------------------- /samples/0008_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_real.wav -------------------------------------------------------------------------------- /samples/0009_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_lvc.wav -------------------------------------------------------------------------------- /samples/0009_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_pwg.wav -------------------------------------------------------------------------------- /samples/0009_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_real.wav -------------------------------------------------------------------------------- /samples/0010_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_lvc.wav -------------------------------------------------------------------------------- /samples/0010_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_pwg.wav -------------------------------------------------------------------------------- /samples/0010_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_real.wav -------------------------------------------------------------------------------- /samples/evaluate-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/evaluate-loss.png 
-------------------------------------------------------------------------------- /samples/train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/train-loss.png -------------------------------------------------------------------------------- /scripts/preprocess.sh: -------------------------------------------------------------------------------- 1 | 2 | python -m vocoder.preprocess \ 3 | --data-dir ../data/LJSpeech-1.1 \ 4 | --config configs/lvcgan.v1.yaml 5 | 6 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/test/__init__.py -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import tqdm 3 | from vocoder.datasets import create_dataloader 4 | 5 | def test_mel_audio_dataset(): 6 | dataset_config = { 7 | 'metadata_file': 'temp/metadata.txt', 8 | 'hop_length': 256, 9 | 'batch_mel_length': 64 10 | } 11 | 12 | dataloader = create_dataloader( "AudioMelNoiseDataset", 13 | dataset_config=dataset_config, 14 | batch_size=4, 15 | shuffle=True, 16 | num_workers=4, 17 | drop_last=False ) 18 | for batch in tqdm.tqdm(dataloader): 19 | wavs, mels, noises = batch 20 | 21 | 22 | 23 | if __name__ == "__main__": 24 | test_mel_audio_dataset() 25 | -------------------------------------------------------------------------------- /test/test_log.py: -------------------------------------------------------------------------------- 1 | 2 | import os, logging, time 3 | from vocoder.utils.log import Logger 4 | 5 | def test_logger(): 6 | exp_dir = 'exps/exp-test' 7 | os.makedirs( exp_dir, exist_ok=True ) 8 | 9 | log = Logger( exp_dir ) 10 | log.info('log test finish.') 11 | 12 | while True: 13 | time.sleep(1) 14 | log.add_scalars('train', {'test': 0.2}, 1) 15 | 16 | if __name__ == "__main__": 17 | test_logger() 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/test_others.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('..')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import torch \n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "mel_file = '../temp/mels/LJ001-0001.wav.mel.npy'\n", 30 | "\n", 31 | "a = np.load(mel_file)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "array([[0.13142769, 0.16869499, 0.21409377, ..., 0.14441383, 0.17254333,\n", 43 | " 0.14861533],\n", 44 | " [0.19932327, 0.26880988, 0.2584364 , ..., 0.5645433 , 0.59844124,\n", 45 | " 0.5185001 ],\n", 46 | " [0.30107734, 0.33759803, 0.33762237, ..., 0.6241323 , 0.66163576,\n", 47 | " 0.5661982 ],\n", 48 | " ...,\n", 49 | " [0.06722992, 0.22493614, 0.30000472, ..., 0.22025412, 
0.23424743,\n", 50 | " 0.17675872],\n", 51 | " [0.1122142 , 0.23285972, 0.28549805, ..., 0.22142944, 0.24440841,\n", 52 | " 0.19024192],\n", 53 | " [0.21358238, 0.2089363 , 0.18085976, ..., 0.25496688, 0.219793 ,\n", 54 | " 0.19626419]], dtype=float32)" 55 | ] 56 | }, 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "a" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "0.0" 75 | ] 76 | }, 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "a.min()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 7, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "ename": "FileNotFoundError", 93 | "evalue": "[Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint.pt'", 94 | "output_type": "error", 95 | "traceback": [ 96 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 97 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 98 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../exps/exp-20201018-032250/checkpoint.pt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 99 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0mpickle_load_args\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'encoding'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 581\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0m_open_file_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 582\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_zipfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;31m# The zipfile reader is going to advance the current file position.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36m_open_file_like\u001b[0;34m(name_or_buffer, mode)\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_open_file_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0m_open_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'w'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 101 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0m_open_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_opener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_open_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__exit__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 102 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint.pt'" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "torch.load('../exps/exp-20201018-032250/checkpoint.pt')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 10, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "ename": "FileNotFoundError", 117 | "evalue": "[Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint'", 118 | "output_type": "error", 119 | "traceback": [ 120 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 121 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 122 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadlink\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m'../exps/exp-20201018-032250/checkpoint'\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 123 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint'" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "import os \n", 129 | "os.readlink( '../exps/exp-20201018-032250/checkpoint' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | 
"execution_count": 9, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "False" 141 | ] 142 | }, 143 | "execution_count": 9, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "os.path.isfile( '../exps/exp-20201018-032250/checkpoint.pt' )" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "os.path.islink()" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.8.3" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 4 183 | } 184 | -------------------------------------------------------------------------------- /vocoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/__init__.py -------------------------------------------------------------------------------- /vocoder/audio/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .util import load_wav_to_torch 3 | from .stft import TacotronSTFT 4 | from .mel import griffin_lim_inverse_mel 5 | -------------------------------------------------------------------------------- /vocoder/audio/mel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from .stft import TacotronSTFT 5 | from .util import griffin_lim 6 | 7 | 8 | def griffin_lim_inverse_mel(mel, tacotron_stft, griffin_iters=60): 9 | ''' generate waveform using griffin_lim according to the mel-spectrums. 10 | 11 | Args: 12 | mel (Tensor): shape (B, L, C) 13 | tacotron_stft (TacotronSTFT): A transformation class to calcuate the mel-spectrum 14 | griffin_iters (int): the iters for griffin_lim 15 | 16 | Returns: 17 | audio (Tensor): the generated waveform. 
18 | ''' 19 | # mel = torch.stack([torch.from_numpy(_denormalize(mel.numpy()))]) 20 | mel_decompress = tacotron_stft.spectral_de_normalize(mel) 21 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 22 | spec_from_mel_scaling = 1000 23 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 24 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 25 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 26 | 27 | audio = griffin_lim(torch.autograd.Variable( 28 | spec_from_mel[:, :, :-1]), tacotron_stft.stft_fn, griffin_iters) 29 | 30 | return audio 31 | -------------------------------------------------------------------------------- /vocoder/audio/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.autograd import Variable 4 | import numpy as np 5 | 6 | from scipy.signal import get_window 7 | from librosa.util import pad_center, tiny 8 | from librosa.filters import mel as librosa_mel_fn 9 | 10 | from .util import window_sumsquare 11 | from .util import dynamic_range_compression 12 | from .util import dynamic_range_decompression 13 | 14 | 15 | class STFT(torch.nn.Module): 16 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 17 | 18 | def __init__(self, filter_length, hop_length, win_length, 19 | window='hann'): 20 | super(STFT, self).__init__() 21 | self.filter_length = filter_length 22 | self.hop_length = hop_length 23 | self.win_length = win_length 24 | self.window = window 25 | self.forward_transform = None 26 | scale = self.filter_length / self.hop_length 27 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 28 | 29 | cutoff = int((self.filter_length / 2 + 1)) 30 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 31 | np.imag(fourier_basis[:cutoff, :])]) 32 | 33 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 34 | inverse_basis = torch.FloatTensor( 35 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 36 | 37 | if window is not None: 38 | assert(filter_length >= win_length) 39 | # get window and zero center pad it to filter_length 40 | fft_window = get_window(window, win_length, fftbins=True) 41 | fft_window = pad_center(fft_window, filter_length) 42 | fft_window = torch.from_numpy(fft_window).float() 43 | 44 | # window the bases 45 | forward_basis *= fft_window 46 | inverse_basis *= fft_window 47 | 48 | self.register_buffer('forward_basis', forward_basis.float()) 49 | self.register_buffer('inverse_basis', inverse_basis.float()) 50 | 51 | def transform(self, input_data): 52 | num_batches = input_data.size(0) 53 | num_samples = input_data.size(1) 54 | 55 | self.num_samples = num_samples 56 | 57 | # similar to librosa, reflect-pad the input 58 | input_data = input_data.view(num_batches, 1, num_samples) 59 | input_data = F.pad( 60 | input_data.unsqueeze(1), 61 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 62 | mode='reflect') 63 | input_data = input_data.squeeze(1) 64 | 65 | forward_transform = F.conv1d( 66 | input_data.cuda(), 67 | Variable(self.forward_basis, requires_grad=False).cuda(), 68 | stride=self.hop_length, 69 | padding=0).cpu() 70 | 71 | cutoff = int((self.filter_length / 2) + 1) 72 | real_part = forward_transform[:, :cutoff, :] 73 | imag_part = forward_transform[:, cutoff:, :] 74 | 75 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 76 | phase = torch.autograd.Variable( 77 | torch.atan2(imag_part.data, real_part.data)) 78 | 79 | return magnitude, phase 80 
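    # Note on transform(): the STFT is computed as a strided 1-D convolution with a
    # fixed Fourier basis; the first half of the output channels hold the real parts
    # and the second half the imaginary parts, from which the magnitude
    # (sqrt(re**2 + im**2)) and phase (atan2(im, re)) are derived. As written,
    # transform() moves the input and basis to the GPU (`.cuda()`), so it requires
    # a CUDA-capable device.
    #
    # Illustrative usage (shapes assume filter_length=1024):
    #   stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
    #   mag, phase = stft.transform(audio)   # audio: (B, L) -> mag, phase: (B, 513, T)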
| 81 | def inverse(self, magnitude, phase): 82 | recombine_magnitude_phase = torch.cat( 83 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 84 | 85 | inverse_transform = F.conv_transpose1d( 86 | recombine_magnitude_phase, 87 | Variable(self.inverse_basis, requires_grad=False), 88 | stride=self.hop_length, 89 | padding=0) 90 | 91 | if self.window is not None: 92 | window_sum = window_sumsquare( 93 | self.window, magnitude.size(-1), hop_length=self.hop_length, 94 | win_length=self.win_length, n_fft=self.filter_length, 95 | dtype=np.float32) 96 | # remove modulation effects 97 | approx_nonzero_indices = torch.from_numpy( 98 | np.where(window_sum > tiny(window_sum))[0]) 99 | window_sum = torch.autograd.Variable( 100 | torch.from_numpy(window_sum), requires_grad=False) 101 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 102 | inverse_transform[:, :, 103 | approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 104 | 105 | # scale by hop ratio 106 | inverse_transform *= float(self.filter_length) / self.hop_length 107 | 108 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 109 | inverse_transform = inverse_transform[:, 110 | :, :-int(self.filter_length/2):] 111 | 112 | return inverse_transform 113 | 114 | def forward(self, input_data): 115 | self.magnitude, self.phase = self.transform(input_data) 116 | reconstruction = self.inverse(self.magnitude, self.phase) 117 | return reconstruction 118 | 119 | 120 | class TacotronSTFT(torch.nn.Module): 121 | ''' A mel-spectrum Transfomation class 122 | 123 | Example: 124 | n_fft = 1024 125 | hop_length = 256 126 | win_length = 1024 127 | n_mels = 80 128 | mel_fmin = 70 129 | mel_fmax = 8000 130 | sample_rate = 22050 131 | 132 | taco_stft = TacotronSTFT( filter_length=n_fft, 133 | hop_length=ho_length, 134 | win_length=win_length, 135 | n_mel_channels=n_mels, 136 | sampling_rate=sample_rate, 137 | mel_fmin=mel_fmin, 138 | mel_fmax=mel_fmax) 139 | 140 | audio, sr = load_wav_to_torch('path/to/00001.wav') 141 | mel, _ = taco_stft.mel_spectrogram(audio) 142 | ''' 143 | def __init__(self, filter_length, hop_length, win_length, 144 | n_mel_channels, sampling_rate, mel_fmin=0.0, 145 | mel_fmax=8000.0): 146 | super(TacotronSTFT, self).__init__() 147 | self.n_mel_channels = n_mel_channels 148 | self.sampling_rate = sampling_rate 149 | self.stft_fn = STFT(filter_length, hop_length, win_length) 150 | mel_basis = librosa_mel_fn( 151 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 152 | mel_basis = torch.from_numpy(mel_basis).float() 153 | self.register_buffer('mel_basis', mel_basis) 154 | 155 | def spectral_normalize(self, magnitudes): 156 | output = dynamic_range_compression(magnitudes) 157 | return output 158 | 159 | def spectral_de_normalize(self, magnitudes): 160 | output = dynamic_range_decompression(magnitudes) 161 | return output 162 | 163 | def mel_spectrogram(self, audio): 164 | """Computes mel-spectrograms from a batch of wav 165 | 166 | Args: 167 | audio: Variable(torch.FloatTensor) with shape (B, L) in range [-1, 1] 168 | 169 | Returns: 170 | mel (Tensor): torch.FloatTensor of shape (B, n_mel_channels, T) 171 | energy (Tensor): shape (B, n_fft, T) 172 | """ 173 | y = torch.autograd.Variable(audio, requires_grad=False) 174 | 175 | assert(torch.min(y.data) >= -1) 176 | assert(torch.max(y.data) <= 1) 177 | 178 | magnitudes, phases = self.stft_fn.transform(y) 179 | magnitudes = magnitudes.data 180 | mel_output = torch.matmul(self.mel_basis, magnitudes) 181 | mel_output = 
self.spectral_normalize(mel_output) 182 | energy = torch.norm(magnitudes, dim=1) 183 | 184 | return mel_output, energy 185 | -------------------------------------------------------------------------------- /vocoder/audio/util.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import numpy as np 4 | from scipy.signal import get_window 5 | import librosa 6 | 7 | 8 | def window_sumsquare(window, n_frames, hop_length, win_length, 9 | n_fft, dtype=np.float32, norm=None): 10 | """ 11 | # from librosa 0.6 12 | Compute the sum-square envelope of a window function at a given hop length. 13 | 14 | This is used to estimate modulation effects induced by windowing 15 | observations in short-time fourier transforms. 16 | 17 | Parameters 18 | ---------- 19 | window : string, tuple, number, callable, or list-like 20 | Window specification, as in `get_window` 21 | 22 | n_frames : int > 0 23 | The number of analysis frames 24 | 25 | hop_length : int > 0 26 | The number of samples to advance between frames 27 | 28 | win_length : [optional] 29 | The length of the window function. By default, this matches `n_fft`. 30 | 31 | n_fft : int > 0 32 | The length of each analysis frame. 33 | 34 | dtype : np.dtype 35 | The data type of the output 36 | 37 | Returns 38 | ------- 39 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 40 | The sum-squared envelope of the window function 41 | """ 42 | if win_length is None: 43 | win_length = n_fft 44 | 45 | n = n_fft + hop_length * (n_frames - 1) 46 | x = np.zeros(n, dtype=dtype) 47 | 48 | # Compute the squared window at the desired length 49 | win_sq = get_window(window, win_length, fftbins=True) 50 | win_sq = librosa.util.normalize(win_sq, norm=norm)**2 51 | win_sq = librosa.util.pad_center(win_sq, n_fft) 52 | 53 | # Fill the envelope 54 | for i in range(n_frames): 55 | sample = i * hop_length 56 | x[sample:min(n, sample + n_fft) 57 | ] += win_sq[:max(0, min(n_fft, n - sample))] 58 | return x 59 | 60 | 61 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 62 | """ 63 | PARAMS 64 | ------ 65 | magnitudes: spectrogram magnitudes 66 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 67 | """ 68 | 69 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 70 | angles = angles.astype(np.float32) 71 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 72 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 73 | 74 | for i in range(n_iters): 75 | _, angles = stft_fn.transform(signal) 76 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 77 | return signal 78 | 79 | 80 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 81 | """ 82 | PARAMS 83 | ------ 84 | C: compression factor 85 | """ 86 | x = 20 * torch.log10(torch.clamp(x, min=clip_val)) - 20 87 | x = torch.clamp((x + 100) / 100, 0.0, 1.0) 88 | return x 89 | 90 | 91 | def dynamic_range_decompression(x, C=1): 92 | """ 93 | PARAMS 94 | ------ 95 | C: compression factor used to compress 96 | """ 97 | x = x * 100 - 100 98 | x = torch.pow(10, x/20 + 1) 99 | return x 100 | 101 | 102 | def load_wav_to_torch(wav_file, sample_rate): 103 | ''' load wav and convert into Tensor. 104 | 105 | Args: 106 | wav_file (str): the path of wav file. 
107 | sample_rate (int): sample_rate 108 | 109 | Returns: 110 | audio (Tensor): shape (1, L) 111 | sample_rate (int) 112 | ''' 113 | data, sr = librosa.load(wav_file, sample_rate) 114 | if len(data.shape) != 1: 115 | raise ValueError( f"the audio ({wav_file}) is not single channel." ) 116 | return torch.FloatTensor(data).unsqueeze(0), sr 117 | 118 | -------------------------------------------------------------------------------- /vocoder/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .audio_mel import PWGAudioMelNoiseDataset, DataLoader 3 | 4 | dataset_class_dict = { 5 | "PWGAudioMelNoiseDataset": PWGAudioMelNoiseDataset 6 | } 7 | 8 | def create_dataloader(dataset_classname, 9 | dataset_config, 10 | batch_size=1, 11 | collate_fn=None, 12 | shuffle=False, 13 | num_workers=0, 14 | drop_last=False, 15 | ) -> DataLoader: 16 | ''' create dataloader 17 | Args: 18 | dataset_classname (str) : the classname of dataset. 19 | dataset_config (dict): the config for dataset. 20 | ... 21 | Returns: 22 | Dataloader. 23 | ''' 24 | dataset = dataset_class_dict[ dataset_classname ]( **dataset_config ) 25 | dataloader = DataLoader( dataset, 26 | batch_size=batch_size, 27 | collate_fn=collate_fn, 28 | shuffle=shuffle, 29 | num_workers=num_workers, 30 | drop_last=drop_last) 31 | return dataloader 32 | 33 | -------------------------------------------------------------------------------- /vocoder/datasets/audio_mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.io import wavfile 4 | 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from vocoder.audio import load_wav_to_torch 9 | from .utils import read_metadata 10 | 11 | class PWGAudioMelNoiseDataset(Dataset): 12 | ''' the Pytorch Dataset for loading audio(.wav) and mel(.npy) ''' 13 | 14 | def __init__(self, metadata_file, batch_mel_length, sample_rate, hop_length, cut=True): 15 | '''Initialize 16 | Args: 17 | metadata_file (str): the file including paths of audio and mel. 18 | batch_mel_length (int): the length of mel-spectrum for batch. 19 | hop_length (int): the hop length used when calculating mel-spectrum. 20 | 21 | Description: 22 | Example of metadata_file: 23 | ./data/wavs/001.wav|./temp/mels/001.npy 24 | ./data/wavs/002.wav|./temp/mels/002.npy 25 | ./data/wavs/003.wav|./temp/mels/003.npy 26 | ''' 27 | super().__init__() 28 | self.batch_mel_length = batch_mel_length 29 | self.hop_length = hop_length 30 | self.sample_rate = sample_rate 31 | self.cut = cut 32 | 33 | self.metadata = read_metadata( metadata_file ) 34 | # metadata: contains paths of entire wav files and mel-spectrum files. 35 | # Examples: [ ('./data/wavs/001.wav', './dump/mels/001.npy'), ... 
] 36 | 37 | def __len__(self): 38 | return len(self.metadata) 39 | 40 | def __getitem__(self, idx): 41 | ''' 42 | Returns: 43 | Tensor (float): audio, shape (L,) 44 | Tensor (float): mel-spectrum, shape ( ML, MC) 45 | Tensor (float): guassian noise with the same shape as audio, shape (L,) 46 | 47 | Note: 48 | the length of mel-spectrum (ML) is equal to `batch_mel_length` 49 | the equation relationship between the length of audio and mel-spectrum: 50 | L = ML * hop_length 51 | ''' 52 | wav_path, mel_path = self.metadata[ idx ] 53 | 54 | audio, sr = load_wav_to_torch( wav_path, self.sample_rate ) 55 | assert sr == self.sample_rate 56 | 57 | mel = np.load( mel_path ) 58 | 59 | if self.cut: 60 | assert mel.shape[0] > self.batch_mel_length + 1, f"the length of audio is too short: {wav_path}" 61 | mel_start = np.random.randint( 0, mel.shape[0] - self.batch_mel_length - 1 ) 62 | audio_start = (mel_start + 2) * self.hop_length 63 | 64 | mel = mel[ mel_start : mel_start + self.batch_mel_length ] 65 | audio = audio[ :, audio_start : audio_start + (self.batch_mel_length - 4) * self.hop_length ] 66 | else: 67 | audio = audio[ :, 2*self.hop_length:(mel.shape[0] - 2) * self.hop_length ] 68 | 69 | mel = torch.from_numpy( mel.T ) 70 | noise = torch.randn_like( audio ) 71 | return audio, mel, noise 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /vocoder/datasets/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def read_metadata(metadata_path, split='|'): 6 | ''' read data from metadata file. 7 | 8 | Args: 9 | metadata_path (str): the path of metadata file. 10 | split (str): the char to split each line in metadata file. 11 | default: '|' 12 | Returns: 13 | list: data from metadata file. 14 | ''' 15 | with open(metadata_path, 'r', encoding='utf-8') as f: 16 | data = f.readlines() 17 | data = [ d.strip().split('|') for d in data ] 18 | return data 19 | 20 | 21 | def save_metadata(data, metadata_path, split='|'): 22 | '''save data to file as metadata. 23 | 24 | Args: 25 | data (list): data 26 | metadata_path (str): path for saving file. 27 | split (str): the char to join each element of data. 28 | Returns: 29 | None 30 | ''' 31 | with open(metadata_path, 'w', encoding='utf-8') as f: 32 | for d in data: 33 | line = split.join( d ) 34 | line.replace('\n',' ') 35 | f.write(line + '\n') -------------------------------------------------------------------------------- /vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import yaml 4 | 5 | 6 | class Hyperparameter: 7 | ''' hyperparameter manager ''' 8 | 9 | def __init__(self, config_file: str): 10 | ''' Hyperparameter 11 | Args: 12 | config_file (str): the config file. 
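        Example (an illustrative YAML sketch; the keys simply override the
        defaults assigned below, and any attribute name is accepted):
            sample_rate: 22050
            hop_length: 256
            model_name: LVCNetWaveGAN
            model_params: {}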
13 | ''' 14 | # Audio 15 | self.sample_rate = 22050 16 | self.hop_length = 256 17 | self.win_length = 1024 18 | self.n_fft = 1024 19 | self.n_mels = 80 20 | self.mel_fmax = 8000 21 | self.mel_fmin = 70 22 | 23 | # Moel 24 | self.model_name = 'ParallelWaveGAN' 25 | self.model_params = dict() 26 | 27 | # Loss 28 | self.loss_name = 'PWGLoss' 29 | self.loss_params = dict() 30 | 31 | # Optimizer 32 | self.opt_name = 'PWGOptimizer' 33 | self.opt_params = dict() 34 | 35 | # Strategy 36 | self.strategy_name = "PWGStrategy" 37 | self.strategy_params = dict() 38 | 39 | # Training 40 | self.dataset_classname = 'PWGAudioMelNoiseDataset' 41 | self.dataset_num_workers = 5 42 | self.train_metadata_file = 'temp/metadata.train.txt' 43 | self.batch_mel_length = 52 44 | self.train_batch_size = 8 45 | self.max_train_steps = 40000 46 | self.log_interval_steps = 100 47 | self.save_interval_steps = 10000 48 | 49 | # Evaluate 50 | self.eval_sample_num = 500 51 | self.eval_metadata_file = 'temp/metadata.eval.txt' 52 | self.eval_interval_steps = 1000 53 | 54 | # Test 55 | self.test_sample_num = 100 56 | self.test_metadata_file = 'temp/metadata.test.txt' 57 | 58 | 59 | with open(config_file, 'r', encoding='utf-8') as f: 60 | config = yaml.safe_load( f ) 61 | for k in config: 62 | self.__setattr__(k, config[k]) 63 | 64 | def save_config(self, file): 65 | with open(file, 'w', encoding='utf-8') as f: 66 | yaml.safe_dump(self.__dict__, f) 67 | 68 | def __str__(self): 69 | return yaml.safe_dump(self.__dict__) 70 | 71 | 72 | -------------------------------------------------------------------------------- /vocoder/inference.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /vocoder/layers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .causal_conv import * # NOQA 3 | from .pqmf import * # NOQA 4 | from .residual_block import * # NOQA 5 | from .residual_stack import * # NOQA 6 | from .upsample import * # NOQA 7 | 8 | -------------------------------------------------------------------------------- /vocoder/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 
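        Example (illustrative sketch; the sizes are arbitrary):
            conv = CausalConv1d(in_channels=1, out_channels=4, kernel_size=3, dilation=2)
            y = conv(torch.randn(2, 1, 100))   # (2, 4, 100), same length as the input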
31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /vocoder/layers/location_variable_conv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/layers/location_variable_conv.py -------------------------------------------------------------------------------- /vocoder/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | The cutoff_ratio and beta parameters are optimized for #subbands = 4. 65 | See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. 66 | 67 | Args: 68 | subbands (int): The number of subbands. 69 | taps (int): The number of filter taps. 
70 | cutoff_ratio (float): Cut-off frequency ratio. 71 | beta (float): Beta coefficient for kaiser window. 72 | 73 | """ 74 | super(PQMF, self).__init__() 75 | 76 | # build analysis & synthesis filter coefficients 77 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 78 | h_analysis = np.zeros((subbands, len(h_proto))) 79 | h_synthesis = np.zeros((subbands, len(h_proto))) 80 | for k in range(subbands): 81 | h_analysis[k] = 2 * h_proto * np.cos( 82 | (2 * k + 1) * (np.pi / (2 * subbands)) * 83 | (np.arange(taps + 1) - (taps / 2)) + 84 | (-1) ** k * np.pi / 4) 85 | h_synthesis[k] = 2 * h_proto * np.cos( 86 | (2 * k + 1) * (np.pi / (2 * subbands)) * 87 | (np.arange(taps + 1) - (taps / 2)) - 88 | (-1) ** k * np.pi / 4) 89 | 90 | # convert to tensor 91 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 92 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 93 | 94 | # register coefficients as beffer 95 | self.register_buffer("analysis_filter", analysis_filter) 96 | self.register_buffer("synthesis_filter", synthesis_filter) 97 | 98 | # filter for downsampling & upsampling 99 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 100 | for k in range(subbands): 101 | updown_filter[k, k, 0] = 1.0 102 | self.register_buffer("updown_filter", updown_filter) 103 | self.subbands = subbands 104 | 105 | # keep padding info 106 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 107 | 108 | def analysis(self, x): 109 | """Analysis with PQMF. 110 | 111 | Args: 112 | x (Tensor): Input tensor (B, 1, T). 113 | 114 | Returns: 115 | Tensor: Output tensor (B, subbands, T // subbands). 116 | 117 | """ 118 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 119 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 120 | 121 | def synthesis(self, x): 122 | """Synthesis with PQMF. 123 | 124 | Args: 125 | x (Tensor): Input tensor (B, subbands, T // subbands). 126 | 127 | Returns: 128 | Tensor: Output tensor (B, 1, T). 129 | 130 | """ 131 | # NOTE(kan-bayashi): Power will be dreased so here multipy by # subbands. 132 | # Not sure this is the correct way, it is better to check again. 133 | # TODO(kan-bayashi): Understand the reconstruction procedure 134 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 135 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 136 | -------------------------------------------------------------------------------- /vocoder/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
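        Example (illustrative sketch using the default channel sizes):
            block = ResidualBlock()          # residual/skip channels = 64, aux = 80
            x = torch.randn(2, 64, 100)      # residual input
            c = torch.randn(2, 80, 100)      # local conditioning features
            x_out, s = block(x, c)           # both (2, 64, 100)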
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /vocoder/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from vocoder.layers import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /vocoder/layers/upsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Upsampling module. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from vocoder.layers import Conv1d 14 | 15 | 16 | class Stretch2d(torch.nn.Module): 17 | """Stretch2d module.""" 18 | 19 | def __init__(self, x_scale, y_scale, mode="nearest"): 20 | """Initialize Stretch2d module. 21 | 22 | Args: 23 | x_scale (int): X scaling factor (Time axis in spectrogram). 24 | y_scale (int): Y scaling factor (Frequency axis in spectrogram). 25 | mode (str): Interpolation mode. 26 | 27 | """ 28 | super(Stretch2d, self).__init__() 29 | self.x_scale = x_scale 30 | self.y_scale = y_scale 31 | self.mode = mode 32 | 33 | def forward(self, x): 34 | """Calculate forward propagation. 35 | 36 | Args: 37 | x (Tensor): Input tensor (B, C, F, T). 38 | 39 | Returns: 40 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), 41 | 42 | """ 43 | return F.interpolate( 44 | x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) 45 | 46 | 47 | class Conv2d(torch.nn.Conv2d): 48 | """Conv2d module with customized initialization.""" 49 | 50 | def __init__(self, *args, **kwargs): 51 | """Initialize Conv2d module.""" 52 | super(Conv2d, self).__init__(*args, **kwargs) 53 | 54 | def reset_parameters(self): 55 | """Reset parameters.""" 56 | self.weight.data.fill_(1. / np.prod(self.kernel_size)) 57 | if self.bias is not None: 58 | torch.nn.init.constant_(self.bias, 0.0) 59 | 60 | 61 | class UpsampleNetwork(torch.nn.Module): 62 | """Upsampling network module.""" 63 | 64 | def __init__(self, 65 | upsample_scales, 66 | nonlinear_activation=None, 67 | nonlinear_activation_params={}, 68 | interpolate_mode="nearest", 69 | freq_axis_kernel_size=1, 70 | use_causal_conv=False, 71 | ): 72 | """Initialize upsampling network module. 73 | 74 | Args: 75 | upsample_scales (list): List of upsampling scales. 76 | nonlinear_activation (str): Activation function name. 77 | nonlinear_activation_params (dict): Arguments for specified activation function. 78 | interpolate_mode (str): Interpolation mode. 79 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 80 | 81 | """ 82 | super(UpsampleNetwork, self).__init__() 83 | self.use_causal_conv = use_causal_conv 84 | self.up_layers = torch.nn.ModuleList() 85 | for scale in upsample_scales: 86 | # interpolation layer 87 | stretch = Stretch2d(scale, 1, interpolate_mode) 88 | self.up_layers += [stretch] 89 | 90 | # conv layer 91 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." 
92 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 93 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1) 94 | if use_causal_conv: 95 | padding = (freq_axis_padding, scale * 2) 96 | else: 97 | padding = (freq_axis_padding, scale) 98 | conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) 99 | self.up_layers += [conv] 100 | 101 | # nonlinear 102 | if nonlinear_activation is not None: 103 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) 104 | self.up_layers += [nonlinear] 105 | 106 | def forward(self, c): 107 | """Calculate forward propagation. 108 | 109 | Args: 110 | c : Input tensor (B, C, T). 111 | 112 | Returns: 113 | Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 114 | 115 | """ 116 | c = c.unsqueeze(1) # (B, 1, C, T) 117 | for f in self.up_layers: 118 | if self.use_causal_conv and isinstance(f, Conv2d): 119 | c = f(c)[..., :c.size(-1)] 120 | else: 121 | c = f(c) 122 | return c.squeeze(1) # (B, C, T') 123 | 124 | 125 | class ConvInUpsampleNetwork(torch.nn.Module): 126 | """Convolution + upsampling network module.""" 127 | 128 | def __init__(self, 129 | upsample_scales, 130 | nonlinear_activation=None, 131 | nonlinear_activation_params={}, 132 | interpolate_mode="nearest", 133 | freq_axis_kernel_size=1, 134 | aux_channels=80, 135 | aux_context_window=0, 136 | use_causal_conv=False 137 | ): 138 | """Initialize convolution + upsampling network module. 139 | 140 | Args: 141 | upsample_scales (list): List of upsampling scales. 142 | nonlinear_activation (str): Activation function name. 143 | nonlinear_activation_params (dict): Arguments for specified activation function. 144 | mode (str): Interpolation mode. 145 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 146 | aux_channels (int): Number of channels of pre-convolutional layer. 147 | aux_context_window (int): Context window size of the pre-convolutional layer. 148 | use_causal_conv (bool): Whether to use causal structure. 149 | 150 | """ 151 | super(ConvInUpsampleNetwork, self).__init__() 152 | self.aux_context_window = aux_context_window 153 | self.use_causal_conv = use_causal_conv and aux_context_window > 0 154 | # To capture wide-context information in conditional features 155 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 156 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded 157 | self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) 158 | self.upsample = UpsampleNetwork( 159 | upsample_scales=upsample_scales, 160 | nonlinear_activation=nonlinear_activation, 161 | nonlinear_activation_params=nonlinear_activation_params, 162 | interpolate_mode=interpolate_mode, 163 | freq_axis_kernel_size=freq_axis_kernel_size, 164 | use_causal_conv=use_causal_conv, 165 | ) 166 | 167 | def forward(self, c): 168 | """Calculate forward propagation. 169 | 170 | Args: 171 | c : Input tensor (B, C, T'). 172 | 173 | Returns: 174 | Tensor: Upsampled tensor (B, C, T), 175 | where T = (T' - aux_context_window * 2) * prod(upsample_scales). 176 | 177 | Note: 178 | The length of inputs considers the context window size. 
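        Example (illustrative sketch; aux_context_window=2 and the resulting hop of 256 are assumptions):
            net = ConvInUpsampleNetwork(upsample_scales=[4, 4, 4, 4],
                                        aux_channels=80, aux_context_window=2)
            c = torch.randn(1, 80, 36)       # already padded by the context window
            y = net(c)                       # (1, 80, (36 - 2 * 2) * 256) = (1, 80, 8192)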
179 | 180 | """ 181 | c_ = self.conv_in(c) 182 | c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ 183 | return self.upsample(c) 184 | -------------------------------------------------------------------------------- /vocoder/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .pwg_loss import PWGLoss 4 | 5 | loss_modules = { 6 | "PWGLoss": PWGLoss 7 | } 8 | 9 | def create_loss(name, params, device) -> Union[PWGLoss]: 10 | return loss_modules[ name ]( **params ).to(device) -------------------------------------------------------------------------------- /vocoder/losses/pwg_loss.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from .stft_loss import * 5 | 6 | class PWGLoss(torch.nn.Module): 7 | 8 | def __init__(self, stft_loss_params={}): 9 | super(PWGLoss, self).__init__() 10 | self.stft_criterion = MultiResolutionSTFTLoss( **stft_loss_params ) 11 | self.mse_criterion = torch.nn.MSELoss() 12 | 13 | 14 | def stft_loss(self, audio, audio_): 15 | sc_loss, mag_loss = self.stft_criterion( audio_.squeeze(1), audio.squeeze(1) ) 16 | return sc_loss, mag_loss 17 | 18 | def adversarial_loss(self, prob_ ): 19 | return self.mse_criterion( prob_, torch.ones_like( prob_ ) ) 20 | 21 | def discriminator_loss(self, prob, prob_ ): 22 | real_loss = self.mse_criterion( prob, torch.ones_like( prob ) ) 23 | fake_loss = self.mse_criterion( prob_, torch.zeros_like(prob_) ) 24 | return real_loss, fake_loss 25 | -------------------------------------------------------------------------------- /vocoder/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergenceLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergenceLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 
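                (the Frobenius-norm ratio ||y_mag - x_mag||_F / ||y_mag||_F)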
50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | window = getattr(torch, window)(win_length) 86 | self.register_buffer('window', window) 87 | self.spectral_convergence_loss = SpectralConvergenceLoss() 88 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 89 | 90 | def forward(self, x, y): 91 | """Calculate forward propagation. 92 | 93 | Args: 94 | x (Tensor): Predicted signal (B, T). 95 | y (Tensor): Groundtruth signal (B, T). 96 | 97 | Returns: 98 | Tensor: Spectral convergence loss value. 99 | Tensor: Log STFT magnitude loss value. 100 | 101 | """ 102 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 103 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 104 | sc_loss = self.spectral_convergence_loss(x_mag, y_mag) 105 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 106 | 107 | return sc_loss, mag_loss 108 | 109 | 110 | class MultiResolutionSTFTLoss(torch.nn.Module): 111 | """Multi resolution STFT loss module.""" 112 | 113 | def __init__(self, 114 | fft_sizes=[1024, 2048, 512], 115 | hop_sizes=[120, 240, 50], 116 | win_lengths=[600, 1200, 240], 117 | window="hann_window"): 118 | """Initialize Multi resolution STFT loss module. 119 | 120 | Args: 121 | fft_sizes (list): List of FFT sizes. 122 | hop_sizes (list): List of hop sizes. 123 | win_lengths (list): List of window lengths. 124 | window (str): Window function type. 125 | 126 | """ 127 | super(MultiResolutionSTFTLoss, self).__init__() 128 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 129 | self.stft_losses = torch.nn.ModuleList() 130 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 131 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 132 | 133 | def forward(self, x, y): 134 | """Calculate forward propagation. 135 | 136 | Args: 137 | x (Tensor): Predicted signal (B, T). 138 | y (Tensor): Groundtruth signal (B, T). 139 | 140 | Returns: 141 | Tensor: Multi resolution spectral convergence loss value. 142 | Tensor: Multi resolution log STFT magnitude loss value. 
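        Example (illustrative sketch with the default three resolutions):
            criterion = MultiResolutionSTFTLoss()
            x = torch.randn(2, 16000)             # predicted waveform
            y = torch.randn(2, 16000)             # ground-truth waveform
            sc_loss, mag_loss = criterion(x, y)   # scalar losses averaged over resolutions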
143 | 144 | """ 145 | sc_loss = 0.0 146 | mag_loss = 0.0 147 | for f in self.stft_losses: 148 | # print( 'stft parameter device:', next(f.parameters()).device ) 149 | sc_l, mag_l = f(x, y) 150 | sc_loss += sc_l 151 | mag_loss += mag_l 152 | sc_loss /= len(self.stft_losses) 153 | mag_loss /= len(self.stft_losses) 154 | 155 | return sc_loss, mag_loss 156 | -------------------------------------------------------------------------------- /vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Union 3 | 4 | from .parallel_wavegan import ParallelWaveGAN 5 | from .lvcgan import LVCNetWaveGAN 6 | 7 | model_list = { 8 | "ParallelWaveGAN": ParallelWaveGAN, 9 | "LVCNetWaveGAN": LVCNetWaveGAN 10 | } 11 | 12 | 13 | def create_model(name, params, device) -> Union[ParallelWaveGAN]: 14 | ''' Create model according to the model classname 15 | Args: 16 | name (str): model classname. 17 | params (dict): the parameter for create model. 18 | Return: 19 | torch.nn.Module : Model. 20 | ''' 21 | return model_list[ name ](**params).to(device) 22 | 23 | -------------------------------------------------------------------------------- /vocoder/models/lvcgan.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import logging 4 | import math 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from vocoder.layers import Conv1d 10 | from vocoder.layers import Conv1d1x1 11 | from vocoder.layers import ResidualBlock 12 | from vocoder.layers import upsample 13 | from vocoder import models 14 | 15 | from .parallel_wavegan import ParallelWaveGANDiscriminator 16 | from .lvcnet import LVCBlock 17 | 18 | 19 | class LVCNetWaveGAN(torch.nn.Module): 20 | """Parallel WaveGAN module""" 21 | 22 | def __init__(self, generator_params={}, discriminator_params={}): 23 | super().__init__() 24 | 25 | self.generator = LVCNetGenerator(**generator_params) 26 | self.discriminator = ParallelWaveGANDiscriminator(**discriminator_params) 27 | 28 | def generator_forward(self, x, c): 29 | return self.generator(x, c) 30 | 31 | def discriminator_forward(self, x): 32 | return self.discriminator(x) 33 | 34 | 35 | class LVCNetGenerator(torch.nn.Module): 36 | """Parallel WaveGAN Generator module.""" 37 | 38 | def __init__(self, 39 | in_channels=1, 40 | out_channels=1, 41 | inner_channels=8, 42 | cond_channels=80, 43 | cond_hop_length=256, 44 | lvc_block_nums=3, 45 | lvc_layers_each_block=10, 46 | lvc_kernel_size=3, 47 | kpnet_hidden_channels=64, 48 | kpnet_conv_size=1, 49 | dropout=0.0, 50 | use_weight_norm=True, 51 | ): 52 | """Initialize Parallel WaveGAN Generator module. 53 | 54 | Args: 55 | in_channels (int): Number of input channels. 56 | out_channels (int): Number of output channels. 57 | kernel_size (int): Kernel size of dilated convolution. 58 | layers (int): Number of residual block layers. 59 | stacks (int): Number of stacks i.e., dilation cycles. 60 | residual_channels (int): Number of channels in residual conv. 61 | gate_channels (int): Number of channels in gated conv. 62 | skip_channels (int): Number of channels in skip conv. 63 | aux_channels (int): Number of channels for auxiliary feature conv. 64 | aux_context_window (int): Context window size for auxiliary feature. 65 | dropout (float): Dropout rate. 0.0 means no dropout applied. 66 | bias (bool): Whether to use bias parameter in conv layer. 67 | use_weight_norm (bool): Whether to use weight norm. 68 | If set to true, it will be applied to all of the conv layers. 
69 | use_causal_conv (bool): Whether to use causal structure. 70 | upsample_conditional_features (bool): Whether to use upsampling network. 71 | upsample_net (str): Upsampling network architecture. 72 | upsample_params (dict): Upsampling network parameters. 73 | 74 | """ 75 | super().__init__() 76 | self.in_channels = in_channels 77 | self.out_channels = out_channels 78 | self.cond_channels = cond_channels 79 | self.lvc_block_nums = lvc_block_nums 80 | 81 | # define first convolution 82 | self.first_conv = Conv1d1x1(in_channels, inner_channels, bias=True) 83 | 84 | # define residual blocks 85 | self.lvc_blocks = torch.nn.ModuleList() 86 | for n in range(lvc_block_nums): 87 | lvcb = LVCBlock( 88 | in_channels=inner_channels, 89 | cond_channels=cond_channels, 90 | conv_layers=lvc_layers_each_block, 91 | conv_kernel_size=lvc_kernel_size, 92 | cond_hop_length=cond_hop_length, 93 | kpnet_hidden_channels=kpnet_hidden_channels, 94 | kpnet_conv_size=kpnet_conv_size, 95 | kpnet_dropout=dropout, 96 | ) 97 | self.lvc_blocks += [lvcb] 98 | 99 | # define output layers 100 | self.last_conv_layers = torch.nn.ModuleList([ 101 | torch.nn.ReLU(inplace=True), 102 | Conv1d1x1(inner_channels, inner_channels, bias=True), 103 | torch.nn.ReLU(inplace=True), 104 | Conv1d1x1(inner_channels, out_channels, bias=True), 105 | ]) 106 | 107 | # apply weight norm 108 | if use_weight_norm: 109 | self.apply_weight_norm() 110 | 111 | def forward(self, x, c): 112 | """Calculate forward propagation. 113 | 114 | Args: 115 | x (Tensor): Input noise signal (B, 1, T). 116 | c (Tensor): Local conditioning auxiliary features (B, C ,T'). 117 | 118 | Returns: 119 | Tensor: Output tensor (B, out_channels, T) 120 | 121 | """ 122 | 123 | x = self.first_conv(x) 124 | x = self.lvc_blocks[0]( x, c ) 125 | for n in range(1, self.lvc_block_nums): 126 | x = x + self.lvc_blocks[n]( x, c ) 127 | 128 | # apply final layers 129 | for f in self.last_conv_layers: 130 | x = f(x) 131 | 132 | return x 133 | 134 | def remove_weight_norm(self): 135 | """Remove weight normalization module from all of the layers.""" 136 | def _remove_weight_norm(m): 137 | try: 138 | logging.debug(f"Weight norm is removed from {m}.") 139 | torch.nn.utils.remove_weight_norm(m) 140 | except ValueError: # this module didn't have weight norm 141 | return 142 | 143 | self.apply(_remove_weight_norm) 144 | 145 | def apply_weight_norm(self): 146 | """Apply weight normalization module from all of the layers.""" 147 | def _apply_weight_norm(m): 148 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 149 | torch.nn.utils.weight_norm(m) 150 | logging.debug(f"Weight norm is applied to {m}.") 151 | 152 | self.apply(_apply_weight_norm) 153 | 154 | @staticmethod 155 | def _get_receptive_field_size(layers, stacks, kernel_size, 156 | dilation=lambda x: 2 ** x): 157 | assert layers % stacks == 0 158 | layers_per_cycle = layers // stacks 159 | dilations = [dilation(i % layers_per_cycle) for i in range(layers)] 160 | return (kernel_size - 1) * sum(dilations) + 1 161 | 162 | @property 163 | def receptive_field_size(self): 164 | """Return receptive field size.""" 165 | return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) 166 | 167 | def inference(self, c=None, x=None): 168 | """Perform inference. 169 | 170 | Args: 171 | c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). 172 | x (Union[Tensor, ndarray]): Input noise signal (T, 1). 
173 | 174 | Returns: 175 | Tensor: Output tensor (T, out_channels) 176 | 177 | """ 178 | if x is not None: 179 | if not isinstance(x, torch.Tensor): 180 | x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) 181 | x = x.transpose(1, 0).unsqueeze(0) 182 | else: 183 | assert c is not None 184 | x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) 185 | if c is not None: 186 | if not isinstance(c, torch.Tensor): 187 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 188 | c = c.transpose(1, 0).unsqueeze(0) 189 | c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) 190 | return self.forward(x, c).squeeze(0).transpose(1, 0) 191 | -------------------------------------------------------------------------------- /vocoder/models/lvcnet.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from vocoder.layers import Conv1d 7 | from vocoder.layers import Conv1d1x1 8 | 9 | class KernelPredictor(torch.nn.Module): 10 | ''' Kernel predictor for the location-variable convolutions 11 | ''' 12 | 13 | def __init__(self, 14 | cond_channels, 15 | conv_in_channels, 16 | conv_out_channels, 17 | conv_layers, 18 | conv_kernel_size=3, 19 | kpnet_hidden_channels=64, 20 | kpnet_conv_size=1, 21 | kpnet_dropout=0.0, 22 | kpnet_nonlinear_activation="LeakyReLU", 23 | kpnet_nonlinear_activation_params={"negative_slope":0.1} 24 | ): 25 | ''' 26 | Args: 27 | cond_channels (int): number of channel for the conditioning sequence, 28 | conv_in_channels (int): number of channel for the input sequence, 29 | conv_out_channels (int): number of channel for the output sequence, 30 | conv_layers (int): 31 | kpnet_ 32 | ''' 33 | super().__init__() 34 | 35 | self.conv_in_channels = conv_in_channels 36 | self.conv_out_channels = conv_out_channels 37 | self.conv_kernel_size = conv_kernel_size 38 | self.conv_layers = conv_layers 39 | 40 | kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers 41 | kpnet_bias_channels = conv_out_channels * conv_layers 42 | 43 | padding = (kpnet_conv_size - 1)//2 44 | self.input_conv = torch.nn.Sequential( 45 | torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=0, bias=True), 46 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 47 | ) 48 | 49 | self.residual_conv = torch.nn.Sequential( 50 | torch.nn.Dropout(kpnet_dropout), 51 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 52 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 53 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 54 | ) 55 | 56 | self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True) 57 | self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True) 58 | 59 | def forward(self, c): 60 | ''' 61 | Args: 62 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 63 | Returns: 64 | ''' 65 | batch, cond_channels, cond_length = c.shape 66 | 67 | c = self.input_conv( c ) 68 | c = c + self.residual_conv( c ) 69 | k = self.kernel_conv( c ) 70 | b = self.bias_conv( c ) 71 | kernels = k.contiguous().view( batch, 72 | self.conv_layers, 73 | self.conv_in_channels, 74 | 
self.conv_out_channels, 75 | self.conv_kernel_size, 76 | cond_length - 4 ) 77 | bias = b.contiguous().view( batch, 78 | self.conv_layers, 79 | self.conv_out_channels, 80 | cond_length - 4 ) 81 | return kernels, bias 82 | 83 | 84 | 85 | class LVCBlock(torch.nn.Module): 86 | ''' the location-variable convolutions 87 | ''' 88 | 89 | def __init__(self, 90 | in_channels, 91 | cond_channels, 92 | conv_layers=10, 93 | conv_kernel_size=3, 94 | cond_hop_length=256, 95 | kpnet_hidden_channels=64, 96 | kpnet_conv_size=1, 97 | kpnet_dropout=0.0 98 | ): 99 | super().__init__() 100 | 101 | self.cond_hop_length = cond_hop_length 102 | self.conv_layers = conv_layers 103 | self.conv_kernel_size = conv_kernel_size 104 | 105 | self.kernel_predictor = KernelPredictor( 106 | cond_channels=cond_channels, 107 | conv_in_channels=in_channels, 108 | conv_out_channels=2*in_channels, 109 | conv_layers=conv_layers, 110 | conv_kernel_size=conv_kernel_size, 111 | kpnet_hidden_channels=kpnet_hidden_channels, 112 | kpnet_conv_size=kpnet_conv_size, 113 | kpnet_dropout=kpnet_dropout 114 | ) 115 | 116 | def forward(self, x, c): 117 | ''' forward propagation of the location-variable convolutions. 118 | Args: 119 | x (Tensor): the input sequence (batch, in_channels, in_length) 120 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 121 | 122 | Returns: 123 | Tensor: the output sequence (batch, in_channels, in_length) 124 | ''' 125 | batch, in_channels, in_length = x.shape 126 | batch, cond_channels, cond_length = c.shape 127 | assert in_length == ( (cond_length - 4) * self.cond_hop_length ), ( 128 | f"the length of input ({in_length}, {cond_length}) is not match in LVCNet" ) 129 | 130 | kernels, bias = self.kernel_predictor( c ) 131 | 132 | for i in range(self.conv_layers): 133 | dilation = 2**i 134 | k = kernels[ :, i, :, :, :, : ] 135 | b = bias[ :, i, :, : ] 136 | x = self.location_variable_convolution( x, k, b, dilation, self.cond_hop_length ) 137 | x = torch.sigmoid( x[ :, :in_channels, : ] ) * torch.tanh( x[ :, in_channels:, : ] ) 138 | return x 139 | 140 | 141 | def location_variable_convolution(self, x, kernel, bias, dilation, hop_size): 142 | ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. 143 | Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. 144 | 145 | Args: 146 | x (Tensor): the input sequence (batch, in_channels, in_length). 147 | kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) 148 | bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) 149 | dilation (int): the dilation of convolution. 150 | hop_size (int): the hop_size of the conditioning sequence. 151 | 152 | Returns: 153 | (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
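        Example (illustrative shapes; out_channels is 2 * in_channels, as used in LVCBlock):
            x      = torch.randn(1, 8, 48 * 256)     # in_length = kernel_length * hop_size
            kernel = torch.randn(1, 8, 16, 3, 48)    # per-frame local convolution kernels
            bias   = torch.randn(1, 16, 48)
            y = self.location_variable_convolution(x, kernel, bias, dilation=1, hop_size=256)
            # y: (1, 16, 48 * 256)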
154 | ''' 155 | batch, in_channels, in_length = x.shape 156 | batch, in_channels, out_channels, kernel_size, kernel_length = kernel.shape 157 | 158 | assert in_length == (kernel_length*hop_size), "length of (x, kernel) is not matched" 159 | 160 | padding = dilation * int( (kernel_size - 1) / 2 ) 161 | x = F.pad( x, (padding, padding), 'constant', 0 ) # (batch, in_channels, in_length + 2*padding) 162 | x = x.unfold( 2, hop_size + 2 * padding, hop_size ) # (batch, in_channels, kernel_length, hop_size + 2*padding) 163 | 164 | if hop_size < dilation: 165 | x = F.pad( x, (0, dilation), 'constant', 0 ) 166 | x = x.unfold(3, dilation, dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) 167 | x = x[ :, :, :, :, :hop_size ] 168 | x = x.transpose( 3, 4 ) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) 169 | x = x.unfold( 4, kernel_size, 1 ) # (batch, in_channels, kernel_length, dilation, _, kernel_size) 170 | 171 | o = torch.einsum( 'bildsk,biokl->bolsd', x, kernel ) 172 | o = o + bias.unsqueeze(-1).unsqueeze(-1) 173 | o = o.contiguous().view(batch, out_channels, -1) 174 | return o 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /vocoder/models/melgan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """MelGAN Modules.""" 7 | 8 | import logging 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from vocoder.layers import CausalConv1d 14 | from vocoder.layers import CausalConvTranspose1d 15 | from vocoder.layers import ResidualStack 16 | 17 | 18 | class MelGANGenerator(torch.nn.Module): 19 | """MelGAN generator module.""" 20 | 21 | def __init__(self, 22 | in_channels=80, 23 | out_channels=1, 24 | kernel_size=7, 25 | channels=512, 26 | bias=True, 27 | upsample_scales=[8, 8, 2, 2], 28 | stack_kernel_size=3, 29 | stacks=3, 30 | nonlinear_activation="LeakyReLU", 31 | nonlinear_activation_params={"negative_slope": 0.2}, 32 | pad="ReflectionPad1d", 33 | pad_params={}, 34 | use_final_nonlinear_activation=True, 35 | use_weight_norm=True, 36 | use_causal_conv=False, 37 | ): 38 | """Initialize MelGANGenerator module. 39 | 40 | Args: 41 | in_channels (int): Number of input channels. 42 | out_channels (int): Number of output channels. 43 | kernel_size (int): Kernel size of initial and final conv layer. 44 | channels (int): Initial number of channels for conv layer. 45 | bias (bool): Whether to add bias parameter in convolution layers. 46 | upsample_scales (list): List of upsampling scales. 47 | stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. 48 | stacks (int): Number of stacks in a single residual stack. 49 | nonlinear_activation (str): Activation function module name. 50 | nonlinear_activation_params (dict): Hyperparameters for activation function. 51 | pad (str): Padding function module name before dilated convolution layer. 52 | pad_params (dict): Hyperparameters for padding function. 53 | use_final_nonlinear_activation (torch.nn.Module): Activation function for the final layer. 54 | use_weight_norm (bool): Whether to use weight norm. 55 | If set to true, it will be applied to all of the conv layers. 56 | use_causal_conv (bool): Whether to use causal convolution. 
57 | 58 | """ 59 | super(MelGANGenerator, self).__init__() 60 | 61 | # check hyper parameters is valid 62 | assert channels >= np.prod(upsample_scales) 63 | assert channels % (2 ** len(upsample_scales)) == 0 64 | if not use_causal_conv: 65 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 66 | 67 | # add initial layer 68 | layers = [] 69 | if not use_causal_conv: 70 | layers += [ 71 | getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), 72 | torch.nn.Conv1d(in_channels, channels, kernel_size, bias=bias), 73 | ] 74 | else: 75 | layers += [ 76 | CausalConv1d(in_channels, channels, kernel_size, 77 | bias=bias, pad=pad, pad_params=pad_params), 78 | ] 79 | 80 | for i, upsample_scale in enumerate(upsample_scales): 81 | # add upsampling layer 82 | layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] 83 | if not use_causal_conv: 84 | layers += [ 85 | torch.nn.ConvTranspose1d( 86 | channels // (2 ** i), 87 | channels // (2 ** (i + 1)), 88 | upsample_scale * 2, 89 | stride=upsample_scale, 90 | padding=upsample_scale // 2 + upsample_scale % 2, 91 | output_padding=upsample_scale % 2, 92 | bias=bias, 93 | ) 94 | ] 95 | else: 96 | layers += [ 97 | CausalConvTranspose1d( 98 | channels // (2 ** i), 99 | channels // (2 ** (i + 1)), 100 | upsample_scale * 2, 101 | stride=upsample_scale, 102 | bias=bias, 103 | ) 104 | ] 105 | 106 | # add residual stack 107 | for j in range(stacks): 108 | layers += [ 109 | ResidualStack( 110 | kernel_size=stack_kernel_size, 111 | channels=channels // (2 ** (i + 1)), 112 | dilation=stack_kernel_size ** j, 113 | bias=bias, 114 | nonlinear_activation=nonlinear_activation, 115 | nonlinear_activation_params=nonlinear_activation_params, 116 | pad=pad, 117 | pad_params=pad_params, 118 | use_causal_conv=use_causal_conv, 119 | ) 120 | ] 121 | 122 | # add final layer 123 | layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] 124 | if not use_causal_conv: 125 | layers += [ 126 | getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), 127 | torch.nn.Conv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, bias=bias), 128 | ] 129 | else: 130 | layers += [ 131 | CausalConv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, 132 | bias=bias, pad=pad, pad_params=pad_params), 133 | ] 134 | if use_final_nonlinear_activation: 135 | layers += [torch.nn.Tanh()] 136 | 137 | # define the model as a single function 138 | self.melgan = torch.nn.Sequential(*layers) 139 | 140 | # apply weight norm 141 | if use_weight_norm: 142 | self.apply_weight_norm() 143 | 144 | # reset parameters 145 | self.reset_parameters() 146 | 147 | # initialize pqmf for inference 148 | self.pqmf = None 149 | 150 | def forward(self, c): 151 | """Calculate forward propagation. 152 | 153 | Args: 154 | c (Tensor): Input tensor (B, channels, T). 155 | 156 | Returns: 157 | Tensor: Output tensor (B, 1, T ** prod(upsample_scales)). 
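            Example (editor's note and sketch; the output length is T multiplied by
            prod(upsample_scales), so with the default scales [8, 8, 2, 2] every input
            frame yields 256 output samples):
                >>> g = MelGANGenerator()
                >>> g(torch.randn(2, 80, 40)).shape
                torch.Size([2, 1, 10240])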
158 | 159 | """ 160 | return self.melgan(c) 161 | 162 | def remove_weight_norm(self): 163 | """Remove weight normalization module from all of the layers.""" 164 | def _remove_weight_norm(m): 165 | try: 166 | logging.debug(f"Weight norm is removed from {m}.") 167 | torch.nn.utils.remove_weight_norm(m) 168 | except ValueError: # this module didn't have weight norm 169 | return 170 | 171 | self.apply(_remove_weight_norm) 172 | 173 | def apply_weight_norm(self): 174 | """Apply weight normalization module from all of the layers.""" 175 | def _apply_weight_norm(m): 176 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 177 | torch.nn.utils.weight_norm(m) 178 | logging.debug(f"Weight norm is applied to {m}.") 179 | 180 | self.apply(_apply_weight_norm) 181 | 182 | def reset_parameters(self): 183 | """Reset parameters. 184 | 185 | This initialization follows official implementation manner. 186 | https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py 187 | 188 | """ 189 | def _reset_parameters(m): 190 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 191 | m.weight.data.normal_(0.0, 0.02) 192 | logging.debug(f"Reset parameters in {m}.") 193 | 194 | self.apply(_reset_parameters) 195 | 196 | def inference(self, c): 197 | """Perform inference. 198 | 199 | Args: 200 | c (Union[Tensor, ndarray]): Input tensor (T, in_channels). 201 | 202 | Returns: 203 | Tensor: Output tensor (T ** prod(upsample_scales), out_channels). 204 | 205 | """ 206 | if not isinstance(c, torch.Tensor): 207 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 208 | c = self.melgan(c.transpose(1, 0).unsqueeze(0)) 209 | if self.pqmf is not None: 210 | c = self.pqmf.synthesis(c) 211 | return c.squeeze(0).transpose(1, 0) 212 | 213 | 214 | class MelGANDiscriminator(torch.nn.Module): 215 | """MelGAN discriminator module.""" 216 | 217 | def __init__(self, 218 | in_channels=1, 219 | out_channels=1, 220 | kernel_sizes=[5, 3], 221 | channels=16, 222 | max_downsample_channels=1024, 223 | bias=True, 224 | downsample_scales=[4, 4, 4, 4], 225 | nonlinear_activation="LeakyReLU", 226 | nonlinear_activation_params={"negative_slope": 0.2}, 227 | pad="ReflectionPad1d", 228 | pad_params={}, 229 | ): 230 | """Initilize MelGAN discriminator module. 231 | 232 | Args: 233 | in_channels (int): Number of input channels. 234 | out_channels (int): Number of output channels. 235 | kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer, 236 | and the first and the second kernel sizes will be used for the last two layers. 237 | For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, 238 | the last two layers' kernel size will be 5 and 3, respectively. 239 | channels (int): Initial number of channels for conv layer. 240 | max_downsample_channels (int): Maximum number of channels for downsampling layers. 241 | bias (bool): Whether to add bias parameter in convolution layers. 242 | downsample_scales (list): List of downsampling scales. 243 | nonlinear_activation (str): Activation function module name. 244 | nonlinear_activation_params (dict): Hyperparameters for activation function. 245 | pad (str): Padding function module name before dilated convolution layer. 246 | pad_params (dict): Hyperparameters for padding function. 
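            Example (editor's sketch; with the default kernel_sizes=[5, 3] the first conv
            uses kernel size 5 * 3 = 15 and the last two convs use 5 and 3, and forward()
            returns the intermediate output of every layer as a list, 7 tensors with the
            default configuration):
                >>> d = MelGANDiscriminator()
                >>> feats = d(torch.randn(1, 1, 4096))
                >>> len(feats)                         # 1 input layer + 4 downsample + 2 output layers
                7
                >>> feats[-1].shape                    # 4096 / (4 * 4 * 4 * 4) = 16 scores
                torch.Size([1, 1, 16])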
247 | 248 | """ 249 | super(MelGANDiscriminator, self).__init__() 250 | self.layers = torch.nn.ModuleList() 251 | 252 | # check kernel size is valid 253 | assert len(kernel_sizes) == 2 254 | assert kernel_sizes[0] % 2 == 1 255 | assert kernel_sizes[1] % 2 == 1 256 | 257 | # add first layer 258 | self.layers += [ 259 | torch.nn.Sequential( 260 | getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params), 261 | torch.nn.Conv1d(in_channels, channels, np.prod(kernel_sizes), bias=bias), 262 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 263 | ) 264 | ] 265 | 266 | # add downsample layers 267 | in_chs = channels 268 | for downsample_scale in downsample_scales: 269 | out_chs = min(in_chs * downsample_scale, max_downsample_channels) 270 | self.layers += [ 271 | torch.nn.Sequential( 272 | torch.nn.Conv1d( 273 | in_chs, out_chs, 274 | kernel_size=downsample_scale * 10 + 1, 275 | stride=downsample_scale, 276 | padding=downsample_scale * 5, 277 | groups=in_chs // 4, 278 | bias=bias, 279 | ), 280 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 281 | ) 282 | ] 283 | in_chs = out_chs 284 | 285 | # add final layers 286 | out_chs = min(in_chs * 2, max_downsample_channels) 287 | self.layers += [ 288 | torch.nn.Sequential( 289 | torch.nn.Conv1d( 290 | in_chs, out_chs, kernel_sizes[0], 291 | padding=(kernel_sizes[0] - 1) // 2, 292 | bias=bias, 293 | ), 294 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 295 | ) 296 | ] 297 | self.layers += [ 298 | torch.nn.Conv1d( 299 | out_chs, out_channels, kernel_sizes[1], 300 | padding=(kernel_sizes[1] - 1) // 2, 301 | bias=bias, 302 | ), 303 | ] 304 | 305 | def forward(self, x): 306 | """Calculate forward propagation. 307 | 308 | Args: 309 | x (Tensor): Input noise signal (B, 1, T). 310 | 311 | Returns: 312 | List: List of output tensors of each layer. 313 | 314 | """ 315 | outs = [] 316 | for f in self.layers: 317 | x = f(x) 318 | outs += [x] 319 | 320 | return outs 321 | 322 | 323 | class MelGANMultiScaleDiscriminator(torch.nn.Module): 324 | """MelGAN multi-scale discriminator module.""" 325 | 326 | def __init__(self, 327 | in_channels=1, 328 | out_channels=1, 329 | scales=3, 330 | downsample_pooling="AvgPool1d", 331 | # follow the official implementation setting 332 | downsample_pooling_params={ 333 | "kernel_size": 4, 334 | "stride": 2, 335 | "padding": 1, 336 | "count_include_pad": False, 337 | }, 338 | kernel_sizes=[5, 3], 339 | channels=16, 340 | max_downsample_channels=1024, 341 | bias=True, 342 | downsample_scales=[4, 4, 4, 4], 343 | nonlinear_activation="LeakyReLU", 344 | nonlinear_activation_params={"negative_slope": 0.2}, 345 | pad="ReflectionPad1d", 346 | pad_params={}, 347 | use_weight_norm=True, 348 | ): 349 | """Initilize MelGAN multi-scale discriminator module. 350 | 351 | Args: 352 | in_channels (int): Number of input channels. 353 | out_channels (int): Number of output channels. 354 | downsample_pooling (str): Pooling module name for downsampling of the inputs. 355 | downsample_pooling_params (dict): Parameters for the above pooling module. 356 | kernel_sizes (list): List of two kernel sizes. The sum will be used for the first conv layer, 357 | and the first and the second kernel sizes will be used for the last two layers. 358 | channels (int): Initial number of channels for conv layer. 359 | max_downsample_channels (int): Maximum number of channels for downsampling layers. 360 | bias (bool): Whether to add bias parameter in convolution layers. 
361 | downsample_scales (list): List of downsampling scales. 362 | nonlinear_activation (str): Activation function module name. 363 | nonlinear_activation_params (dict): Hyperparameters for activation function. 364 | pad (str): Padding function module name before dilated convolution layer. 365 | pad_params (dict): Hyperparameters for padding function. 366 | use_causal_conv (bool): Whether to use causal convolution. 367 | 368 | """ 369 | super(MelGANMultiScaleDiscriminator, self).__init__() 370 | self.discriminators = torch.nn.ModuleList() 371 | 372 | # add discriminators 373 | for _ in range(scales): 374 | self.discriminators += [ 375 | MelGANDiscriminator( 376 | in_channels=in_channels, 377 | out_channels=out_channels, 378 | kernel_sizes=kernel_sizes, 379 | channels=channels, 380 | max_downsample_channels=max_downsample_channels, 381 | bias=bias, 382 | downsample_scales=downsample_scales, 383 | nonlinear_activation=nonlinear_activation, 384 | nonlinear_activation_params=nonlinear_activation_params, 385 | pad=pad, 386 | pad_params=pad_params, 387 | ) 388 | ] 389 | self.pooling = getattr(torch.nn, downsample_pooling)(**downsample_pooling_params) 390 | 391 | # apply weight norm 392 | if use_weight_norm: 393 | self.apply_weight_norm() 394 | 395 | # reset parameters 396 | self.reset_parameters() 397 | 398 | def forward(self, x): 399 | """Calculate forward propagation. 400 | 401 | Args: 402 | x (Tensor): Input noise signal (B, 1, T). 403 | 404 | Returns: 405 | List: List of list of each discriminator outputs, which consists of each layer output tensors. 406 | 407 | """ 408 | outs = [] 409 | for f in self.discriminators: 410 | outs += [f(x)] 411 | x = self.pooling(x) 412 | 413 | return outs 414 | 415 | def remove_weight_norm(self): 416 | """Remove weight normalization module from all of the layers.""" 417 | def _remove_weight_norm(m): 418 | try: 419 | logging.debug(f"Weight norm is removed from {m}.") 420 | torch.nn.utils.remove_weight_norm(m) 421 | except ValueError: # this module didn't have weight norm 422 | return 423 | 424 | self.apply(_remove_weight_norm) 425 | 426 | def apply_weight_norm(self): 427 | """Apply weight normalization module from all of the layers.""" 428 | def _apply_weight_norm(m): 429 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 430 | torch.nn.utils.weight_norm(m) 431 | logging.debug(f"Weight norm is applied to {m}.") 432 | 433 | self.apply(_apply_weight_norm) 434 | 435 | def reset_parameters(self): 436 | """Reset parameters. 437 | 438 | This initialization follows official implementation manner. 
439 | https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py 440 | 441 | """ 442 | def _reset_parameters(m): 443 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 444 | m.weight.data.normal_(0.0, 0.02) 445 | logging.debug(f"Reset parameters in {m}.") 446 | 447 | self.apply(_reset_parameters) 448 | -------------------------------------------------------------------------------- /vocoder/models/parallel_wavegan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Parallel WaveGAN Modules.""" 7 | 8 | import logging 9 | import math 10 | 11 | import numpy as np 12 | import torch 13 | 14 | from vocoder.layers import Conv1d 15 | from vocoder.layers import Conv1d1x1 16 | from vocoder.layers import ResidualBlock 17 | from vocoder.layers import upsample 18 | from vocoder import models 19 | 20 | class ParallelWaveGAN(torch.nn.Module): 21 | """Parallel WaveGAN module""" 22 | 23 | def __init__(self, generator_params={}, discriminator_params={}): 24 | super().__init__() 25 | 26 | self.generator = ParallelWaveGANGenerator(**generator_params) 27 | self.discriminator = ParallelWaveGANDiscriminator(**discriminator_params) 28 | 29 | def generator_forward(self, x, c): 30 | return self.generator(x, c) 31 | 32 | def discriminator_forward(self, x): 33 | return self.discriminator(x) 34 | 35 | 36 | class ParallelWaveGANGenerator(torch.nn.Module): 37 | """Parallel WaveGAN Generator module.""" 38 | 39 | def __init__(self, 40 | in_channels=1, 41 | out_channels=1, 42 | kernel_size=3, 43 | layers=30, 44 | stacks=3, 45 | residual_channels=64, 46 | gate_channels=128, 47 | skip_channels=64, 48 | aux_channels=80, 49 | aux_context_window=2, 50 | dropout=0.0, 51 | bias=True, 52 | use_weight_norm=True, 53 | use_causal_conv=False, 54 | upsample_conditional_features=True, 55 | upsample_net="ConvInUpsampleNetwork", 56 | upsample_params={"upsample_scales": [4, 4, 4, 4]}, 57 | ): 58 | """Initialize Parallel WaveGAN Generator module. 59 | 60 | Args: 61 | in_channels (int): Number of input channels. 62 | out_channels (int): Number of output channels. 63 | kernel_size (int): Kernel size of dilated convolution. 64 | layers (int): Number of residual block layers. 65 | stacks (int): Number of stacks i.e., dilation cycles. 66 | residual_channels (int): Number of channels in residual conv. 67 | gate_channels (int): Number of channels in gated conv. 68 | skip_channels (int): Number of channels in skip conv. 69 | aux_channels (int): Number of channels for auxiliary feature conv. 70 | aux_context_window (int): Context window size for auxiliary feature. 71 | dropout (float): Dropout rate. 0.0 means no dropout applied. 72 | bias (bool): Whether to use bias parameter in conv layer. 73 | use_weight_norm (bool): Whether to use weight norm. 74 | If set to true, it will be applied to all of the conv layers. 75 | use_causal_conv (bool): Whether to use causal structure. 76 | upsample_conditional_features (bool): Whether to use upsampling network. 77 | upsample_net (str): Upsampling network architecture. 78 | upsample_params (dict): Upsampling network parameters. 
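            Example (editor's sketch; shapes assume the default upsample_scales
            [4, 4, 4, 4], i.e. an upsampling factor of 256, and assume the stock
            ConvInUpsampleNetwork behaviour of consuming aux_context_window=2 extra
            frames on each side of the conditioning input):
                >>> g = ParallelWaveGANGenerator()
                >>> c = torch.randn(1, 80, 20 + 2 * 2)  # (B, aux_channels, T' + 2 * aux_context_window)
                >>> x = torch.randn(1, 1, 20 * 256)     # (B, 1, T) noise, T = T' * upsample_factor
                >>> g(x, c).shape
                torch.Size([1, 1, 5120])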
79 | 80 | """ 81 | super(ParallelWaveGANGenerator, self).__init__() 82 | self.in_channels = in_channels 83 | self.out_channels = out_channels 84 | self.aux_channels = aux_channels 85 | self.aux_context_window = aux_context_window 86 | self.layers = layers 87 | self.stacks = stacks 88 | self.kernel_size = kernel_size 89 | 90 | # check the number of layers and stacks 91 | assert layers % stacks == 0 92 | layers_per_stack = layers // stacks 93 | 94 | # define first convolution 95 | self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True) 96 | 97 | # define conv + upsampling network 98 | if upsample_conditional_features: 99 | upsample_params.update({ 100 | "use_causal_conv": use_causal_conv, 101 | }) 102 | if upsample_net == "MelGANGenerator": 103 | assert aux_context_window == 0 104 | upsample_params.update({ 105 | "use_weight_norm": False, # not to apply twice 106 | "use_final_nonlinear_activation": False, 107 | }) 108 | self.upsample_net = getattr(models, upsample_net)(**upsample_params) 109 | else: 110 | if upsample_net == "ConvInUpsampleNetwork": 111 | upsample_params.update({ 112 | "aux_channels": aux_channels, 113 | "aux_context_window": aux_context_window, 114 | }) 115 | self.upsample_net = getattr(upsample, upsample_net)(**upsample_params) 116 | self.upsample_factor = np.prod(upsample_params["upsample_scales"]) 117 | else: 118 | self.upsample_net = None 119 | self.upsample_factor = 1 120 | 121 | # define residual blocks 122 | self.conv_layers = torch.nn.ModuleList() 123 | for layer in range(layers): 124 | dilation = 2 ** (layer % layers_per_stack) 125 | conv = ResidualBlock( 126 | kernel_size=kernel_size, 127 | residual_channels=residual_channels, 128 | gate_channels=gate_channels, 129 | skip_channels=skip_channels, 130 | aux_channels=aux_channels, 131 | dilation=dilation, 132 | dropout=dropout, 133 | bias=bias, 134 | use_causal_conv=use_causal_conv, 135 | ) 136 | self.conv_layers += [conv] 137 | 138 | # define output layers 139 | self.last_conv_layers = torch.nn.ModuleList([ 140 | torch.nn.ReLU(inplace=True), 141 | Conv1d1x1(skip_channels, skip_channels, bias=True), 142 | torch.nn.ReLU(inplace=True), 143 | Conv1d1x1(skip_channels, out_channels, bias=True), 144 | ]) 145 | 146 | # apply weight norm 147 | if use_weight_norm: 148 | self.apply_weight_norm() 149 | 150 | def forward(self, x, c): 151 | """Calculate forward propagation. 152 | 153 | Args: 154 | x (Tensor): Input noise signal (B, 1, T). 155 | c (Tensor): Local conditioning auxiliary features (B, C ,T'). 
156 | 157 | Returns: 158 | Tensor: Output tensor (B, out_channels, T) 159 | 160 | """ 161 | # perform upsampling 162 | if c is not None and self.upsample_net is not None: 163 | c = self.upsample_net(c) 164 | assert c.size(-1) == x.size(-1), f"c {c.shape}, x {x.shape}" 165 | 166 | # encode to hidden representation 167 | x = self.first_conv(x) 168 | skips = 0 169 | for f in self.conv_layers: 170 | x, h = f(x, c) 171 | skips += h 172 | skips *= math.sqrt(1.0 / len(self.conv_layers)) 173 | 174 | # apply final layers 175 | x = skips 176 | for f in self.last_conv_layers: 177 | x = f(x) 178 | 179 | return x 180 | 181 | def remove_weight_norm(self): 182 | """Remove weight normalization module from all of the layers.""" 183 | def _remove_weight_norm(m): 184 | try: 185 | logging.debug(f"Weight norm is removed from {m}.") 186 | torch.nn.utils.remove_weight_norm(m) 187 | except ValueError: # this module didn't have weight norm 188 | return 189 | 190 | self.apply(_remove_weight_norm) 191 | 192 | def apply_weight_norm(self): 193 | """Apply weight normalization module from all of the layers.""" 194 | def _apply_weight_norm(m): 195 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 196 | torch.nn.utils.weight_norm(m) 197 | logging.debug(f"Weight norm is applied to {m}.") 198 | 199 | self.apply(_apply_weight_norm) 200 | 201 | @staticmethod 202 | def _get_receptive_field_size(layers, stacks, kernel_size, 203 | dilation=lambda x: 2 ** x): 204 | assert layers % stacks == 0 205 | layers_per_cycle = layers // stacks 206 | dilations = [dilation(i % layers_per_cycle) for i in range(layers)] 207 | return (kernel_size - 1) * sum(dilations) + 1 208 | 209 | @property 210 | def receptive_field_size(self): 211 | """Return receptive field size.""" 212 | return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) 213 | 214 | def inference(self, c=None, x=None): 215 | """Perform inference. 216 | 217 | Args: 218 | c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). 219 | x (Union[Tensor, ndarray]): Input noise signal (T, 1). 220 | 221 | Returns: 222 | Tensor: Output tensor (T, out_channels) 223 | 224 | """ 225 | if x is not None: 226 | if not isinstance(x, torch.Tensor): 227 | x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) 228 | x = x.transpose(1, 0).unsqueeze(0) 229 | else: 230 | assert c is not None 231 | x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) 232 | if c is not None: 233 | if not isinstance(c, torch.Tensor): 234 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 235 | c = c.transpose(1, 0).unsqueeze(0) 236 | c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) 237 | return self.forward(x, c).squeeze(0).transpose(1, 0) 238 | 239 | 240 | class ParallelWaveGANDiscriminator(torch.nn.Module): 241 | """Parallel WaveGAN Discriminator module.""" 242 | 243 | def __init__(self, 244 | in_channels=1, 245 | out_channels=1, 246 | kernel_size=3, 247 | layers=10, 248 | conv_channels=64, 249 | dilation_factor=1, 250 | nonlinear_activation="LeakyReLU", 251 | nonlinear_activation_params={"negative_slope": 0.2}, 252 | bias=True, 253 | use_weight_norm=True, 254 | ): 255 | """Initialize Parallel WaveGAN Discriminator module. 256 | 257 | Args: 258 | in_channels (int): Number of input channels. 259 | out_channels (int): Number of output channels. 260 | kernel_size (int): Number of output channels. 261 | layers (int): Number of conv layers. 
262 | conv_channels (int): Number of chnn layers. 263 | dilation_factor (int): Dilation factor. For example, if dilation_factor = 2, 264 | the dilation will be 2, 4, 8, ..., and so on. 265 | nonlinear_activation (str): Nonlinear function after each conv. 266 | nonlinear_activation_params (dict): Nonlinear function parameters 267 | bias (bool): Whether to use bias parameter in conv. 268 | use_weight_norm (bool) Whether to use weight norm. 269 | If set to true, it will be applied to all of the conv layers. 270 | 271 | """ 272 | super(ParallelWaveGANDiscriminator, self).__init__() 273 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 274 | assert dilation_factor > 0, "Dilation factor must be > 0." 275 | self.conv_layers = torch.nn.ModuleList() 276 | conv_in_channels = in_channels 277 | for i in range(layers - 1): 278 | if i == 0: 279 | dilation = 1 280 | else: 281 | dilation = i if dilation_factor == 1 else dilation_factor ** i 282 | conv_in_channels = conv_channels 283 | padding = (kernel_size - 1) // 2 * dilation 284 | conv_layer = [ 285 | Conv1d(conv_in_channels, conv_channels, 286 | kernel_size=kernel_size, padding=padding, 287 | dilation=dilation, bias=bias), 288 | getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params) 289 | ] 290 | self.conv_layers += conv_layer 291 | padding = (kernel_size - 1) // 2 292 | last_conv_layer = Conv1d( 293 | conv_in_channels, out_channels, 294 | kernel_size=kernel_size, padding=padding, bias=bias) 295 | self.conv_layers += [last_conv_layer] 296 | 297 | # apply weight norm 298 | if use_weight_norm: 299 | self.apply_weight_norm() 300 | 301 | def forward(self, x): 302 | """Calculate forward propagation. 303 | 304 | Args: 305 | x (Tensor): Input noise signal (B, 1, T). 306 | 307 | Returns: 308 | Tensor: Output tensor (B, 1, T) 309 | 310 | """ 311 | for f in self.conv_layers: 312 | x = f(x) 313 | return x 314 | 315 | def apply_weight_norm(self): 316 | """Apply weight normalization module from all of the layers.""" 317 | def _apply_weight_norm(m): 318 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 319 | torch.nn.utils.weight_norm(m) 320 | logging.debug(f"Weight norm is applied to {m}.") 321 | 322 | self.apply(_apply_weight_norm) 323 | 324 | def remove_weight_norm(self): 325 | """Remove weight normalization module from all of the layers.""" 326 | def _remove_weight_norm(m): 327 | try: 328 | logging.debug(f"Weight norm is removed from {m}.") 329 | torch.nn.utils.remove_weight_norm(m) 330 | except ValueError: # this module didn't have weight norm 331 | return 332 | 333 | self.apply(_remove_weight_norm) 334 | 335 | 336 | class ResidualParallelWaveGANDiscriminator(torch.nn.Module): 337 | """Parallel WaveGAN Discriminator module.""" 338 | 339 | def __init__(self, 340 | in_channels=1, 341 | out_channels=1, 342 | kernel_size=3, 343 | layers=30, 344 | stacks=3, 345 | residual_channels=64, 346 | gate_channels=128, 347 | skip_channels=64, 348 | dropout=0.0, 349 | bias=True, 350 | use_weight_norm=True, 351 | use_causal_conv=False, 352 | nonlinear_activation="LeakyReLU", 353 | nonlinear_activation_params={"negative_slope": 0.2}, 354 | ): 355 | """Initialize Parallel WaveGAN Discriminator module. 356 | 357 | Args: 358 | in_channels (int): Number of input channels. 359 | out_channels (int): Number of output channels. 360 | kernel_size (int): Kernel size of dilated convolution. 361 | layers (int): Number of residual block layers. 362 | stacks (int): Number of stacks i.e., dilation cycles. 
363 | residual_channels (int): Number of channels in residual conv. 364 | gate_channels (int): Number of channels in gated conv. 365 | skip_channels (int): Number of channels in skip conv. 366 | dropout (float): Dropout rate. 0.0 means no dropout applied. 367 | bias (bool): Whether to use bias parameter in conv. 368 | use_weight_norm (bool): Whether to use weight norm. 369 | If set to true, it will be applied to all of the conv layers. 370 | use_causal_conv (bool): Whether to use causal structure. 371 | nonlinear_activation_params (dict): Nonlinear function parameters 372 | 373 | """ 374 | super(ResidualParallelWaveGANDiscriminator, self).__init__() 375 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 376 | 377 | self.in_channels = in_channels 378 | self.out_channels = out_channels 379 | self.layers = layers 380 | self.stacks = stacks 381 | self.kernel_size = kernel_size 382 | 383 | # check the number of layers and stacks 384 | assert layers % stacks == 0 385 | layers_per_stack = layers // stacks 386 | 387 | # define first convolution 388 | self.first_conv = torch.nn.Sequential( 389 | Conv1d1x1(in_channels, residual_channels, bias=True), 390 | getattr(torch.nn, nonlinear_activation)( 391 | inplace=True, **nonlinear_activation_params), 392 | ) 393 | 394 | # define residual blocks 395 | self.conv_layers = torch.nn.ModuleList() 396 | for layer in range(layers): 397 | dilation = 2 ** (layer % layers_per_stack) 398 | conv = ResidualBlock( 399 | kernel_size=kernel_size, 400 | residual_channels=residual_channels, 401 | gate_channels=gate_channels, 402 | skip_channels=skip_channels, 403 | aux_channels=-1, 404 | dilation=dilation, 405 | dropout=dropout, 406 | bias=bias, 407 | use_causal_conv=use_causal_conv, 408 | ) 409 | self.conv_layers += [conv] 410 | 411 | # define output layers 412 | self.last_conv_layers = torch.nn.ModuleList([ 413 | getattr(torch.nn, nonlinear_activation)( 414 | inplace=True, **nonlinear_activation_params), 415 | Conv1d1x1(skip_channels, skip_channels, bias=True), 416 | getattr(torch.nn, nonlinear_activation)( 417 | inplace=True, **nonlinear_activation_params), 418 | Conv1d1x1(skip_channels, out_channels, bias=True), 419 | ]) 420 | 421 | # apply weight norm 422 | if use_weight_norm: 423 | self.apply_weight_norm() 424 | 425 | def forward(self, x): 426 | """Calculate forward propagation. 427 | 428 | Args: 429 | x (Tensor): Input noise signal (B, 1, T). 
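            Example (editor's sketch; the discriminator is fully convolutional with
            length-preserving padding, so it emits one score per input sample):
                >>> d = ResidualParallelWaveGANDiscriminator()
                >>> d(torch.randn(2, 1, 8192)).shape
                torch.Size([2, 1, 8192])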
430 | 431 | Returns: 432 | Tensor: Output tensor (B, 1, T) 433 | 434 | """ 435 | x = self.first_conv(x) 436 | 437 | skips = 0 438 | for f in self.conv_layers: 439 | x, h = f(x, None) 440 | skips += h 441 | skips *= math.sqrt(1.0 / len(self.conv_layers)) 442 | 443 | # apply final layers 444 | x = skips 445 | for f in self.last_conv_layers: 446 | x = f(x) 447 | return x 448 | 449 | def apply_weight_norm(self): 450 | """Apply weight normalization module from all of the layers.""" 451 | def _apply_weight_norm(m): 452 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 453 | torch.nn.utils.weight_norm(m) 454 | logging.debug(f"Weight norm is applied to {m}.") 455 | 456 | self.apply(_apply_weight_norm) 457 | 458 | def remove_weight_norm(self): 459 | """Remove weight normalization module from all of the layers.""" 460 | def _remove_weight_norm(m): 461 | try: 462 | logging.debug(f"Weight norm is removed from {m}.") 463 | torch.nn.utils.remove_weight_norm(m) 464 | except ValueError: # this module didn't have weight norm 465 | return 466 | 467 | self.apply(_remove_weight_norm) 468 | -------------------------------------------------------------------------------- /vocoder/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import Union 4 | 5 | from .pwg_opt import PWGOptimizer 6 | 7 | optimizer_list = { 8 | "PWGOptimizer": PWGOptimizer 9 | } 10 | 11 | def create_optimizer(name, model, params) -> Union[PWGOptimizer]: 12 | return optimizer_list[ name ]( model, **params ) 13 | -------------------------------------------------------------------------------- /vocoder/optimizers/pwg_opt.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from vocoder.models import ParallelWaveGAN 5 | from .radam import RAdam 6 | 7 | 8 | class PWGOptimizer: 9 | 10 | def __init__(self, model: ParallelWaveGAN, 11 | generator_optimizer_params={"lr": 1e-4, "eps": 1e-6}, 12 | generator_scheduler_params={"step_size": 200000, "gamma": 0.5}, 13 | discriminator_optimizer_params={"lr": 5e-5, "eps": 1e-6}, 14 | discriminator_scheduler_params={"step_size": 200000, "gamma": 0.5}): 15 | self.generator_optimizer = RAdam( 16 | model.generator.parameters(), **generator_optimizer_params ) 17 | self.generator_scheduler = torch.optim.lr_scheduler.StepLR( 18 | optimizer=self.generator_optimizer, **generator_scheduler_params) 19 | 20 | self.discriminator_optimizer = RAdam( 21 | model.discriminator.parameters(), **discriminator_optimizer_params ) 22 | self.discriminator_scheduler = torch.optim.lr_scheduler.StepLR( 23 | optimizer=self.discriminator_optimizer, **discriminator_scheduler_params) 24 | 25 | def state_dict(self): 26 | return { 27 | "generator_optimizer": self.generator_optimizer.state_dict(), 28 | "generator_scheduler": self.generator_scheduler.state_dict(), 29 | "discriminator_optimizer": self.discriminator_optimizer.state_dict(), 30 | "discriminator_scheduler": self.discriminator_scheduler.state_dict() 31 | } 32 | 33 | def load_state_dict(self, state_dict): 34 | self.generator_optimizer.load_state_dict( state_dict["generator_optimizer"] ) 35 | self.generator_scheduler.load_state_dict( state_dict["generator_scheduler"] ) 36 | self.discriminator_optimizer.load_state_dict( state_dict["discriminator_optimizer"] ) 37 | self.discriminator_scheduler.load_state_dict( state_dict["discriminator_scheduler"] ) 38 | 39 | 40 | 41 | 
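# Editor's usage sketch (not part of the original file): PWGOptimizer bundles an
# RAdam optimizer and a StepLR scheduler for the generator and another pair for the
# discriminator; the training strategy steps each pair separately.  `model` below is
# a hypothetical ParallelWaveGAN instance and `gen_loss` a computed generator loss.
#
#   model = ParallelWaveGAN()
#   opt = PWGOptimizer(model,
#                      generator_optimizer_params={"lr": 1e-4, "eps": 1e-6},
#                      discriminator_optimizer_params={"lr": 5e-5, "eps": 1e-6})
#   opt.generator_optimizer.zero_grad()
#   gen_loss.backward()
#   opt.generator_optimizer.step()
#   opt.generator_scheduler.step()
#   torch.save({"optimizer": opt.state_dict()}, "checkpoint.pt")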
-------------------------------------------------------------------------------- /vocoder/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 4 | 5 | This code is drived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 | """Rectified Adam optimizer.""" 16 | 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 | """Initilize RAdam optimizer.""" 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 | self.buffer = [[None, None, None] for ind in range(10)] 21 | super(RAdam, self).__init__(params, defaults) 22 | 23 | def __setstate__(self, state): 24 | """Set state.""" 25 | super(RAdam, self).__setstate__(state) 26 | 27 | def step(self, closure=None): 28 | """Run one step.""" 29 | loss = None 30 | if closure is not None: 31 | loss = closure() 32 | 33 | for group in self.param_groups: 34 | 35 | for p in group['params']: 36 | if p.grad is None: 37 | continue 38 | grad = p.grad.data.float() 39 | if grad.is_sparse: 40 | raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 | p_data_fp32 = p.data.float() 43 | 44 | state = self.state[p] 45 | 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 | else: 51 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 | beta1, beta2 = group['betas'] 56 | 57 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value = 1 - beta2) 58 | exp_avg.mul_(beta1).add_( grad, alpha = 1 - beta1) 59 | 60 | state['step'] += 1 61 | buffered = self.buffer[int(state['step'] % 10)] 62 | if state['step'] == buffered[0]: 63 | N_sma, step_size = buffered[1], buffered[2] 64 | else: 65 | buffered[0] = state['step'] 66 | beta2_t = beta2 ** state['step'] 67 | N_sma_max = 2 / (1 - beta2) - 1 68 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 | buffered[1] = N_sma 70 | 71 | # more conservative since it's an approximated value 72 | if N_sma >= 5: 73 | step_size = math.sqrt( 74 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA 75 | else: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | buffered[2] = step_size 78 | 79 | if group['weight_decay'] != 0: 80 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /vocoder/preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, glob, tqdm, os, random 3 | from functools import partial 4 | from concurrent.futures import ProcessPoolExecutor 5 | import numpy as np 6 | import torch 7 | 8 | from .audio import TacotronSTFT, load_wav_to_torch 9 | 10 | from vocoder.datasets.utils import 
save_metadata 11 | from vocoder.hparams import Hyperparameter 12 | 13 | 14 | 15 | def mel_transform(wav_files, mel_dir, mel_config, device, min_wav_length): 16 | # device = torch.device( device ) 17 | # transfomer = MelSpectrogram( **mel_config ).to( device ) 18 | taco_stft = TacotronSTFT( **mel_config ) 19 | files = [] 20 | with torch.no_grad(): 21 | for fn in wav_files: 22 | audio, sr = load_wav_to_torch( fn, mel_config['sampling_rate'] ) 23 | if audio.shape[1] < min_wav_length: 24 | print( 'skip {}, sr: {}, length: {}'.format(fn, sr, audio.shape[1]) ) 25 | continue 26 | # audio = audio.to( device ) 27 | mel, _ = taco_stft.mel_spectrogram( audio ) 28 | mel_fn = os.path.join( mel_dir, os.path.basename(fn) + '.mel.npy' ) 29 | np.save( mel_fn, mel[0].cpu().numpy().T ) 30 | files.append( (fn, mel_fn) ) 31 | return files 32 | 33 | 34 | 35 | def preprocess( data_dir, 36 | hparams: Hyperparameter, 37 | temp_dir='temp', 38 | device='cuda:0', 39 | max_workers=4 ): 40 | '''Preprocess for LVC-WaveGAN. 41 | Args: 42 | data_dir (str): the directory containing .wav files. 43 | hparams (Hyperparameter): including parameter for calculating mel-spectrogram. 44 | temp_dir (str): the directory for saving preprocessing results. 45 | device (str): the cuda device for runing preprocessing. 46 | max_workers (int): the number of process worker. 47 | ''' 48 | data_dir = os.path.abspath(data_dir) 49 | temp_dir = os.path.abspath(temp_dir) 50 | mel_dir = os.path.join( temp_dir, 'mels' ) 51 | os.makedirs(mel_dir, exist_ok=True) 52 | mel_config = { 53 | 'sampling_rate': hparams.sample_rate, 54 | 'win_length': hparams.win_length, 55 | 'hop_length': hparams.hop_length, 56 | 'filter_length': hparams.n_fft, 57 | 'mel_fmin': hparams.mel_fmin, 58 | 'mel_fmax': hparams.mel_fmax, 59 | 'n_mel_channels': hparams.n_mels, 60 | } 61 | min_wav_length = hparams.batch_mel_length * hparams.hop_length 62 | 63 | wav_files = glob.glob(f'{data_dir}/**/*.wav', recursive=True) 64 | print('num of wavs:', len(wav_files)) 65 | 66 | batch_size = 100 67 | batch_num = int(np.ceil( len(wav_files) / batch_size )) 68 | batches = [ wav_files[ i*batch_size : (i+1)*batch_size ] for i in range( batch_num ) ] 69 | results = [] 70 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 71 | futures = [ executor.submit( mel_transform, batch, mel_dir, mel_config, 72 | device, min_wav_length ) for batch in batches ] 73 | for f in tqdm.tqdm( futures, desc='Preprocessing', total=batch_num ): 74 | results.extend( f.result() ) 75 | 76 | save_metadata(results, os.path.join(temp_dir, 'metadata.txt')) 77 | 78 | # 产生训练、验证、测试训练集 79 | random.shuffle(results) 80 | save_metadata(results[ : hparams.eval_sample_num ], hparams.eval_metadata_file ) 81 | save_metadata(results[ -hparams.test_sample_num : ], hparams.test_metadata_file ) 82 | save_metadata(results[ hparams.eval_sample_num : -hparams.test_sample_num ], 83 | hparams.train_metadata_file ) 84 | 85 | 86 | 87 | def main(): 88 | parser = argparse.ArgumentParser( 89 | description="Preprocess for LVC-WaveGAN (See detail in vocoder/preprocess.py).") 90 | parser.add_argument("--data-dir", type=str, required=True, 91 | help="the directory containing .wav files") 92 | parser.add_argument("--config", type=str, required=True, 93 | help="yaml format configuration file.") 94 | parser.add_argument("--temp-dir", type=str, default='temp', 95 | help="the directory to save preprocessing results") 96 | parser.add_argument("--max-workers", type=int, default=4, 97 | help="yaml format configuration file.") 98 | 
parser.add_argument("--device", default='cuda:0', type=str, 99 | help="the device for training. (default: cuda:0)") 100 | args = parser.parse_args() 101 | hparams = Hyperparameter( args.config ) 102 | 103 | preprocess( args.data_dir, 104 | hparams, 105 | temp_dir=args.temp_dir, 106 | device=args.device, 107 | max_workers=args.max_workers) 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /vocoder/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from .base import TrainStrategy 5 | from .pwg_strategy import PWGStrategy 6 | 7 | strategy_classes = { 8 | "PWGStrategy": PWGStrategy 9 | } 10 | 11 | 12 | def create_strategy(name, params) -> TrainStrategy: 13 | return strategy_classes[ name ](**params) -------------------------------------------------------------------------------- /vocoder/strategy/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class TrainStrategy: 4 | 5 | def __init__(self): 6 | pass 7 | 8 | def train_step(self, batch, cur_step, model, loss, optimizer): 9 | return {'train_loss': 0} 10 | 11 | def eval_step(self, batch, model, loss): 12 | return {'eval_loss': 0} 13 | 14 | def test_step(self, batch, model): 15 | return {'audio': 0} -------------------------------------------------------------------------------- /vocoder/strategy/pwg_strategy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | 5 | from vocoder.models import ParallelWaveGAN 6 | from vocoder.losses import PWGLoss 7 | from vocoder.optimizers import PWGOptimizer 8 | from .base import TrainStrategy 9 | 10 | 11 | 12 | 13 | class PWGStrategy(TrainStrategy): 14 | 15 | def __init__(self, 16 | lambda_adv=4.0, 17 | discriminator_start_steps=100000, 18 | generator_grad_norm=10, 19 | discriminator_grad_norm=1): 20 | super().__init__() 21 | 22 | self.lambda_adv = lambda_adv 23 | self.discriminator_start_steps = discriminator_start_steps 24 | self.generator_grad_norm = generator_grad_norm 25 | self.discriminator_grad_norm = discriminator_grad_norm 26 | 27 | def train_step(self, batch, step, 28 | model: ParallelWaveGAN, 29 | loss: PWGLoss, 30 | optimizer: PWGOptimizer): 31 | '''Train strategy for Parallel WaveGAN. 32 | Args: 33 | batch (list): the batch data for training model. 34 | [ audio(B,L), mel(B,ML,MC), noise(B,L) ] 35 | step (int): current global step in training process. 36 | model (ParallelWaveGAN): the parallel wavegan model. 37 | loss (PWGLoss): the loss module for parallel wavegan 38 | optimizer (PWGOptimizer): customized optimizer. 39 | Returns: 40 | dict: the loss value dict. 
41 | ''' 42 | device = next(model.parameters()).device 43 | audio, mel, noise = [ x.to(device) for x in batch ] 44 | 45 | ####################### 46 | # Generator # 47 | ####################### 48 | audio_ = model.generator(noise, mel) 49 | 50 | sc_loss, mag_loss = loss.stft_loss( audio, audio_ ) 51 | gen_loss = sc_loss + mag_loss 52 | 53 | adv_loss = torch.zeros(1) 54 | if step > self.discriminator_start_steps: 55 | prob_ = model.discriminator( audio_ ) 56 | adv_loss = loss.adversarial_loss( prob_ ) 57 | gen_loss += self.lambda_adv * adv_loss 58 | 59 | optimizer.generator_optimizer.zero_grad() 60 | gen_loss.backward() 61 | if self.generator_grad_norm > 0: 62 | torch.nn.utils.clip_grad_norm_( 63 | model.generator.parameters(), 64 | self.generator_grad_norm) 65 | optimizer.generator_optimizer.step() 66 | optimizer.generator_scheduler.step() 67 | 68 | ####################### 69 | # Discriminator # 70 | ####################### 71 | real_loss, fake_loss, disc_loss = torch.zeros(1), torch.zeros(1), torch.zeros(1) 72 | if step > self.discriminator_start_steps: 73 | with torch.no_grad(): 74 | audio_ = model.generator( noise, mel ) 75 | prob = model.discriminator( audio ) 76 | prob_ = model.discriminator( audio_.detach() ) 77 | 78 | real_loss, fake_loss = loss.discriminator_loss( prob, prob_ ) 79 | disc_loss = real_loss + fake_loss 80 | 81 | optimizer.discriminator_optimizer.zero_grad() 82 | disc_loss.backward() 83 | if self.discriminator_grad_norm > 0: 84 | torch.nn.utils.clip_grad_norm_( 85 | model.discriminator.parameters(), 86 | self.discriminator_grad_norm) 87 | optimizer.discriminator_optimizer.step() 88 | optimizer.discriminator_scheduler.step() 89 | 90 | return { 91 | "generator_loss": gen_loss.item(), 92 | "spectral_convergence_loss": sc_loss.item(), 93 | "log_stft_magnitude_loss": mag_loss.item(), 94 | "adversarial_loss": adv_loss.item(), 95 | "discriminator_loss": disc_loss.item(), 96 | "real_loss": real_loss.item(), 97 | "fake_loss": fake_loss.item() 98 | } 99 | 100 | @torch.no_grad() 101 | def eval_step(self, batch, 102 | model: ParallelWaveGAN, 103 | loss: PWGLoss): 104 | device = next(model.parameters()).device 105 | audio, mel, noise = [ x.to(device) for x in batch ] 106 | 107 | audio_ = model.generator( noise, mel ) 108 | prob_ = model.discriminator( audio_ ) 109 | prob = model.discriminator( audio ) 110 | 111 | sc_loss, mag_loss = loss.stft_loss( audio, audio_ ) 112 | adv_loss = loss.adversarial_loss( prob_ ) 113 | gen_loss = sc_loss + mag_loss + self.lambda_adv * adv_loss 114 | 115 | real_loss, fake_loss = loss.discriminator_loss( prob, prob_ ) 116 | disc_loss = real_loss + fake_loss 117 | 118 | return { 119 | "generator_loss": gen_loss.item(), 120 | "spectral_convergence_loss": sc_loss.item(), 121 | "log_stft_magnitude_loss": mag_loss.item(), 122 | "adversarial_loss": adv_loss.item(), 123 | "discriminator_loss": disc_loss.item(), 124 | "real_loss": real_loss.item(), 125 | "fake_loss": fake_loss.item() 126 | } 127 | 128 | @torch.no_grad() 129 | def test_step(self, batch, model: ParallelWaveGAN): 130 | device = next(model.parameters()).device 131 | audio, mel, noise = [ x.to(device) for x in batch ] 132 | 133 | audio_ = model.generator( noise, mel ) 134 | return { 'audio' : audio_ } 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /vocoder/test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, yaml, datetime, os, time 3 | 
import yaml, tqdm 4 | from collections import defaultdict 5 | import soundfile 6 | 7 | import torch 8 | from vocoder.datasets import create_dataloader 9 | from vocoder.models import create_model 10 | from vocoder.strategy import create_strategy 11 | from vocoder.utils.log import Logger 12 | from vocoder.hparams import Hyperparameter 13 | 14 | 15 | 16 | class Tester: 17 | 18 | def __init__(self, args, hparams: Hyperparameter): 19 | self.log = Logger(args.exp_dir, tensorboard=False) 20 | 21 | self.exp_dir = args.exp_dir 22 | self.device = torch.device( args.device ) 23 | self.hparams = hparams 24 | 25 | self.model = create_model( hparams.model_name, hparams.model_params, device=self.device ) 26 | self.strategy = create_strategy( hparams.strategy_name, hparams.strategy_params ) 27 | self.restore_checkpoint() 28 | 29 | self.test_result_dir = os.path.join( self.exp_dir, f'test-{self.step}-step' ) 30 | os.makedirs( self.test_result_dir, exist_ok=True ) 31 | 32 | self.train_results = defaultdict(float) 33 | 34 | def restore_checkpoint(self, checkpoint=None): 35 | pt = os.path.join( self.exp_dir, 'checkpoint.pt') 36 | if checkpoint is None and os.path.islink(pt): 37 | checkpoint = os.path.join( self.exp_dir, os.readlink(pt) ) 38 | state_dict = torch.load( checkpoint, map_location='cpu') 39 | self.step = state_dict['step'] 40 | self.model.load_state_dict( state_dict['model'] ) 41 | self.log.info( f"Restore model from {checkpoint}" ) 42 | 43 | def init_dataloader(self): 44 | ''' initialize dataloader for training and evaluate ''' 45 | dataset_config = { 46 | 'metadata_file': self.hparams.test_metadata_file, 47 | 'hop_length': self.hparams.hop_length, 48 | 'sample_rate': self.hparams.sample_rate, 49 | 'batch_mel_length': self.hparams.batch_mel_length, 50 | 'cut': False 51 | } 52 | self.dataloader = create_dataloader( 53 | dataset_classname=self.hparams.dataset_classname, 54 | dataset_config=dataset_config, 55 | batch_size=1, 56 | num_workers=self.hparams.dataset_num_workers, 57 | shuffle=False, 58 | drop_last=False ) 59 | 60 | def run(self): 61 | # 初始化 dataloader 62 | self.init_dataloader() 63 | total_rtf = 0.0 64 | with tqdm.tqdm( self.dataloader, desc= "Test" ) as phbar: 65 | for idx, batch in enumerate( phbar, start=1 ): 66 | st = time.time() 67 | result = self.strategy.test_step( batch, self.model) 68 | tc = time.time() - st 69 | 70 | audio = result['audio'].squeeze(0).squeeze(0).cpu().numpy() 71 | soundfile.write(os.path.join( self.test_result_dir, f"{idx:04d}_gene.wav"), 72 | audio, self.hparams.sample_rate, "PCM_16") 73 | real_audio = batch[0].squeeze(0).squeeze(0).numpy() 74 | soundfile.write(os.path.join( self.test_result_dir, f"{idx:04d}_real.wav"), 75 | real_audio, self.hparams.sample_rate, "PCM_16") 76 | 77 | rtf = tc*self.hparams.sample_rate/len(audio) 78 | total_rtf += rtf 79 | phbar.set_postfix({"RTF": rtf}) 80 | 81 | self.log.info('Average RTF: {}'.format( total_rtf/idx ) ) 82 | self.log.info( f'Test result saving into {self.test_result_dir}' ) 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser( 87 | description="Train LVC-WaveGAN (See detail in vocoder/train.py).") 88 | parser.add_argument("--config", type=str, required=True, 89 | help="yaml format configuration file.") 90 | parser.add_argument("--exp-dir", type=str, required=True, 91 | help="the directory saving expriment data, " 92 | "including model checkpoints, log, results. 
") 93 | parser.add_argument("--checkpoint", default=None, type=str, 94 | help="checkpoint file path to load saving model") 95 | parser.add_argument("--device", default='cuda', type=str, 96 | help="the device for training. (default: cuda:0)") 97 | args = parser.parse_args() 98 | hparams = Hyperparameter( args.config ) 99 | 100 | tester = Tester(args, hparams) 101 | 102 | try: 103 | tester.run() 104 | except KeyboardInterrupt: 105 | pass 106 | 107 | 108 | if __name__ == "__main__": 109 | main() -------------------------------------------------------------------------------- /vocoder/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, yaml, datetime, os 3 | import yaml, tqdm 4 | from collections import defaultdict 5 | 6 | import torch 7 | from vocoder.datasets import create_dataloader 8 | from vocoder.models import create_model 9 | from vocoder.losses import create_loss 10 | from vocoder.optimizers import create_optimizer 11 | from vocoder.strategy import create_strategy 12 | from vocoder.utils.log import Logger 13 | from vocoder.hparams import Hyperparameter 14 | 15 | 16 | 17 | class Trainer: 18 | 19 | def __init__(self, args, hparams: Hyperparameter): 20 | self.log = Logger(args.exp_dir) 21 | 22 | self.exp_dir = args.exp_dir 23 | os.makedirs( self.exp_dir, exist_ok=True ) 24 | 25 | self.device = torch.device( args.device ) 26 | self.hparams = hparams 27 | 28 | self.step = 1 29 | self.epoch = 1 30 | 31 | self.model = create_model( hparams.model_name, hparams.model_params, device=self.device ) 32 | self.loss = create_loss( hparams.loss_name, hparams.loss_params, device=self.device ) 33 | self.optimizer = create_optimizer( hparams.opt_name, self.model, hparams.opt_params ) 34 | self.strategy = create_strategy( hparams.strategy_name, hparams.strategy_params ) 35 | 36 | self.restore_checkpoint(args.restart, args.checkpoint) 37 | 38 | self.train_results = defaultdict(float) 39 | self.num_train_reuslts = 0 40 | 41 | def restore_checkpoint(self, restart=False, checkpoint=None): 42 | if not restart: 43 | try: 44 | pt = os.path.join( self.exp_dir, 'checkpoint.pt') 45 | if checkpoint is None and os.path.islink(pt): 46 | checkpoint = os.path.join( self.exp_dir, os.readlink(pt) ) 47 | if not os.path.isfile( checkpoint ): 48 | print('start new training.') 49 | return 50 | state_dict = torch.load( checkpoint, map_location='cpu') 51 | self.step = state_dict['step'] 52 | self.epoch = state_dict['epoch'] 53 | self.model.load_state_dict( state_dict['model'] ) 54 | self.optimizer.load_state_dict( state_dict['optimizer'] ) 55 | self.log.info( f"Restore model from {checkpoint}") 56 | except: 57 | print('Error in restore model. 
Start New training') 58 | 59 | def save_checkpoint(self): 60 | state_dict = { 61 | "step": self.step, 62 | "epoch": self.epoch, 63 | "optimizer": self.optimizer.state_dict(), 64 | "model": self.model.state_dict() 65 | } 66 | save_path = os.path.join( self.exp_dir, f'checkpoint-{self.step}.pt') 67 | link_path = os.path.join( self.exp_dir, 'checkpoint.pt') 68 | torch.save( state_dict, save_path ) 69 | if os.path.islink(link_path): 70 | os.unlink(link_path) 71 | os.symlink(f'checkpoint-{self.step}.pt', link_path) 72 | self.log.info( f'Save chechpoint as {save_path}' ) 73 | 74 | def init_dataloader(self): 75 | ''' initialize dataloader for training and evaluate ''' 76 | train_dataset_config = { 77 | 'metadata_file': self.hparams.train_metadata_file, 78 | 'hop_length': self.hparams.hop_length, 79 | 'sample_rate': self.hparams.sample_rate, 80 | 'batch_mel_length': self.hparams.batch_mel_length 81 | } 82 | eval_dataset_config = { 83 | 'metadata_file': self.hparams.eval_metadata_file, 84 | 'hop_length': self.hparams.hop_length, 85 | 'sample_rate': self.hparams.sample_rate, 86 | 'batch_mel_length': self.hparams.batch_mel_length 87 | } 88 | self.dataloader = { 89 | "train": create_dataloader( 90 | dataset_classname=self.hparams.dataset_classname, 91 | dataset_config=train_dataset_config, 92 | batch_size=self.hparams.train_batch_size, 93 | num_workers=self.hparams.dataset_num_workers, 94 | shuffle=True, 95 | drop_last=True ), 96 | "eval": create_dataloader( 97 | dataset_classname=self.hparams.dataset_classname, 98 | dataset_config=eval_dataset_config, 99 | batch_size=self.hparams.train_batch_size, 100 | num_workers=1, 101 | shuffle=False, 102 | drop_last=False ) 103 | } 104 | 105 | def train(self): 106 | # 初始化 dataloader 107 | self.init_dataloader() 108 | while True: 109 | with tqdm.tqdm( self.dataloader["train"], desc= f"Train, Epoch: {self.epoch}" ) as tqbar: 110 | for batch in tqbar: 111 | if self.step > self.hparams.max_train_steps: 112 | return 113 | tqbar.set_postfix({"Step": self.step}) 114 | 115 | result = self.strategy.train_step( batch, self.step, self.model, self.loss, self.optimizer ) 116 | 117 | self._check_log(result) 118 | self._check_evaluate() 119 | 120 | self.step += 1 121 | self.epoch += 1 122 | 123 | def evaluate(self): 124 | eval_results = defaultdict(float) 125 | for batch in tqdm.tqdm(self.dataloader["eval"], desc= f"Evaluate"): 126 | result = self.strategy.eval_step( batch, self.model, self.loss ) 127 | for k in result: 128 | eval_results[k] += result[k] 129 | 130 | self.log.info( f'Step {self.step}, Evaluate results:') 131 | for k in eval_results: 132 | v = eval_results[k] / len( self.dataloader["eval"] ) 133 | self.log.add_scalar( f'evaluate/{k}', v, self.step) 134 | self.log.info( f' {k}: {v:.4f}' ) 135 | 136 | self.log.flush() 137 | 138 | def _check_evaluate(self): 139 | if self.step % self.hparams.eval_interval_steps == 0: 140 | self.model.eval() 141 | self.evaluate() 142 | self.model.train() 143 | 144 | def _check_log(self, train_result): 145 | for k in train_result: 146 | self.train_results[k] += train_result[k] 147 | self.num_train_reuslts += 1 148 | 149 | if self.step % self.hparams.log_interval_steps == 0: 150 | for k in self.train_results: 151 | v = self.train_results[k] / self.num_train_reuslts 152 | self.log.add_scalar( f'train/{k}', v, self.step) 153 | 154 | self.train_results = defaultdict(float) 155 | self.num_train_reuslts = 0 156 | 157 | if self.step % self.hparams.save_interval_steps == 0: 158 | self.save_checkpoint() 159 | 160 | 161 | def check_args(args, 
hparams: Hyperparameter): 162 | if args.exp_dir is None: 163 | args.exp_dir = os.path.join('exps', datetime.datetime.now().strftime('exp-%Y%m%d-%H%M%S') ) 164 | 165 | # 保存配置文件 166 | hparams.save_config( os.path.join( args.exp_dir, 'config.yaml' ) ) 167 | 168 | # 是否需要进行数据预处理 169 | if args.preprocess or not os.path.isfile( hparams.train_metadata_file ): 170 | if args.data_dir is None: 171 | raise RuntimeError('Must provide data directory for training.') 172 | from vocoder.preprocess import preprocess 173 | preprocess(args.data_dir, hparams, args.temp_dir, args.device) 174 | 175 | 176 | 177 | def main(): 178 | parser = argparse.ArgumentParser( 179 | description="Train LVC-WaveGAN (See detail in vocoder/train.py).") 180 | parser.add_argument("--config", type=str, required=True, 181 | help="yaml format configuration file.") 182 | parser.add_argument("--exp-dir", default=None, type=str, 183 | help="the directory for saving expriment data, " 184 | "including model checkpoints, log, results. ") 185 | parser.add_argument("--data-dir", default=None, type=str, 186 | help="the directory containing .wav files for training") 187 | parser.add_argument("--temp-dir", default='temp', type=str, 188 | help="the directory containing preprocess results") 189 | parser.add_argument("--restart", action="store_true", default=False, 190 | help="Whether to restart a new training") 191 | parser.add_argument("--preprocess", action="store_true", default=False, 192 | help="Whether force to preprocess data") 193 | parser.add_argument("--checkpoint", default=None, type=str, 194 | help="checkpoint file path to load saving model") 195 | parser.add_argument("--device", default='cuda', type=str, 196 | help="the device for training. (default: cuda:0)") 197 | args = parser.parse_args() 198 | hparams = Hyperparameter( args.config ) 199 | 200 | check_args( args, hparams ) 201 | 202 | trainer = Trainer(args, hparams) 203 | 204 | try: 205 | trainer.train() 206 | except KeyboardInterrupt: 207 | trainer.save_checkpoint() 208 | trainer.log.flush() 209 | 210 | 211 | if __name__ == "__main__": 212 | main() -------------------------------------------------------------------------------- /vocoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/utils/__init__.py -------------------------------------------------------------------------------- /vocoder/utils/log.py: -------------------------------------------------------------------------------- 1 | 2 | from functools import wraps 3 | import logging, os, time, sys 4 | from logging import DEBUG, INFO, WARN, ERROR 5 | 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | logging.basicConfig( 9 | stream=sys.stdout, 10 | format='[ %(levelname)s ] %(message)s', 11 | level=DEBUG) 12 | 13 | class Logger: 14 | 15 | def __init__(self, log_dir, level=DEBUG, tensorboard=True): 16 | os.makedirs( log_dir, exist_ok=True ) 17 | 18 | self.logger = logging.getLogger('log') 19 | self.logger.setLevel(level) 20 | handler = logging.FileHandler( 21 | os.path.join( log_dir, time.strftime('%Y%m%d-%H%M%S.log') ), 22 | mode='w', 23 | encoding='utf-8') 24 | handler.setFormatter( logging.Formatter('[ %(levelname)s, %(asctime)s ] %(message)s') ) 25 | self.logger.addHandler( handler ) 26 | self.handler = handler 27 | 28 | if tensorboard: 29 | self.tbwriter = SummaryWriter(log_dir) 30 | 31 | def add_scalar(self, tag, value, step): 32 | self.tbwriter.add_scalar(tag, 
value, step) 33 | 34 | def info(self, *msg, **kwargs): 35 | self.logger.info( *msg, **kwargs ) 36 | 37 | def warn(self, *msg, **kwargs): 38 | self.logger.warn( *msg, **kwargs ) 39 | 40 | def error(self, *msg, **kwargs): 41 | self.logger.error( *msg, **kwargs ) 42 | 43 | def debug(self, *msg, **kwargs): 44 | self.logger.debug( *msg, **kwargs ) 45 | 46 | def flush(self): 47 | self.tbwriter.flush() 48 | self.handler.flush() 49 | 50 | 51 | 52 | --------------------------------------------------------------------------------
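# Editor's usage sketch for vocoder/utils/log.py (not part of the original file):
# Logger writes messages to a timestamped log file under log_dir and, when
# tensorboard=True, mirrors scalars to a TensorBoard SummaryWriter in the same
# directory.  The experiment directory below is hypothetical.
#
#   log = Logger("exps/exp-demo")
#   log.info("start training")
#   log.add_scalar("train/generator_loss", 1.23, 100)
#   log.flush()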