├── exp └── gitkeep ├── ddsp ├── __init__.py ├── mel2control.py ├── loss.py ├── model_conformer_naive.py ├── core.py └── vocoder.py ├── logger ├── __init__.py ├── saver.py └── utils.py ├── data ├── val │ └── audio │ │ └── gitkeep └── train │ └── audio │ └── gitkeep ├── requirements.txt ├── LICENSE ├── configs ├── sins.yaml └── combsub.yaml ├── DiffSinger.md ├── export.py ├── train.py ├── main.py ├── README.md ├── solver.py ├── preprocess.py └── data_loaders.py /exp/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ddsp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/val/audio/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/train/audio/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gin 2 | gin_config 3 | librosa 4 | numpy 5 | praat-parselmouth 6 | pyworld 7 | PyYAML 8 | SoundFile 9 | tqdm 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 yxlllc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /configs/sins.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | f0_extractor: 'parselmouth' # 'parselmouth' (singing) or 'dio' (speech) or 'harvest' (speech) 3 | f0_min: 65 # about C2 4 | f0_max: 800 # about G5 5 | sampling_rate: 44100 6 | n_fft: 2048 7 | win_length: 2048 8 | block_size: 512 # Equal to hop_length 9 | n_mels: 128 10 | mel_fmin: 40 11 | mel_fmax: 16000 # <= sampling_rate / 2 12 | duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip 13 | train_path: data/train # Create a folder named "audio" under this path and put the audio clip in it 14 | valid_path: data/val # Create a folder named "audio" under this path and put the audio clip in it 15 | model: 16 | type: 'Sins' 17 | win_length: 2048 18 | use_mean_filter: true 19 | n_harmonics: 128 20 | n_mag_noise: 256 21 | loss: 22 | fft_min: 256 23 | fft_max: 2048 24 | n_scale: 4 # rss kernel numbers 25 | lambda_uv: 1.0 # uv regularization 26 | uv_tolerance: 0.05 # set it to a large value or try other f0 extractors if val_loss_uv is much higher than train_loss_uv 27 | detach_uv_step: 2000 28 | device: cuda 29 | env: 30 | expdir: exp/sins-test 31 | gpu_id: 0 32 | train: 33 | num_workers: 2 # if your cpu and gpu are both very strong, set to 0 may be faster! 34 | batch_size: 24 35 | cache_all_data: true # Save Internal-Memory if it is false, but may be slow 36 | epochs: 100000 37 | interval_log: 10 38 | interval_val: 2000 39 | lr: 0.0005 40 | weight_decay: 0 41 | save_opt: false 42 | -------------------------------------------------------------------------------- /configs/combsub.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | f0_extractor: 'parselmouth' # 'parselmouth' (singing) or 'dio' (speech) or 'harvest' (speech) 3 | f0_min: 65 # about C2 4 | f0_max: 800 # about G5 5 | sampling_rate: 44100 6 | n_fft: 2048 7 | win_length: 2048 8 | block_size: 512 # Equal to hop_length 9 | n_mels: 128 10 | mel_fmin: 40 11 | mel_fmax: 16000 # <= sampling_rate / 2 12 | duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip 13 | train_path: data/train # Create a folder named "audio" under this path and put the audio clip in it 14 | valid_path: data/val # Create a folder named "audio" under this path and put the audio clip in it 15 | model: 16 | type: 'CombSub' 17 | win_length: 2048 18 | use_mean_filter: true 19 | n_mag_harmonic: 512 20 | n_mag_noise: 256 21 | loss: 22 | fft_min: 256 23 | fft_max: 2048 24 | n_scale: 4 # rss kernel numbers 25 | lambda_uv: 1.0 # uv regularization 26 | uv_tolerance: 0.05 # set it to a large value or try other f0 extractors if val_loss_uv is much higher than train_loss_uv 27 | detach_uv_step: 2000 28 | device: cuda 29 | env: 30 | expdir: exp/combsub-test 31 | gpu_id: 0 32 | train: 33 | num_workers: 2 # if your cpu and gpu are both very strong, set to 0 may be faster! 
34 | batch_size: 24 35 | cache_all_data: true # Save Internal-Memory if it is false, but may be slow 36 | epochs: 100000 37 | interval_log: 10 38 | interval_val: 2000 39 | lr: 0.0005 40 | weight_decay: 0 41 | save_opt: false -------------------------------------------------------------------------------- /DiffSinger.md: -------------------------------------------------------------------------------- 1 | # Use DDSP Vocoders in DiffSinger (OpenVPI version) 2 | Suppose you have already trained a model called `exp/combsub-test/model_100000.pt` using the code in this repository, run 3 | ```bash 4 | python export.py -m exp/combsub-test/model_100000.pt --traced 5 | ``` 6 | This will create a `.jit` format model file in the same directory. 7 | 8 | Then, move this `.jit` model file and the `config.yaml` together to the `checkpoints/ddsp` directory of the [**DiffSinger**](https://github.com/openvpi/DiffSinger) repository. 9 | 10 | Finally, edit the [**`configs/acoustic.yaml`**](https://github.com/openvpi/DiffSinger/blob/main/configs/acoustic.yaml) file in the [**DiffSinger**](https://github.com/openvpi/DiffSinger) repository to enable the DDSP vocoder. the details are: 11 | 1. Set the `vocoder` option to `DDSP`. 12 | 2. Set the `vocoder_ckpt` option to the path of the `.jit` model. An example may be `checkpoints/ddsp/model_100000-traced-torch1.9.1.jit` 13 | 3. Check whether other mel related parameters match the parameters in the `checkpoints/ddsp/config.yaml` file. For the details, the `audio_sample_rate`,`audio_num_mel_bins`,`hop_size`,`fft_size`,`win_size`,`fmin` and `fmax` in the [**`configs/acoustic.yaml`**](https://github.com/openvpi/DiffSinger/blob/main/configs/acoustic.yaml) need to match `sampling_rate`, `n_mels`, `block_size`, `n_fft`, `win_length`,`mel_fmin` and `mel_fmax` in the `checkpoints/ddsp/config.yaml`, respectively. 14 | 15 | After doing all this, [**DiffSinger**](https://github.com/openvpi/DiffSinger)'s default NSF-HiFiGAN vocoder has been replaced by your own trained DDSP vocoder, and you can perform preprocessing, training or inference normally. 
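A quick way to verify the parameter mapping above is to load both YAML files and compare the paired keys. Below is a minimal sketch (not part of this repository), assuming both files are readable from the DiffSinger working directory and that the mel-related keys of `configs/acoustic.yaml` sit at the top level; adjust the paths to your setup.

```python
# Sketch: check that DiffSinger's mel settings match the DDSP vocoder config.
import yaml

with open('checkpoints/ddsp/config.yaml') as f:
    ddsp = yaml.safe_load(f)['data']  # 'data' section of the DDSP training config
with open('configs/acoustic.yaml') as f:
    acoustic = yaml.safe_load(f)      # DiffSinger acoustic config (assumed flat)

# (DiffSinger key, DDSP key) pairs, following the mapping listed above
pairs = [
    ('audio_sample_rate', 'sampling_rate'),
    ('audio_num_mel_bins', 'n_mels'),
    ('hop_size', 'block_size'),
    ('fft_size', 'n_fft'),
    ('win_size', 'win_length'),
    ('fmin', 'mel_fmin'),
    ('fmax', 'mel_fmax'),
]
for ds_key, ddsp_key in pairs:
    if acoustic.get(ds_key) != ddsp.get(ddsp_key):
        print(f'Mismatch: {ds_key}={acoustic.get(ds_key)} vs {ddsp_key}={ddsp.get(ddsp_key)}')
```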
16 | -------------------------------------------------------------------------------- /ddsp/mel2control.py: -------------------------------------------------------------------------------- 1 | import gin 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.utils import weight_norm 7 | 8 | from .model_conformer_naive import ConformerNaiveEncoder 9 | 10 | 11 | def split_to_dict(tensor, tensor_splits): 12 | """Split a tensor into a dictionary of multiple tensors.""" 13 | labels = [] 14 | sizes = [] 15 | 16 | for k, v in tensor_splits.items(): 17 | labels.append(k) 18 | sizes.append(v) 19 | 20 | tensors = torch.split(tensor, sizes, dim=-1) 21 | return dict(zip(labels, tensors)) 22 | 23 | 24 | class Mel2Control(nn.Module): 25 | def __init__( 26 | self, 27 | n_mels, 28 | block_size, 29 | output_splits): 30 | super().__init__() 31 | self.output_splits = output_splits 32 | self.mel_emb = nn.Linear(n_mels, 256) 33 | self.stack = nn.Sequential( 34 | weight_norm(nn.Conv1d(2 * block_size, 512, 3, 1, 1)), 35 | nn.PReLU(num_parameters=512), 36 | weight_norm(nn.Conv1d(512, 256, 3, 1, 1))) 37 | self.decoder = ConformerNaiveEncoder( 38 | num_layers=3, 39 | num_heads=8, 40 | dim_model=256, 41 | use_norm=False, 42 | conv_only=True, 43 | conv_dropout=0, 44 | atten_dropout=0.1) 45 | self.norm = nn.LayerNorm(256) 46 | self.n_out = sum([v for k, v in output_splits.items()]) 47 | self.dense_out = weight_norm(nn.Linear(256, self.n_out)) 48 | 49 | def forward(self, mel, source, noise): 50 | 51 | ''' 52 | input: 53 | B x n_frames x n_mels 54 | return: 55 | dict of B x n_frames x feat 56 | ''' 57 | exciter = torch.cat((source, noise), dim=-1).transpose(1,2) 58 | x = self.mel_emb(mel) + self.stack(exciter).transpose(1,2) 59 | x = self.decoder(x) 60 | x = self.norm(x) 61 | e = self.dense_out(x) 62 | controls = split_to_dict(e, self.output_splits) 63 | 64 | return controls 65 | 66 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import torch 5 | 6 | from ddsp.vocoder import load_model 7 | 8 | 9 | class DDSPWrapper(torch.nn.Module): 10 | def __init__(self, module, device): 11 | super().__init__() 12 | self.model = module 13 | self.to(device) 14 | 15 | def forward(self, mel, f0): 16 | f0 = f0[..., None] 17 | signal, _, (s_h, s_n) = self.model(mel, f0) 18 | return signal, s_h, s_n 19 | 20 | 21 | def parse_args(args=None, namespace=None): 22 | parser = argparse.ArgumentParser( 23 | description='Export model to standalone PyTorch traced module or ONNX format' 24 | ) 25 | parser.add_argument( 26 | '-m', 27 | '--model_path', 28 | type=str, 29 | required=True, 30 | help='path to model file' 31 | ) 32 | parser.add_argument( 33 | '--traced', 34 | required=False, 35 | action='store_true', 36 | help='export to traced module format' 37 | ) 38 | parser.add_argument( 39 | '--onnx', 40 | required=False, 41 | action='store_true', 42 | help='export to ONNX format' 43 | ) 44 | cmd = parser.parse_args(args=args, namespace=namespace) 45 | if not cmd.traced and not cmd.onnx: 46 | parser.error('either --traced or --onnx should be specified.') 47 | return cmd 48 | 49 | 50 | def main(): 51 | device = 'cpu' 52 | # parse commands 53 | cmd = parse_args() 54 | 55 | # load model 56 | model, args = load_model(cmd.model_path, device=device) 57 | #model = DDSPWrapper(model, device) 58 | 59 | # extract model dirname and filename 60 | directory = 
os.path.dirname(os.path.abspath(cmd.model_path)) 61 | name = os.path.basename(cmd.model_path).rsplit('.', maxsplit=1)[0] 62 | 63 | # load input 64 | n_mel_channels = args.data.n_mels 65 | n_frames = 10 66 | mel = torch.randn((1, n_frames, n_mel_channels), dtype=torch.float32, device=device) 67 | f0 = torch.FloatTensor([[440.] * n_frames]).to(device) 68 | f0 = f0[..., None] 69 | 70 | # export model 71 | with torch.no_grad(): 72 | if cmd.traced: 73 | torch_version = torch.version.__version__.rsplit('+', maxsplit=1)[0] 74 | export_path = os.path.join(directory, f'{name}-traced-torch{torch_version}.jit') 75 | print(f' [Tracing] {cmd.model_path} => {export_path}') 76 | model = torch.jit.trace( 77 | model, 78 | ( 79 | mel, 80 | f0 81 | ), 82 | check_trace=False 83 | ) 84 | torch.jit.save(model, export_path) 85 | 86 | if cmd.onnx: 87 | raise NotImplementedError('Exporting to ONNX format is not supported yet.') 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | 5 | from logger import utils 6 | from data_loaders import get_data_loaders 7 | from solver import train 8 | from ddsp.vocoder import Sins, CombSub 9 | from ddsp.loss import HybridLoss 10 | 11 | 12 | def parse_args(args=None, namespace=None): 13 | """Parse command-line arguments.""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "-c", 17 | "--config", 18 | type=str, 19 | required=True, 20 | help="path to the config file") 21 | return parser.parse_args(args=args, namespace=namespace) 22 | 23 | 24 | if __name__ == '__main__': 25 | # parse commands 26 | cmd = parse_args() 27 | 28 | # load config 29 | args = utils.load_config(cmd.config) 30 | print(' > config:', cmd.config) 31 | print(' > exp:', args.env.expdir) 32 | 33 | # load model 34 | model = None 35 | 36 | if args.model.type == 'Sins': 37 | model = Sins( 38 | sampling_rate=args.data.sampling_rate, 39 | block_size=args.data.block_size, 40 | win_length=args.model.win_length, 41 | use_mean_filter=args.model.use_mean_filter, 42 | n_harmonics=args.model.n_harmonics, 43 | n_mag_noise=args.model.n_mag_noise, 44 | n_mels=args.data.n_mels) 45 | 46 | elif args.model.type == 'CombSub': 47 | model = CombSub( 48 | sampling_rate=args.data.sampling_rate, 49 | block_size=args.data.block_size, 50 | win_length=args.model.win_length, 51 | use_mean_filter=args.model.use_mean_filter, 52 | n_mag_harmonic=args.model.n_mag_harmonic, 53 | n_mag_noise=args.model.n_mag_noise, 54 | n_mels=args.data.n_mels) 55 | 56 | else: 57 | raise ValueError(f" [x] Unknown Model: {args.model.type}") 58 | 59 | # load parameters 60 | optimizer = torch.optim.AdamW(model.parameters()) 61 | initial_global_step, model, optimizer = utils.load_model(args.env.expdir, model, optimizer, device=args.device) 62 | for param_group in optimizer.param_groups: 63 | param_group['lr'] = args.train.lr 64 | param_group['weight_decay'] = args.train.weight_decay 65 | 66 | # loss 67 | loss_func = HybridLoss(args.data.block_size, args.loss.fft_min, args.loss.fft_max, args.loss.n_scale, args.loss.lambda_uv, args.device) 68 | 69 | # device 70 | if args.device == 'cuda': 71 | torch.cuda.set_device(args.env.gpu_id) 72 | model.to(args.device) 73 | 74 | for state in optimizer.state.values(): 75 | for k, v in state.items(): 76 | if torch.is_tensor(v): 77 | state[k] = v.to(args.device) 78 | 79 | 
loss_func.to(args.device) 80 | 81 | # datas 82 | loader_train, loader_valid = get_data_loaders(args, whole_audio=False) 83 | 84 | # run 85 | train(args, initial_global_step, model, optimizer, loss_func, loader_train, loader_valid) 86 | 87 | -------------------------------------------------------------------------------- /logger/saver.py: -------------------------------------------------------------------------------- 1 | ''' 2 | author: wayn391@mastertones 3 | ''' 4 | 5 | import os 6 | import json 7 | import time 8 | import yaml 9 | import datetime 10 | import torch 11 | 12 | from . import utils 13 | from torch.utils.tensorboard import SummaryWriter 14 | 15 | class Saver(object): 16 | def __init__( 17 | self, 18 | args, 19 | initial_global_step=-1): 20 | 21 | self.expdir = args.env.expdir 22 | self.sample_rate = args.data.sampling_rate 23 | 24 | # cold start 25 | self.global_step = initial_global_step 26 | self.init_time = time.time() 27 | self.last_time = time.time() 28 | 29 | # makedirs 30 | os.makedirs(self.expdir, exist_ok=True) 31 | 32 | # path 33 | self.path_log_info = os.path.join(self.expdir, 'log_info.txt') 34 | 35 | # ckpt 36 | os.makedirs(self.expdir, exist_ok=True) 37 | 38 | # writer 39 | self.writer = SummaryWriter(os.path.join(self.expdir, 'logs')) 40 | 41 | # save config 42 | path_config = os.path.join(self.expdir, 'config.yaml') 43 | with open(path_config, "w") as out_config: 44 | yaml.dump(dict(args), out_config) 45 | 46 | 47 | def log_info(self, msg): 48 | '''log method''' 49 | if isinstance(msg, dict): 50 | msg_list = [] 51 | for k, v in msg.items(): 52 | tmp_str = '' 53 | if isinstance(v, int): 54 | tmp_str = '{}: {:,}'.format(k, v) 55 | else: 56 | tmp_str = '{}: {}'.format(k, v) 57 | 58 | msg_list.append(tmp_str) 59 | msg_str = '\n'.join(msg_list) 60 | else: 61 | msg_str = msg 62 | 63 | # dsplay 64 | print(msg_str) 65 | 66 | # save 67 | with open(self.path_log_info, 'a') as fp: 68 | fp.write(msg_str+'\n') 69 | 70 | def log_value(self, dict): 71 | for k, v in dict.items(): 72 | self.writer.add_scalar(k, v, self.global_step) 73 | 74 | def log_audio(self, dict): 75 | for k, v in dict.items(): 76 | self.writer.add_audio(k, v, global_step=self.global_step, sample_rate=self.sample_rate) 77 | 78 | def get_interval_time(self, update=True): 79 | cur_time = time.time() 80 | time_interval = cur_time - self.last_time 81 | if update: 82 | self.last_time = cur_time 83 | return time_interval 84 | 85 | def get_total_time(self, to_str=True): 86 | total_time = time.time() - self.init_time 87 | if to_str: 88 | total_time = str(datetime.timedelta( 89 | seconds=total_time))[:-5] 90 | return total_time 91 | 92 | def save_model( 93 | self, 94 | model, 95 | optimizer, 96 | name='model', 97 | postfix='', 98 | to_json=False): 99 | # path 100 | if postfix: 101 | postfix = '_' + postfix 102 | path_pt = os.path.join( 103 | self.expdir , name+postfix+'.pt') 104 | 105 | # check 106 | print(' [*] model checkpoint saved: {}'.format(path_pt)) 107 | 108 | # save 109 | if optimizer is not None: 110 | torch.save({ 111 | 'global_step': self.global_step, 112 | 'model': model.state_dict(), 113 | 'optimizer': optimizer.state_dict()}, path_pt) 114 | else: 115 | torch.save({ 116 | 'global_step': self.global_step, 117 | 'model': model.state_dict()}, path_pt) 118 | 119 | # to json 120 | if to_json: 121 | path_json = os.path.join( 122 | self.expdir , name+'.json') 123 | utils.to_json(path_params, path_json) 124 | 125 | def global_step_increment(self): 126 | self.global_step += 1 127 | 128 | 129 | 
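The `Saver` above is normally driven by `solver.train()`. As a rough usage sketch (illustration only, with a hypothetical `model` and `optimizer` already built as in `train.py`), it writes `<expdir>/config.yaml`, a tensorboard log directory under `<expdir>/logs`, and checkpoints named `model_<step>.pt`:

```python
# Usage sketch for logger.saver.Saver (not part of saver.py itself).
from logger import utils
from logger.saver import Saver

args = utils.load_config('configs/combsub.yaml')
saver = Saver(args, initial_global_step=0)  # creates expdir, config.yaml, tensorboard writer
saver.log_info({'batch_size': args.train.batch_size, 'lr': args.train.lr})

# inside a training loop (model/optimizer assumed to exist):
# saver.global_step_increment()
# saver.log_value({'train/loss': 0.123})
# saver.save_model(model, optimizer, postfix=f'{saver.global_step}')
```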
-------------------------------------------------------------------------------- /logger/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | import pickle 5 | import torch 6 | 7 | def traverse_dir( 8 | root_dir, 9 | extension, 10 | amount=None, 11 | str_include=None, 12 | str_exclude=None, 13 | is_pure=False, 14 | is_sort=False, 15 | is_ext=True): 16 | 17 | file_list = [] 18 | cnt = 0 19 | for root, _, files in os.walk(root_dir): 20 | for file in files: 21 | if file.endswith(extension): 22 | # path 23 | mix_path = os.path.join(root, file) 24 | pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path 25 | 26 | # amount 27 | if (amount is not None) and (cnt == amount): 28 | if is_sort: 29 | file_list.sort() 30 | return file_list 31 | 32 | # check string 33 | if (str_include is not None) and (str_include not in pure_path): 34 | continue 35 | if (str_exclude is not None) and (str_exclude in pure_path): 36 | continue 37 | 38 | if not is_ext: 39 | ext = pure_path.split('.')[-1] 40 | pure_path = pure_path[:-(len(ext)+1)] 41 | file_list.append(pure_path) 42 | cnt += 1 43 | if is_sort: 44 | file_list.sort() 45 | return file_list 46 | 47 | 48 | 49 | class DotDict(dict): 50 | def __getattr__(*args): 51 | val = dict.get(*args) 52 | return DotDict(val) if type(val) is dict else val 53 | 54 | __setattr__ = dict.__setitem__ 55 | __delattr__ = dict.__delitem__ 56 | 57 | 58 | def get_network_paras_amount(model_dict): 59 | info = dict() 60 | for model_name, model in model_dict.items(): 61 | # all_params = sum(p.numel() for p in model.parameters()) 62 | trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 63 | 64 | info[model_name] = trainable_params 65 | return info 66 | 67 | 68 | def load_config(path_config): 69 | with open(path_config, "r") as config: 70 | args = yaml.safe_load(config) 71 | args = DotDict(args) 72 | # print(args) 73 | return args 74 | 75 | 76 | def to_json(path_params, path_json): 77 | params = torch.load(path_params, map_location=torch.device('cpu')) 78 | raw_state_dict = {} 79 | for k, v in params.items(): 80 | val = v.flatten().numpy().tolist() 81 | raw_state_dict[k] = val 82 | 83 | with open(path_json, 'w') as outfile: 84 | json.dump(raw_state_dict, outfile,indent= "\t") 85 | 86 | 87 | def convert_tensor_to_numpy(tensor, is_squeeze=True): 88 | if is_squeeze: 89 | tensor = tensor.squeeze() 90 | if tensor.requires_grad: 91 | tensor = tensor.detach() 92 | if tensor.is_cuda: 93 | tensor = tensor.cpu() 94 | return tensor.numpy() 95 | 96 | 97 | def load_model( 98 | expdir, 99 | model, 100 | optimizer, 101 | name='model', 102 | postfix='', 103 | device='cpu'): 104 | if postfix == '': 105 | postfix = '_' + postfix 106 | path = os.path.join(expdir, name+postfix) 107 | path_pt = traverse_dir(expdir, '.pt', is_ext=False) 108 | global_step = 0 109 | if len(path_pt) > 0: 110 | steps = [s[len(path):] for s in path_pt] 111 | maxstep = max([int(s) if s.isdigit() else 0 for s in steps]) 112 | if maxstep > 0: 113 | path_pt = path+str(maxstep)+'.pt' 114 | else: 115 | path_pt = path+'best.pt' 116 | print(' [*] restoring model from', path_pt) 117 | ckpt = torch.load(path_pt, map_location=torch.device(device)) 118 | global_step = ckpt['global_step'] 119 | model.load_state_dict(ckpt['model']) 120 | if ckpt.get('optimizer') != None: 121 | optimizer.load_state_dict(ckpt['optimizer']) 122 | return global_step, model, optimizer 123 | 
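For reference, the two helpers most scripts rely on are `load_config`, which wraps the YAML into a `DotDict` so nested keys are reachable as attributes, and `load_model`, which scans `expdir` for `model_<step>.pt` files and restores the one with the highest step (falling back to `model_best.pt`). A minimal sketch, assuming a checkpoint directory produced by training:

```python
# Sketch: attribute-style config access and checkpoint resumption (illustration only).
from logger import utils

args = utils.load_config('configs/combsub.yaml')
print(args.data.sampling_rate, args.train.batch_size)  # e.g. 44100 24

# model and optimizer must already be constructed, as in train.py:
# initial_global_step, model, optimizer = utils.load_model(
#     args.env.expdir, model, optimizer, device=args.device)
```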
-------------------------------------------------------------------------------- /ddsp/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchaudio 6 | from torch.nn import functional as F 7 | from .core import upsample 8 | 9 | class HybridLoss(nn.Module): 10 | def __init__(self, block_size, fft_min, fft_max, n_scale, lambda_uv, device): 11 | super().__init__() 12 | self.loss_rss_func = RSSLoss(fft_min, fft_max, n_scale, device = device) 13 | self.loss_uv_func = UVLoss(block_size) 14 | self.lambda_uv = lambda_uv 15 | 16 | def forward(self, signal, s_h, x_true, uv_true, detach_uv=False, uv_tolerance=0.05, prefix='train/'): 17 | loss_rss = self.loss_rss_func(signal, x_true) 18 | loss_uv = self.loss_uv_func(signal, s_h, uv_true) 19 | if detach_uv or loss_uv < uv_tolerance: 20 | loss_uv = loss_uv.detach() 21 | loss = loss_rss + self.lambda_uv * loss_uv 22 | loss_dict = {prefix+'loss': loss.item(), prefix+'loss_rss': loss_rss.item(), prefix+'loss_uv': loss_uv.item()} 23 | return loss, loss_dict 24 | 25 | class UVLoss(nn.Module): 26 | def __init__(self, block_size, eps = 1e-5): 27 | super().__init__() 28 | self.block_size = block_size 29 | self.eps = eps 30 | 31 | def forward(self, signal, s_h, uv_true): 32 | uv_mask = upsample(uv_true.unsqueeze(-1), self.block_size).squeeze(-1) 33 | loss = torch.mean(torch.linalg.norm(s_h * uv_mask, dim = 1) / (torch.linalg.norm(signal * uv_mask , dim = 1) + self.eps)) 34 | return loss 35 | 36 | class SSSLoss(nn.Module): 37 | """ 38 | Single-scale Spectral Loss. 39 | """ 40 | 41 | def __init__(self, n_fft=111, alpha=1.0, overlap=0, eps=1e-7): 42 | super().__init__() 43 | self.n_fft = n_fft 44 | self.alpha = alpha 45 | self.eps = eps 46 | self.hop_length = int(n_fft * (1 - overlap)) # 25% of the length 47 | self.spec = torchaudio.transforms.Spectrogram(n_fft=self.n_fft, hop_length=self.hop_length, power=1, normalized=True, center=False) 48 | 49 | def forward(self, x_true, x_pred): 50 | S_true = self.spec(x_true) + self.eps 51 | S_pred = self.spec(x_pred) + self.eps 52 | 53 | converge_term = torch.mean(torch.linalg.norm(S_true - S_pred, dim = (1, 2)) / torch.linalg.norm(S_true + S_pred, dim = (1, 2))) 54 | 55 | log_term = F.l1_loss(S_true.log(), S_pred.log()) 56 | 57 | loss = converge_term + self.alpha * log_term 58 | return loss 59 | 60 | 61 | class MSSLoss(nn.Module): 62 | """ 63 | Multi-scale Spectral Loss. 64 | Usage :: 65 | mssloss = MSSLoss([2048, 1024, 512, 256], alpha=1.0, overlap=0.75) 66 | mssloss(y_pred, y_gt) 67 | input(y_pred, y_gt) : two of torch.tensor w/ shape(batch, 1d-wave) 68 | output(loss) : torch.tensor(scalar) 69 | 70 | 48k: n_ffts=[2048, 1024, 512, 256] 71 | 24k: n_ffts=[1024, 512, 256, 128] 72 | """ 73 | 74 | def __init__(self, n_ffts, alpha=1.0, overlap=0.75, eps=1e-7): 75 | super().__init__() 76 | self.losses = nn.ModuleList([SSSLoss(n_fft, alpha, overlap, eps) for n_fft in n_ffts]) 77 | 78 | def forward(self, x_pred, x_true): 79 | x_pred = x_pred[..., :x_true.shape[-1]] 80 | value = 0. 81 | for loss in self.losses: 82 | value += loss(x_true, x_pred) 83 | return value 84 | 85 | class RSSLoss(nn.Module): 86 | ''' 87 | Random-scale Spectral Loss. 
88 | ''' 89 | 90 | def __init__(self, fft_min, fft_max, n_scale, alpha=1.0, overlap=0, eps=1e-7, device='cuda'): 91 | super().__init__() 92 | self.fft_min = fft_min 93 | self.fft_max = fft_max 94 | self.n_scale = n_scale 95 | self.lossdict = {} 96 | for n_fft in range(fft_min, fft_max): 97 | self.lossdict[n_fft] = SSSLoss(n_fft, alpha, overlap, eps).to(device) 98 | 99 | def forward(self, x_pred, x_true): 100 | value = 0. 101 | n_ffts = torch.randint(self.fft_min, self.fft_max, (self.n_scale,)) 102 | for n_fft in n_ffts: 103 | loss_func = self.lossdict[int(n_fft)] 104 | value += loss_func(x_true, x_pred) 105 | return value / self.n_scale 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import librosa 4 | import argparse 5 | import numpy as np 6 | import soundfile as sf 7 | import pyworld as pw 8 | import parselmouth 9 | from ddsp.vocoder import load_model, Audio2Mel 10 | 11 | def parse_args(args=None, namespace=None): 12 | """Parse command-line arguments.""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-m", 16 | "--model_path", 17 | type=str, 18 | required=True, 19 | help="path to the model file", 20 | ) 21 | parser.add_argument( 22 | "-i", 23 | "--input", 24 | type=str, 25 | required=True, 26 | help="path to the input audio file", 27 | ) 28 | parser.add_argument( 29 | "-o", 30 | "--output", 31 | type=str, 32 | required=True, 33 | help="path to the output audio file", 34 | ) 35 | parser.add_argument( 36 | "-k", 37 | "--key", 38 | type=str, 39 | required=False, 40 | default=0, 41 | help="key changed (number of semitones)", 42 | ) 43 | return parser.parse_args(args=args, namespace=namespace) 44 | 45 | if __name__ == '__main__': 46 | 47 | # cpu inference is fast enough! 
48 | device = 'cpu' 49 | #device = 'cuda' if torch.cuda.is_available() else 'cpu' 50 | 51 | # parse commands 52 | cmd = parse_args() 53 | 54 | # load model 55 | model, args = load_model(cmd.model_path, device=device) 56 | 57 | sampling_rate = args.data.sampling_rate 58 | hop_length = args.data.block_size 59 | win_length = args.data.win_length 60 | n_fft = args.data.n_fft 61 | n_mel_channels = args.data.n_mels 62 | mel_fmin = args.data.mel_fmin 63 | mel_fmax = args.data.mel_fmax 64 | 65 | # load input 66 | x, _ = librosa.load(cmd.input, sr=sampling_rate) 67 | x_t = torch.from_numpy(x).float().to(device) 68 | x_t = x_t.unsqueeze(0).unsqueeze(0) # (T,) --> (1, 1, T) 69 | 70 | # mel analysis 71 | mel_extractor = Audio2Mel( 72 | hop_length=hop_length, 73 | sampling_rate=sampling_rate, 74 | n_mel_channels=n_mel_channels, 75 | win_length=win_length, 76 | n_fft=n_fft, 77 | mel_fmin=mel_fmin, 78 | mel_fmax=mel_fmax).to(device) 79 | 80 | mel = mel_extractor(x_t) 81 | 82 | # f0 analysis using dio 83 | ''' 84 | _f0, t = pw.dio( 85 | x.astype('double'), 86 | sampling_rate, 87 | f0_floor=65.0, 88 | f0_ceil=1047.0, 89 | channels_in_octave=2, 90 | frame_period=(1000*hop_length / sampling_rate)) 91 | f0 = pw.stonemask(x.astype('double'), _f0, t, sampling_rate) 92 | f0 = f0.astype('float') 93 | ''' 94 | 95 | # f0 analysis using parselmouth (faster) 96 | pitch_floor = 65 97 | l_pad = int(np.ceil(1.5 / pitch_floor * sampling_rate)) 98 | r_pad = hop_length * ((len(x) - 1) // hop_length + 1) - len(x) + l_pad + 1 99 | s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), sampling_rate).to_pitch_ac( 100 | time_step=hop_length / sampling_rate, voicing_threshold=0.6, 101 | pitch_floor=pitch_floor, pitch_ceiling=1100) 102 | assert np.abs(s.t1 - 1.5 / pitch_floor) < 0.001 103 | f0 = s.selected_array['frequency'] 104 | if len(f0) < mel.size(1): 105 | f0 = np.pad(f0, (0, mel.size(1) - len(f0))) 106 | f0 = f0[: mel.size(1)] 107 | 108 | # interpolate the unvoiced f0 109 | uv = f0 == 0 110 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 111 | f0 = torch.from_numpy(f0).float().to(device).unsqueeze(-1).unsqueeze(0) 112 | 113 | # key change 114 | key_change = float(cmd.key) 115 | if key_change != 0: 116 | output_f0 = f0 * 2 ** (key_change / 12) 117 | else: 118 | output_f0 = None 119 | 120 | # forward and save the output 121 | with torch.no_grad(): 122 | if output_f0 is None: 123 | signal, _, (s_h, s_n) = model(mel, f0) 124 | else: 125 | signal, _, (s_h, s_n) = model(mel, f0, output_f0) 126 | signal = signal.squeeze().cpu().numpy() 127 | sf.write(cmd.output, signal, args.data.sampling_rate) 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pitch Controllable DDSP Vocoders 2 | 3 | 4 | 5 | 6 | 7 | In order to achieve high-quality and stable singing voice synthesis, compared with the above repositories, this repository has applied many algorithm improvements, including but not limited to volume augmentation, random-scaled STFT loss, UV regularization and phase prediction. 8 | 9 | There are currently two models in the repository , "Sins" is a classic additive synthesis model based on sine wave excitation, and "CombSub" is a new subtractive synthesis model proposed by me, which is based on combtooth wave excitation. The "Sins" model changes the formant when a pitch shift is applied, while the "CombSub" model does not. 
In other words, the "CombSub" model does not change the timbre of the vocal. 10 | 11 | To use the DDSP vocoders in [DiffSinger (OpenVPI version)](https://github.com/openvpi/DiffSinger), see [DiffSinger.md](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md). 12 | 13 | UPDATE (2023.6.7): Both the 'CombSub' model and the 'Sins' model have been upgraded, and they now have better sound quality for copy synthesis (including use in an SVS system) and pitch shifting, so the old version is not compatible. 14 | 15 | UPDATE (2023.10.15): Improved the phase filter, so the old version is not compatible. 16 | 17 | UPDATE (2024.5.4): Improved the model and refactored the code, so the old version is not compatible. 18 | 19 | ## 1. Installing the dependencies 20 | 21 | We recommend first installing PyTorch from the [official website](https://pytorch.org/), then run: 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | UPDATE: Python 3.8 (Windows) + CUDA 11.8 + torch 2.0.0 + torchaudio 2.0.1 works, and training is faster. 28 | 29 | ## 2. Preprocessing 30 | 31 | Put all the training data (.wav format audio clips) in the directory `data/train/audio`, and all the validation data (.wav format audio clips) in the directory `data/val/audio`. Then run 32 | 33 | ```bash 34 | python preprocess.py -c configs/combsub.yaml 35 | ``` 36 | 37 | for a combtooth subtractive synthesiser model, or run 38 | 39 | ```bash 40 | python preprocess.py -c configs/sins.yaml 41 | ``` 42 | 43 | for a sinusoidal additive synthesiser model. 44 | 45 | You can modify the corresponding configuration file in `configs/` before preprocessing. The default configuration is suitable for training a 44.1 kHz high-sampling-rate vocoder on a GTX 1660 graphics card. 46 | 47 | NOTE 1: Please keep the sampling rate of all audio clips consistent with the sampling rate in the yaml configuration file! If they are not consistent, the program can still run safely, but resampling during training will be very slow. 48 | 49 | NOTE 2: About 1000 audio clips are recommended for the training dataset. Especially long clips can be cut into short segments, which will speed up training, but no clip should be shorter than 2 seconds. If there are too many audio clips, you will need a large amount of memory, or you can set the 'cache_all_data' option to false in the configuration file. 50 | 51 | NOTE 3: About 10 audio clips are recommended for the validation dataset. Please don't use too many, or validation will be very slow. 52 | 53 | ## 3. Training 54 | 55 | ```bash 56 | # train a combsub model as an example 57 | python train.py -c configs/combsub.yaml 58 | ``` 59 | 60 | The command line for training other models is similar. 61 | 62 | You can safely interrupt training; running the same command line again will resume it. 63 | 64 | You can also finetune the model: interrupt training, re-preprocess the new dataset or change the training parameters (batch size, learning rate, etc.), and then run the same command line. 65 | 66 | ## 4. Visualization 67 | 68 | ```bash 69 | # check the training status using tensorboard 70 | tensorboard --logdir=exp 71 | ``` 72 | 73 | ## 5. 
Copy-synthesising or pitch-shifting test 74 | 75 | ```bash 76 | # Copy-synthesising test 77 | # wav -> mel, f0 -> wav 78 | python main.py -i -m -o 79 | ``` 80 | 81 | ```bash 82 | # Pitch-shifting test 83 | # wav -> mel, f0 -> mel (unchaned), f0 (shifted) -> wav 84 | python main.py -i -m -o -k 85 | ``` 86 | 87 | ## 6. Some suggestions for the model choice 88 | 89 | It is recommended to try the "CombSub" model first, which generally has a low random-scaled STFT loss and relatively good quality when applying a pitch shift. 90 | 91 | However, this loss sometimes cannot reflect the subjective sense of hearing. 92 | 93 | If the "CombSub" model does not work well, it is recommended to switch to the "Sins" model. 94 | 95 | The "Sins" model works also well when applying copy synthesis, but it changes the formant when applying a pitch shift, which changes the timbre. 96 | 97 | ## 7. Comments on the sound quality 98 | 99 | The sound quality of a well-trained DDSP vocoder (seen speaker) will be better than that of the world vocoder or griffin-lim vocoder, and it can also compete with the generative model-based vocoders (such as HifiGAN) when the total amount of training data is relatively small. But for a large amount of training data, the upper limit of sound quality will be lower than that of generative model based vocoders. 100 | 101 | Compared with high quality live recordings, the main defect of the current DDSP vocoder is the metallic noise, which may be due to the distortion of phase prediction based on a non-generative model, and the STFT loss overemphasizes the periodic components in the signal, resulting in too many high frequency band harmonics. 102 | -------------------------------------------------------------------------------- /solver.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | 6 | from logger.saver import Saver 7 | from logger import utils 8 | 9 | def test(args, model, loss_func, loader_test, saver): 10 | print(' [*] testing...') 11 | model.eval() 12 | 13 | # intialization 14 | num_batches = len(loader_test) 15 | rtf_all = [] 16 | test_loss_dict = {} 17 | 18 | # run 19 | with torch.no_grad(): 20 | for bidx, data in enumerate(loader_test): 21 | fn = data['name'][0] 22 | print('--------') 23 | print('{}/{} - {}'.format(bidx, num_batches, fn)) 24 | 25 | # unpack data 26 | for k in data.keys(): 27 | if k != 'name': 28 | data[k] = data[k].to(args.device).float() 29 | print('>>', data['name'][0]) 30 | 31 | # forward 32 | st_time = time.time() 33 | signal, _, (s_h, s_n) = model(data['mel'], data['f0']) 34 | ed_time = time.time() 35 | 36 | # crop 37 | min_len = np.min([signal.shape[1], data['audio'].shape[1]]) 38 | signal = signal[:,:min_len] 39 | data['audio'] = data['audio'][:,:min_len] 40 | 41 | # RTF 42 | run_time = ed_time - st_time 43 | song_time = data['audio'].shape[-1] / args.data.sampling_rate 44 | rtf = run_time / song_time 45 | print('RTF: {} | {} / {}'.format(rtf, run_time, song_time)) 46 | rtf_all.append(rtf) 47 | 48 | # loss 49 | loss, loss_dict = loss_func(signal, s_h, data['audio'], data['uv'], prefix='validation/') 50 | 51 | if test_loss_dict == {}: 52 | for key, value in loss_dict.items(): 53 | test_loss_dict[key] = value / num_batches 54 | else: 55 | for key, value in loss_dict.items(): 56 | test_loss_dict[key] += value / num_batches 57 | 58 | # log 59 | saver.log_audio({fn+'/gt.wav': data['audio'], fn+'/pred.wav': signal}) 60 | 61 | # report 62 | print(' 
[test_loss] test_loss:', test_loss_dict['validation/loss']) 63 | print(' [test_loss] test_loss_rss:', test_loss_dict['validation/loss_rss']) 64 | print(' Real Time Factor', np.mean(rtf_all)) 65 | return test_loss_dict 66 | 67 | 68 | def train(args, initial_global_step, model, optimizer, loss_func, loader_train, loader_test): 69 | # saver 70 | saver = Saver(args, initial_global_step=initial_global_step) 71 | 72 | # model size 73 | params_count = utils.get_network_paras_amount({'model': model}) 74 | saver.log_info('--- model size ---') 75 | saver.log_info(params_count) 76 | 77 | # run 78 | best_loss = np.inf 79 | num_batches = len(loader_train) 80 | model.train() 81 | saver.log_info('======= start training =======') 82 | for epoch in range(args.train.epochs): 83 | for batch_idx, data in enumerate(loader_train): 84 | saver.global_step_increment() 85 | optimizer.zero_grad() 86 | 87 | # unpack data 88 | for k in data.keys(): 89 | if k != 'name': 90 | data[k] = data[k].to(args.device) 91 | 92 | # forward 93 | signal, _, (s_h, s_n) = model(data['mel'], data['f0'], infer=False) 94 | 95 | # loss 96 | detach_uv = False 97 | if saver.global_step < args.loss.detach_uv_step: 98 | detach_uv = True 99 | loss, loss_dict = loss_func( 100 | signal, 101 | s_h, 102 | data['audio'], 103 | data['uv'], 104 | detach_uv = detach_uv, 105 | uv_tolerance = args.loss.uv_tolerance, 106 | prefix = 'train/') 107 | 108 | # handle nan loss 109 | if torch.isnan(loss): 110 | raise ValueError(' [x] nan loss ') 111 | else: 112 | # backpropagate 113 | loss.backward() 114 | optimizer.step() 115 | 116 | # log loss 117 | if saver.global_step % args.train.interval_log == 0: 118 | saver.log_info( 119 | 'epoch: {} | {:3d}/{:3d} | {} | batch/s: {:.2f} | loss: {:.3f} | rss: {:.3f} | time: {} | step: {}'.format( 120 | epoch, 121 | batch_idx, 122 | num_batches, 123 | args.env.expdir, 124 | args.train.interval_log/saver.get_interval_time(), 125 | loss_dict['train/loss'], 126 | loss_dict['train/loss_rss'], 127 | saver.get_total_time(), 128 | saver.global_step 129 | ) 130 | ) 131 | saver.log_value(loss_dict) 132 | 133 | # validation 134 | if saver.global_step % args.train.interval_val == 0: 135 | optimizer_save = optimizer if args.train.save_opt else None 136 | 137 | # save latest 138 | saver.save_model(model, optimizer_save, postfix=f'{saver.global_step}') 139 | 140 | # run testing set 141 | test_loss_dict = test(args, model, loss_func, loader_test, saver) 142 | 143 | saver.log_info( 144 | ' --- --- \nloss: {:.3f} | rss: {:.3f}. '.format( 145 | test_loss_dict['validation/loss'], 146 | test_loss_dict['validation/loss_rss'] 147 | ) 148 | ) 149 | saver.log_value(test_loss_dict) 150 | model.train() 151 | 152 | 153 | -------------------------------------------------------------------------------- /ddsp/model_conformer_naive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | # From https://github.com/CNChTu/Diffusion-SVC/ by CNChTu 5 | # License: MIT 6 | 7 | 8 | class ConformerNaiveEncoder(nn.Module): 9 | """ 10 | Conformer Naive Encoder 11 | 12 | Args: 13 | dim_model (int): Dimension of model 14 | num_layers (int): Number of layers 15 | num_heads (int): Number of heads 16 | use_norm (bool): Whether to use norm for FastAttention, only True can use bf16/fp16, default False 17 | conv_only (bool): Whether to use only conv module without attention, default False 18 | conv_dropout (float): Dropout rate of conv module, default 0. 
19 | atten_dropout (float): Dropout rate of attention module, default 0. 20 | """ 21 | 22 | def __init__(self, 23 | num_layers: int, 24 | num_heads: int, 25 | dim_model: int, 26 | use_norm: bool = False, 27 | conv_only: bool = False, 28 | conv_dropout: float = 0., 29 | atten_dropout: float = 0. 30 | ): 31 | super().__init__() 32 | self.num_layers = num_layers 33 | self.num_heads = num_heads 34 | self.dim_model = dim_model 35 | self.use_norm = use_norm 36 | self.residual_dropout = 0.1 # deprecated; kept only for backward compatibility 37 | self.attention_dropout = 0.1 # deprecated; kept only for backward compatibility 38 | 39 | self.encoder_layers = nn.ModuleList( 40 | [ 41 | CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) 42 | for _ in range(num_layers) 43 | ] 44 | ) 45 | 46 | def forward(self, x, mask=None) -> torch.Tensor: 47 | """ 48 | Args: 49 | x (torch.Tensor): Input tensor (#batch, length, dim_model) 50 | mask (torch.Tensor): Mask tensor, default None 51 | return: 52 | torch.Tensor: Output tensor (#batch, length, dim_model) 53 | """ 54 | 55 | for (i, layer) in enumerate(self.encoder_layers): 56 | x = layer(x, mask) 57 | return x # (#batch, length, dim_model) 58 | 59 | 60 | class CFNEncoderLayer(nn.Module): 61 | """ 62 | Conformer Naive Encoder Layer 63 | 64 | Args: 65 | dim_model (int): Dimension of model 66 | num_heads (int): Number of heads 67 | use_norm (bool): Whether to use norm for FastAttention; only True can use bf16/fp16, default False 68 | conv_only (bool): Whether to use only conv module without attention, default False 69 | conv_dropout (float): Dropout rate of conv module, default 0. 70 | atten_dropout (float): Dropout rate of attention module, default 0.1 71 | """ 72 | 73 | def __init__(self, 74 | dim_model: int, 75 | num_heads: int = 8, 76 | use_norm: bool = False, 77 | conv_only: bool = False, 78 | conv_dropout: float = 0., 79 | atten_dropout: float = 0.1 80 | ): 81 | super().__init__() 82 | 83 | self.conformer = ConformerConvModule(dim_model, use_norm=use_norm, dropout=conv_dropout) 84 | 85 | self.norm = nn.LayerNorm(dim_model) 86 | 87 | self.dropout = nn.Dropout(0.1) # deprecated; kept only for backward compatibility 88 | 89 | # selfatt -> fastatt: performer! 
90 | if not conv_only: 91 | self.attn = nn.TransformerEncoderLayer( 92 | d_model=dim_model, 93 | nhead=num_heads, 94 | dim_feedforward=dim_model * 4, 95 | dropout=atten_dropout, 96 | activation='gelu' 97 | ) 98 | else: 99 | self.attn = None 100 | 101 | def forward(self, x, mask=None) -> torch.Tensor: 102 | """ 103 | Args: 104 | x (torch.Tensor): Input tensor (#batch, length, dim_model) 105 | mask (torch.Tensor): Mask tensor, default None 106 | return: 107 | torch.Tensor: Output tensor (#batch, length, dim_model) 108 | """ 109 | if self.attn is not None: 110 | x = x + (self.attn(self.norm(x), mask=mask)) 111 | 112 | x = x + (self.conformer(x)) 113 | 114 | return x # (#batch, length, dim_model) 115 | 116 | 117 | class ConformerConvModule(nn.Module): 118 | def __init__( 119 | self, 120 | dim, 121 | expansion_factor=2, 122 | kernel_size=31, 123 | dropout=0., 124 | use_norm=False, 125 | conv_model_type='mode1' 126 | ): 127 | super().__init__() 128 | 129 | inner_dim = dim * expansion_factor 130 | padding = calc_same_padding(kernel_size) 131 | 132 | if conv_model_type == 'mode1': 133 | self.net = nn.Sequential( 134 | nn.LayerNorm(dim) if use_norm else nn.Identity(), 135 | Transpose((1, 2)), 136 | nn.Conv1d(dim, inner_dim * 2, 1), 137 | nn.GLU(dim=1), 138 | nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim), 139 | nn.PReLU(num_parameters=inner_dim), 140 | nn.Conv1d(inner_dim, dim, 1), 141 | Transpose((1, 2)), 142 | nn.Dropout(dropout) 143 | ) 144 | elif conv_model_type == 'mode2': 145 | raise NotImplementedError('mode2 not implemented yet') 146 | else: 147 | raise ValueError(f'{conv_model_type} is not a valid conv_model_type') 148 | 149 | def forward(self, x): 150 | return self.net(x) 151 | 152 | 153 | def calc_same_padding(kernel_size): 154 | pad = kernel_size // 2 155 | return (pad, pad - (kernel_size + 1) % 2) 156 | 157 | 158 | class Transpose(nn.Module): 159 | def __init__(self, dims): 160 | super().__init__() 161 | assert len(dims) == 2, 'dims must be a tuple of two dimensions' 162 | self.dims = dims 163 | 164 | def forward(self, x): 165 | return x.transpose(*self.dims) 166 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import torch 5 | import pyworld as pw 6 | import parselmouth 7 | import argparse 8 | import shutil 9 | from logger import utils 10 | from tqdm import tqdm 11 | from ddsp.vocoder import Audio2Mel 12 | from librosa.filters import mel as librosa_mel_fn 13 | from logger.utils import traverse_dir 14 | import concurrent.futures 15 | 16 | def parse_args(args=None, namespace=None): 17 | """Parse command-line arguments.""" 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "-c", 21 | "--config", 22 | type=str, 23 | required=True, 24 | help="path to the config file") 25 | return parser.parse_args(args=args, namespace=namespace) 26 | 27 | def preprocess( 28 | path_srcdir, 29 | path_meldir, 30 | path_f0dir, 31 | path_uvdir, 32 | path_skipdir, 33 | device, 34 | f0_extractor, 35 | f0_min, 36 | f0_max, 37 | sampling_rate, 38 | hop_length, 39 | win_length, 40 | n_fft, 41 | n_mel_channels, 42 | mel_fmin, 43 | mel_fmax): 44 | 45 | # list files 46 | filelist = traverse_dir( 47 | path_srcdir, 48 | extension='wav', 49 | is_pure=True, 50 | is_sort=True, 51 | is_ext=True) 52 | 53 | # initilize extractor 54 | mel_extractor = Audio2Mel( 55 | 
hop_length=hop_length, 56 | sampling_rate=sampling_rate, 57 | n_mel_channels=n_mel_channels, 58 | win_length=win_length, 59 | n_fft=n_fft, 60 | mel_fmin=mel_fmin, 61 | mel_fmax=mel_fmax, 62 | clamp=1e-6).to(device) 63 | 64 | # run 65 | 66 | def process(file): 67 | ext = file.split('.')[-1] 68 | binfile = file[:-(len(ext)+1)]+'.npy' 69 | path_srcfile = os.path.join(path_srcdir, file) 70 | path_melfile = os.path.join(path_meldir, binfile) 71 | path_f0file = os.path.join(path_f0dir, binfile) 72 | path_uvfile = os.path.join(path_uvdir, binfile) 73 | 74 | # load audio 75 | x, _ = librosa.load(path_srcfile, sr=sampling_rate) 76 | x_t = torch.from_numpy(x).float().to(device) 77 | x_t = x_t.unsqueeze(0).unsqueeze(0) # (T,) --> (1, 1, T) 78 | 79 | # extract mel 80 | m_t = mel_extractor(x_t) 81 | mel = m_t.squeeze().to('cpu').numpy() 82 | 83 | # extract f0 using parselmouth 84 | if f0_extractor == 'parselmouth': 85 | l_pad = int(np.ceil(1.5 / f0_min * sampling_rate)) 86 | r_pad = hop_length * ((len(x) - 1) // hop_length + 1) - len(x) + l_pad + 1 87 | s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), sampling_rate).to_pitch_ac( 88 | time_step=hop_length / sampling_rate, voicing_threshold=0.6, 89 | pitch_floor=f0_min, pitch_ceiling=f0_max) 90 | assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 91 | f0 = s.selected_array['frequency'] 92 | if len(f0) < len(mel): 93 | f0 = np.pad(f0, (0, len(mel) - len(f0))) 94 | f0 = f0[: len(mel)] 95 | 96 | # extract f0 using dio 97 | elif f0_extractor == 'dio': 98 | _f0, t = pw.dio( 99 | x.astype('double'), 100 | sampling_rate, 101 | f0_floor=f0_min, 102 | f0_ceil=f0_max, 103 | channels_in_octave=2, 104 | frame_period=(1000*hop_length / sampling_rate)) 105 | f0 = pw.stonemask(x.astype('double'), _f0, t, sampling_rate) 106 | f0 = f0.astype('float')[:len(mel)] 107 | 108 | # extract f0 using harvest 109 | elif f0_extractor == 'harvest': 110 | f0, _ = pw.harvest( 111 | x.astype('double'), 112 | sampling_rate, 113 | f0_floor=f0_min, 114 | f0_ceil=f0_max, 115 | frame_period=(1000*hop_length / sampling_rate)) 116 | f0 = f0.astype('float')[:len(mel)] 117 | 118 | else: 119 | raise ValueError(f" [x] Unknown f0 extractor: {f0_extractor}") 120 | 121 | uv = f0 == 0 122 | if len(f0[~uv]) > 0: 123 | # interpolate the unvoiced f0 124 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 125 | uv = uv.astype('float') 126 | uv = np.min(np.array([uv[:-2],uv[1:-1],uv[2:]]),axis=0) 127 | uv = np.pad(uv, (1, 1), constant_values=(uv[0], uv[-1])) 128 | # save npy 129 | os.makedirs(os.path.dirname(path_melfile), exist_ok=True) 130 | np.save(path_melfile, mel) 131 | os.makedirs(os.path.dirname(path_f0file), exist_ok=True) 132 | np.save(path_f0file, f0) 133 | os.makedirs(os.path.dirname(path_uvfile), exist_ok=True) 134 | np.save(path_uvfile, uv) 135 | else: 136 | print('\n[Error] F0 extraction failed: ' + path_srcfile) 137 | os.makedirs(path_skipdir, exist_ok=True) 138 | shutil.move(path_srcfile, path_skipdir) 139 | print('This file has been moved to ' + os.path.join(path_skipdir, file)) 140 | print('Preprocess the audio clips in :', path_srcdir) 141 | 142 | # single process 143 | for file in tqdm(filelist, total=len(filelist)): 144 | process(file) 145 | 146 | # multi-process (have bugs) 147 | ''' 148 | with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: 149 | list(tqdm(executor.map(process, filelist), total=len(filelist))) 150 | ''' 151 | 152 | if __name__ == '__main__': 153 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 154 | 155 | # parse commands 156 | cmd 
= parse_args() 157 | 158 | # load config 159 | args = utils.load_config(cmd.config) 160 | f0_extractor = args.data.f0_extractor 161 | f0_min = args.data.f0_min 162 | f0_max = args.data.f0_max 163 | sampling_rate = args.data.sampling_rate 164 | hop_length = args.data.block_size 165 | win_length = args.data.win_length 166 | n_fft = args.data.n_fft 167 | n_mel_channels = args.data.n_mels 168 | mel_fmin = args.data.mel_fmin 169 | mel_fmax = args.data.mel_fmax 170 | train_path = args.data.train_path 171 | valid_path = args.data.valid_path 172 | 173 | # run 174 | for path in [train_path, valid_path]: 175 | path_srcdir = os.path.join(path, 'audio') 176 | path_meldir = os.path.join(path, 'mel') 177 | path_f0dir = os.path.join(path, 'f0') 178 | path_uvdir = os.path.join(path, 'uv') 179 | path_skipdir = os.path.join(path, 'skip') 180 | preprocess( 181 | path_srcdir, 182 | path_meldir, 183 | path_f0dir, 184 | path_uvdir, 185 | path_skipdir, 186 | device, 187 | f0_extractor, 188 | f0_min, 189 | f0_max, 190 | sampling_rate, 191 | hop_length, 192 | win_length, 193 | n_fft, 194 | n_mel_channels, 195 | mel_fmin, 196 | mel_fmax) 197 | 198 | -------------------------------------------------------------------------------- /ddsp/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | import math 6 | import numpy as np 7 | 8 | def get_fft_size(frame_size: int, ir_size: int, power_of_2: bool = True): 9 | """Calculate final size for efficient FFT. 10 | Args: 11 | frame_size: Size of the audio frame. 12 | ir_size: Size of the convolving impulse response. 13 | power_of_2: Constrain to be a power of 2. If False, allow other 5-smooth 14 | numbers. TPU requires power of 2, while GPU is more flexible. 15 | Returns: 16 | fft_size: Size for efficient FFT. 17 | """ 18 | convolved_frame_size = ir_size + frame_size - 1 19 | if power_of_2: 20 | # Next power of 2. 21 | fft_size = int(2**np.ceil(np.log2(convolved_frame_size))) 22 | else: 23 | fft_size = convolved_frame_size 24 | return fft_size 25 | 26 | 27 | def mean_filter(signal, kernel_size): 28 | signal = signal.permute(0, 2, 1) 29 | signal = F.pad(signal, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect") 30 | ones_kernel = torch.ones(signal.size(1), 1, kernel_size, device=signal.device) 31 | signal = F.conv1d(signal, ones_kernel, stride=1, padding=0, groups=signal.size(1)) 32 | signal = signal / kernel_size 33 | return signal.permute(0, 2, 1) 34 | 35 | 36 | def upsample(signal, factor): 37 | signal = signal.permute(0, 2, 1) 38 | signal = nn.functional.interpolate(torch.cat((signal,signal[:,:,-1:]),2), size=signal.shape[-1] * factor + 1, mode='linear', align_corners=True) 39 | signal = signal[:,:,:-1] 40 | return signal.permute(0, 2, 1) 41 | 42 | 43 | def crop_and_compensate_delay(audio, audio_size, ir_size, 44 | padding = 'same', 45 | delay_compensation = -1): 46 | """Crop audio output from convolution to compensate for group delay. 47 | Args: 48 | audio: Audio after convolution. Tensor of shape [batch, time_steps]. 49 | audio_size: Initial size of the audio before convolution. 50 | ir_size: Size of the convolving impulse response. 51 | padding: Either 'valid' or 'same'. For 'same' the final output to be the 52 | same size as the input audio (audio_timesteps). For 'valid' the audio is 53 | extended to include the tail of the impulse response (audio_timesteps + 54 | ir_timesteps - 1). 
55 | delay_compensation: Samples to crop from start of output audio to compensate 56 | for group delay of the impulse response. If delay_compensation < 0 it 57 | defaults to automatically calculating a constant group delay of the 58 | windowed linear phase filter from frequency_impulse_response(). 59 | Returns: 60 | Tensor of cropped and shifted audio. 61 | Raises: 62 | ValueError: If padding is not either 'valid' or 'same'. 63 | """ 64 | # Crop the output. 65 | if padding == 'valid': 66 | crop_size = ir_size + audio_size - 1 67 | elif padding == 'same': 68 | crop_size = audio_size 69 | else: 70 | raise ValueError('Padding must be \'valid\' or \'same\', instead ' 71 | 'of {}.'.format(padding)) 72 | 73 | # Compensate for the group delay of the filter by trimming the front. 74 | # For an impulse response produced by frequency_impulse_response(), 75 | # the group delay is constant because the filter is linear phase. 76 | total_size = audio.size(-1) 77 | crop = total_size - crop_size 78 | start = (ir_size // 2 if delay_compensation < 0 else delay_compensation) 79 | end = crop - start 80 | return audio[:, start:-end] 81 | 82 | 83 | def fft_convolve(audio, 84 | impulse_response): # B, n_frames, 2*(n_mags-1) 85 | """Filter audio with frames of time-varying impulse responses. 86 | Time-varying filter. Given audio [batch, n_samples], and a series of impulse 87 | responses [batch, n_frames, n_impulse_response], splits the audio into frames, 88 | applies filters, and then overlap-and-adds audio back together. 89 | Applies non-windowed non-overlapping STFT/ISTFT to efficiently compute 90 | convolution for large impulse response sizes. 91 | Args: 92 | audio: Input audio. Tensor of shape [batch, audio_timesteps]. 93 | impulse_response: Finite impulse response to convolve. Can either be a 2-D 94 | Tensor of shape [batch, ir_size], or a 3-D Tensor of shape [batch, 95 | ir_frames, ir_size]. A 2-D tensor will apply a single linear 96 | time-invariant filter to the audio. A 3-D Tensor will apply a linear 97 | time-varying filter. Automatically chops the audio into equally shaped 98 | blocks to match ir_frames. 99 | Returns: 100 | audio_out: Convolved audio. Tensor of shape 101 | [batch, audio_timesteps]. 102 | """ 103 | # Add a frame dimension to impulse response if it doesn't have one. 104 | ir_shape = impulse_response.size() 105 | if len(ir_shape) == 2: 106 | impulse_response = impulse_response.unsqueeze(1) 107 | ir_shape = impulse_response.size() 108 | 109 | # Get shapes of audio and impulse response. 110 | batch_size_ir, n_ir_frames, ir_size = ir_shape 111 | batch_size, audio_size = audio.size() # B, T 112 | 113 | # Cut audio into 50% overlapped frames (center padding). 114 | hop_size = audio_size // n_ir_frames 115 | frame_size = 2 * hop_size 116 | audio_frames = F.pad(audio, (hop_size, hop_size)).unfold(1, frame_size, hop_size) # B, n_frames+1, 2*hop_size 117 | 118 | # Apply Bartlett (triangular) window 119 | window = torch.bartlett_window(frame_size, device=audio_frames.device) 120 | audio_frames = audio_frames * window 121 | 122 | # Pad and FFT the audio and impulse responses. 123 | fft_size = get_fft_size(frame_size, ir_size, power_of_2=False) 124 | audio_fft = torch.fft.rfft(audio_frames, fft_size) 125 | ir_fft = torch.fft.rfft(torch.cat((impulse_response,impulse_response[:,-1:,:]),1), fft_size) 126 | 127 | # Multiply the FFTs (same as convolution in time). 128 | audio_ir_fft = torch.multiply(audio_fft, ir_fft) 129 | 130 | # Take the IFFT to resynthesize audio. 
131 | audio_frames_out = torch.fft.irfft(audio_ir_fft, fft_size) 132 | 133 | # Overlap Add 134 | batch_size, n_audio_frames, frame_size = audio_frames_out.size() # # B, n_frames+1, 2*(hop_size+n_mags-1)-1 135 | fold = torch.nn.Fold(output_size=(1, (n_audio_frames - 1) * hop_size + frame_size),kernel_size=(1, frame_size),stride=(1, hop_size)) 136 | output_signal = fold(audio_frames_out.transpose(1, 2)).squeeze(1).squeeze(1) 137 | 138 | # Crop and shift the output audio. 139 | output_signal = crop_and_compensate_delay(output_signal[:,hop_size:], audio_size, ir_size) 140 | return output_signal 141 | 142 | 143 | def frequency_impulse_response(magnitudes, 144 | hann_window = True, 145 | half_width_frames = None): 146 | 147 | # Get the IR 148 | impulse_response = torch.fft.irfft(magnitudes) # B, n_frames, 2*(n_mags-1) 149 | ir_size = impulse_response.size(-1) 150 | impulse_response = impulse_response.roll(int(ir_size // 2), -1) 151 | 152 | # Window and put in causal form. 153 | if hann_window: 154 | if half_width_frames is None: 155 | window = torch.hann_window(ir_size, device=impulse_response.device) 156 | else: 157 | window = torch.arange(-(ir_size // 2), (ir_size + 1) // 2, device=impulse_response.device) / half_width_frames 158 | window = torch.clamp(window, min=-1, max=1) 159 | window = (1 + torch.cos(np.pi * window)) / 2 # B, n_frames, 2*(n_mag -1) or 2*n_mag-1 160 | impulse_response *= window 161 | 162 | return impulse_response 163 | 164 | 165 | def frequency_filter(audio, 166 | magnitudes, 167 | hann_window=True, 168 | half_width_frames=None): 169 | 170 | impulse_response = frequency_impulse_response(magnitudes, hann_window, half_width_frames) 171 | 172 | return fft_convolve(audio, impulse_response) 173 | 174 | -------------------------------------------------------------------------------- /data_loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import librosa 5 | import torch 6 | import random 7 | from tqdm import tqdm 8 | from torch.utils.data import Dataset 9 | 10 | def traverse_dir( 11 | root_dir, 12 | extension, 13 | amount=None, 14 | str_include=None, 15 | str_exclude=None, 16 | is_pure=False, 17 | is_sort=False, 18 | is_ext=True): 19 | 20 | file_list = [] 21 | cnt = 0 22 | for root, _, files in os.walk(root_dir): 23 | for file in files: 24 | if file.endswith(extension): 25 | # path 26 | mix_path = os.path.join(root, file) 27 | pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path 28 | 29 | # amount 30 | if (amount is not None) and (cnt == amount): 31 | if is_sort: 32 | file_list.sort() 33 | return file_list 34 | 35 | # check string 36 | if (str_include is not None) and (str_include not in pure_path): 37 | continue 38 | if (str_exclude is not None) and (str_exclude in pure_path): 39 | continue 40 | 41 | if not is_ext: 42 | ext = pure_path.split('.')[-1] 43 | pure_path = pure_path[:-(len(ext)+1)] 44 | file_list.append(pure_path) 45 | cnt += 1 46 | if is_sort: 47 | file_list.sort() 48 | return file_list 49 | 50 | 51 | def get_data_loaders(args, whole_audio=False): 52 | data_train = AudioDataset( 53 | args.data.train_path, 54 | waveform_sec=args.data.duration, 55 | hop_size=args.data.block_size, 56 | sample_rate=args.data.sampling_rate, 57 | load_all_data=args.train.cache_all_data, 58 | whole_audio=whole_audio, 59 | volume_aug=True) 60 | loader_train = torch.utils.data.DataLoader( 61 | data_train , 62 | batch_size=args.train.batch_size if not whole_audio else 1, 63 | 
shuffle=True, 64 | num_workers=args.train.num_workers, 65 | persistent_workers=(args.train.num_workers > 0), 66 | pin_memory=True 67 | ) 68 | data_valid = AudioDataset( 69 | args.data.valid_path, 70 | waveform_sec=args.data.duration, 71 | hop_size=args.data.block_size, 72 | sample_rate=args.data.sampling_rate, 73 | load_all_data=args.train.cache_all_data, 74 | whole_audio=True, 75 | volume_aug=False) 76 | loader_valid = torch.utils.data.DataLoader( 77 | data_valid, 78 | batch_size=1, 79 | shuffle=False, 80 | num_workers=0, 81 | pin_memory=True 82 | ) 83 | return loader_train, loader_valid 84 | 85 | 86 | class AudioDataset(Dataset): 87 | def __init__( 88 | self, 89 | path_root, 90 | waveform_sec, 91 | hop_size, 92 | sample_rate, 93 | load_all_data=True, 94 | whole_audio=False, 95 | volume_aug=False 96 | ): 97 | super().__init__() 98 | 99 | self.waveform_sec = waveform_sec 100 | self.sample_rate = sample_rate 101 | self.hop_size = hop_size 102 | self.path_root = path_root 103 | self.paths = traverse_dir( 104 | os.path.join(path_root, 'audio'), 105 | extension='wav', 106 | is_pure=True, 107 | is_sort=True, 108 | is_ext=False 109 | ) 110 | self.whole_audio = whole_audio 111 | self.volume_aug = volume_aug 112 | self.data_buffer={} 113 | if load_all_data: 114 | print('Load all the data from :', path_root) 115 | else: 116 | print('Load the f0, uv data from :', path_root) 117 | for name in tqdm(self.paths, total=len(self.paths)): 118 | path_audio = os.path.join(self.path_root, 'audio', name) + '.wav' 119 | duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate) 120 | 121 | path_f0 = os.path.join(self.path_root, 'f0', name) + '.npy' 122 | f0 = np.load(path_f0) 123 | f0 = torch.from_numpy(f0).float().unsqueeze(-1) 124 | 125 | path_uv = os.path.join(self.path_root, 'uv', name) + '.npy' 126 | uv = np.load(path_uv) 127 | uv = torch.from_numpy(uv).float() 128 | 129 | if load_all_data: 130 | audio, sr = librosa.load(path_audio, sr=self.sample_rate) 131 | audio = torch.from_numpy(audio).float() 132 | 133 | path_mel = os.path.join(self.path_root, 'mel', name) + '.npy' 134 | audio_mel = np.load(path_mel) 135 | audio_mel = torch.from_numpy(audio_mel).float() 136 | 137 | self.data_buffer[name] = { 138 | 'duration': duration, 139 | 'audio': audio, 140 | 'audio_mel': audio_mel, 141 | 'f0': f0, 142 | 'uv': uv 143 | } 144 | else: 145 | self.data_buffer[name] = { 146 | 'duration': duration, 147 | 'f0': f0, 148 | 'uv': uv 149 | } 150 | 151 | 152 | def __getitem__(self, file_idx): 153 | name = self.paths[file_idx] 154 | data_buffer = self.data_buffer[name] 155 | # check duration. 
if too short, then skip 156 | if data_buffer['duration'] < (self.waveform_sec + 0.1): 157 | return self.__getitem__( (file_idx + 1) % len(self.paths)) 158 | 159 | # get item 160 | return self.get_data(name, data_buffer) 161 | 162 | def get_data(self, name, data_buffer): 163 | frame_resolution = self.hop_size / self.sample_rate 164 | duration = data_buffer['duration'] 165 | waveform_sec = duration if self.whole_audio else self.waveform_sec 166 | 167 | # load audio 168 | idx_from = 0 if self.whole_audio else random.uniform(0, duration - waveform_sec - 0.1) 169 | start_frame = int(idx_from / frame_resolution) 170 | mel_frame_len = int(waveform_sec / frame_resolution) 171 | audio = data_buffer.get('audio') 172 | if audio is None: 173 | path_audio = os.path.join(self.path_root, 'audio', name) + '.wav' 174 | audio, sr = librosa.load( 175 | path_audio, 176 | sr = self.sample_rate, 177 | offset = start_frame * frame_resolution, 178 | duration = waveform_sec) 179 | # clip audio into N seconds 180 | audio = audio[..., : audio.shape[-1] // self.hop_size * self.hop_size] 181 | audio = torch.from_numpy(audio).float() 182 | else: 183 | audio = audio[..., start_frame * self.hop_size : (start_frame + mel_frame_len) * self.hop_size].clone() 184 | 185 | # load mel 186 | audio_mel = data_buffer.get('audio_mel') 187 | if audio_mel is None: 188 | path_mel = os.path.join(self.path_root, 'mel', name) + '.npy' 189 | audio_mel = np.load(path_mel) 190 | audio_mel = audio_mel[start_frame : start_frame + mel_frame_len] 191 | audio_mel = torch.from_numpy(audio_mel).float() 192 | else: 193 | audio_mel = audio_mel[start_frame : start_frame + mel_frame_len].clone() 194 | 195 | # load f0 196 | f0 = data_buffer.get('f0') 197 | f0_frames = f0[start_frame : start_frame + mel_frame_len] 198 | 199 | # load uv 200 | uv = data_buffer.get('uv') 201 | uv_frames = uv[start_frame : start_frame + mel_frame_len] 202 | 203 | # volume augmentation 204 | if self.volume_aug: 205 | max_amp = float(torch.max(torch.abs(audio))) + 1e-5 206 | max_shift = min(1, np.log10(1/max_amp)) 207 | log10_mel_shift = random.uniform(-1, max_shift) 208 | audio *= (10 ** log10_mel_shift) 209 | audio_mel += log10_mel_shift 210 | audio_mel = torch.clamp(audio_mel, min=-5) 211 | 212 | return dict(audio=audio, f0=f0_frames, uv=uv_frames, mel=audio_mel, name=name) 213 | 214 | def __len__(self): 215 | return len(self.paths) 216 | -------------------------------------------------------------------------------- /ddsp/vocoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import yaml 4 | import torch 5 | import torch.nn.functional as F 6 | from librosa.filters import mel as librosa_mel_fn 7 | from .mel2control import Mel2Control 8 | from .core import frequency_filter, mean_filter, upsample 9 | 10 | class DotDict(dict): 11 | def __getattr__(*args): 12 | val = dict.get(*args) 13 | return DotDict(val) if type(val) is dict else val 14 | 15 | __setattr__ = dict.__setitem__ 16 | __delattr__ = dict.__delitem__ 17 | 18 | 19 | def load_model( 20 | model_path, 21 | device='cpu'): 22 | config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml') 23 | with open(config_file, "r") as config: 24 | args = yaml.safe_load(config) 25 | args = DotDict(args) 26 | 27 | # load model 28 | print(' [Loading] ' + model_path) 29 | if model_path.split('.')[-1] == 'jit': 30 | model = torch.jit.load(model_path, map_location=torch.device(device)) 31 | else: 32 | if args.model.type == 'Sins': 33 | model = Sins( 
34 | sampling_rate=args.data.sampling_rate, 35 | block_size=args.data.block_size, 36 | win_length=args.model.win_length, 37 | use_mean_filter=args.model.use_mean_filter, 38 | n_harmonics=args.model.n_harmonics, 39 | n_mag_noise=args.model.n_mag_noise, 40 | n_mels=args.data.n_mels) 41 | 42 | elif args.model.type == 'CombSub': 43 | model = CombSub( 44 | sampling_rate=args.data.sampling_rate, 45 | block_size=args.data.block_size, 46 | win_length=args.model.win_length, 47 | use_mean_filter=args.model.use_mean_filter, 48 | n_mag_harmonic=args.model.n_mag_harmonic, 49 | n_mag_noise=args.model.n_mag_noise, 50 | n_mels=args.data.n_mels) 51 | 52 | else: 53 | raise ValueError(f" [x] Unknown Model: {args.model.type}") 54 | model.to(device) 55 | ckpt = torch.load(model_path, map_location=torch.device(device)) 56 | model.load_state_dict(ckpt['model']) 57 | model.eval() 58 | return model, args 59 | 60 | 61 | class Audio2Mel(torch.nn.Module): 62 | def __init__( 63 | self, 64 | hop_length, 65 | sampling_rate, 66 | n_mel_channels, 67 | win_length, 68 | n_fft=None, 69 | mel_fmin=0, 70 | mel_fmax=None, 71 | clamp = 1e-5 72 | ): 73 | super().__init__() 74 | n_fft = win_length if n_fft is None else n_fft 75 | self.hann_window = {} 76 | mel_basis = librosa_mel_fn( 77 | sr=sampling_rate, 78 | n_fft=n_fft, 79 | n_mels=n_mel_channels, 80 | fmin=mel_fmin, 81 | fmax=mel_fmax) 82 | mel_basis = torch.from_numpy(mel_basis).float() 83 | self.register_buffer("mel_basis", mel_basis) 84 | self.n_fft = n_fft 85 | self.hop_length = hop_length 86 | self.win_length = win_length 87 | self.sampling_rate = sampling_rate 88 | self.n_mel_channels = n_mel_channels 89 | self.clamp = clamp 90 | 91 | def forward(self, audio, keyshift=0, speed=1): 92 | ''' 93 | audio: B x C x T 94 | log_mel_spec: B x T_ x C x n_mel 95 | ''' 96 | factor = 2 ** (keyshift / 12) 97 | n_fft_new = int(np.round(self.n_fft * factor)) 98 | win_length_new = int(np.round(self.win_length * factor)) 99 | hop_length_new = int(np.round(self.hop_length * speed)) 100 | 101 | keyshift_key = str(keyshift)+'_'+str(audio.device) 102 | if keyshift_key not in self.hann_window: 103 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 104 | 105 | B, C, T = audio.shape 106 | audio = audio.reshape(B * C, T) 107 | fft = torch.stft( 108 | audio, 109 | n_fft=n_fft_new, 110 | hop_length=hop_length_new, 111 | win_length=win_length_new, 112 | window=self.hann_window[keyshift_key], 113 | center=True, 114 | return_complex=True) 115 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 116 | 117 | if keyshift != 0: 118 | size = self.n_fft // 2 + 1 119 | resize = magnitude.size(1) 120 | if resize < size: 121 | magnitude = F.pad(magnitude, (0, 0, 0, size-resize)) 122 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 123 | 124 | mel_output = torch.matmul(self.mel_basis, magnitude) 125 | log_mel_spec = torch.log10(torch.clamp(mel_output, min=self.clamp)) 126 | 127 | # log_mel_spec: B x C, M, T 128 | T_ = log_mel_spec.shape[-1] 129 | log_mel_spec = log_mel_spec.reshape(B, C, self.n_mel_channels ,T_) 130 | log_mel_spec = log_mel_spec.permute(0, 3, 1, 2) 131 | 132 | # print('og_mel_spec:', log_mel_spec.shape) 133 | log_mel_spec = log_mel_spec.squeeze(2) # mono 134 | return log_mel_spec 135 | 136 | 137 | class Sins(torch.nn.Module): 138 | def __init__(self, 139 | sampling_rate, 140 | block_size, 141 | win_length, 142 | use_mean_filter, 143 | n_harmonics, 144 | n_mag_noise, 145 | n_mels=80): 146 | super().__init__() 147 | 148 | print(' 
[DDSP Model] Sinusoids Additive Synthesiser') 149 | 150 | # params 151 | self.register_buffer("sampling_rate", torch.tensor(sampling_rate)) 152 | self.register_buffer("block_size", torch.tensor(block_size)) 153 | self.register_buffer("win_length", torch.tensor(win_length)) 154 | self.register_buffer("window", torch.hann_window(win_length)) 155 | # Mel2Control 156 | split_map = { 157 | 'amplitudes': n_harmonics, 158 | 'harmonic_phase': win_length // 2 + 1, 159 | 'noise_magnitude': n_mag_noise, 160 | 'noise_phase': n_mag_noise, 161 | } 162 | self.mel2ctrl = Mel2Control(n_mels, block_size, split_map) 163 | # mean filter kernel size 164 | if use_mean_filter: 165 | self.mean_kernel_size = win_length // block_size 166 | else: 167 | self.mean_kernel_size = 1 168 | 169 | def fast_phase_gen(self, f0_frames): 170 | n = torch.arange(self.block_size, device=f0_frames.device) 171 | s0 = f0_frames / self.sampling_rate 172 | ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) 173 | rad = s0 * (n + 1) + 0.5 * ds0 * n * (n + 1) / self.block_size 174 | rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 175 | rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0_frames) 176 | rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) 177 | phase = 2 * np.pi * rad.reshape(f0_frames.shape[0], -1, 1) 178 | return phase 179 | 180 | def forward(self, 181 | mel_frames, 182 | f0_frames, 183 | output_f0_frames=None, 184 | infer=True, 185 | max_upsample_dim=32): 186 | ''' 187 | mel_frames: B x n_frames x n_mels 188 | f0_frames: B x n_frames x 1 189 | ''' 190 | # exciter phase 191 | phase = self.fast_phase_gen(f0_frames) 192 | 193 | # sinusoid exciter signal 194 | sinusoid = torch.sin(phase).squeeze(-1) 195 | sinusoid_frames = sinusoid.unfold(1, self.block_size, self.block_size) 196 | 197 | # noise exciter signal 198 | noise = torch.randn_like(sinusoid) 199 | noise_frames = noise.unfold(1, self.block_size, self.block_size) 200 | 201 | # parameter prediction 202 | ctrls = self.mel2ctrl(mel_frames, sinusoid_frames, noise_frames) 203 | if self.mean_kernel_size > 1: 204 | ctrls['amplitudes'] = mean_filter(ctrls['amplitudes'], self.mean_kernel_size) 205 | ctrls['harmonic_phase'] = mean_filter(ctrls['harmonic_phase'], self.mean_kernel_size) 206 | 207 | src_allpass = torch.exp(1.j * np.pi * ctrls['harmonic_phase']) 208 | src_allpass = torch.cat((src_allpass, src_allpass[:,-1:,:]), 1) 209 | amplitudes_frames = torch.exp(ctrls['amplitudes'])/ 128 210 | noise_param = torch.exp(ctrls['noise_magnitude'] + 1.j * np.pi * ctrls['noise_phase']) / 128 211 | 212 | # harmonic additive synthesis 213 | if infer and output_f0_frames is not None: 214 | f0_frames = output_f0_frames 215 | phase = self.fast_phase_gen(output_f0_frames) 216 | n_harmonic = amplitudes_frames.shape[-1] 217 | level_harmonic = torch.arange(1, n_harmonic + 1, device=phase.device) 218 | mask = (f0_frames * level_harmonic < self.sampling_rate / 2).float() + 1e-7 219 | amplitudes_frames *= mask 220 | sinusoids = 0. 
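# Note on the loop below: the additive synthesis of the harmonic part is chunked
# over the harmonic axis, handling at most max_upsample_dim harmonics per
# iteration so that a full [batch, n_samples, n_harmonics] tensor is never
# materialized at audio rate. For each chunk, the k-th harmonic phase is k times
# the fundamental phase, and the frame-rate amplitudes are linearly upsampled
# (core.upsample) to sample rate before the sinusoids are summed into the
# `sinusoids` accumulator initialized above.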
221 | for n in range(( n_harmonic - 1) // max_upsample_dim + 1): 222 | start = n * max_upsample_dim 223 | end = (n + 1) * max_upsample_dim 224 | phases = phase * level_harmonic[start:end] 225 | amplitudes = upsample(amplitudes_frames[:,:,start:end], self.block_size) 226 | sinusoids += (torch.sin(phases) * amplitudes).sum(-1) 227 | 228 | # harmonic part filter (all pass) 229 | harmonic_spec = torch.stft( 230 | sinusoids, 231 | n_fft = self.win_length, 232 | win_length = self.win_length, 233 | hop_length = self.block_size, 234 | window = self.window, 235 | center = True, 236 | return_complex = True) 237 | harmonic_spec = harmonic_spec * src_allpass.permute(0, 2, 1) 238 | harmonic = torch.istft( 239 | harmonic_spec, 240 | n_fft = self.win_length, 241 | win_length = self.win_length, 242 | hop_length = self.block_size, 243 | window = self.window, 244 | center = True) 245 | 246 | # noise part filter (using constant-windowed LTV-FIR) 247 | noise = frequency_filter( 248 | noise, 249 | noise_param) 250 | 251 | signal = harmonic + noise 252 | 253 | return signal, sinusoids, (harmonic, noise) 254 | 255 | 256 | class CombSub(torch.nn.Module): 257 | def __init__(self, 258 | sampling_rate, 259 | block_size, 260 | win_length, 261 | use_mean_filter, 262 | n_mag_harmonic, 263 | n_mag_noise, 264 | n_mels=80): 265 | super().__init__() 266 | 267 | print(' [DDSP Model] Combtooth Subtractive Synthesiser') 268 | # params 269 | self.register_buffer("sampling_rate", torch.tensor(sampling_rate)) 270 | self.register_buffer("block_size", torch.tensor(block_size)) 271 | self.register_buffer("win_length", torch.tensor(win_length)) 272 | self.register_buffer("window", torch.hann_window(win_length)) 273 | # Mel2Control 274 | split_map = { 275 | 'harmonic_magnitude': n_mag_harmonic, 276 | 'harmonic_phase': win_length // 2 + 1, 277 | 'noise_magnitude': n_mag_noise, 278 | 'noise_phase': n_mag_noise, 279 | } 280 | self.mel2ctrl = Mel2Control(n_mels, block_size, split_map) 281 | # mean filter kernel size 282 | if use_mean_filter: 283 | self.mean_kernel_size = win_length // block_size 284 | else: 285 | self.mean_kernel_size = 1 286 | 287 | def fast_source_gen(self, f0_frames): 288 | n = torch.arange(self.block_size, device=f0_frames.device) 289 | s0 = f0_frames / self.sampling_rate 290 | ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) 291 | rad = s0 * (n + 1) + 0.5 * ds0 * n * (n + 1) / self.block_size 292 | s0 = s0 + ds0 * n / self.block_size 293 | rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 294 | rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0_frames) 295 | rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) 296 | rad -= torch.round(rad) 297 | combtooth = torch.sinc(rad / (s0 + 1e-5)).reshape(f0_frames.shape[0], -1) 298 | return combtooth 299 | 300 | def forward(self, 301 | mel_frames, 302 | f0_frames, 303 | output_f0_frames=None, 304 | infer=True, 305 | **kwargs): 306 | ''' 307 | mel_frames: B x n_frames x n_mels 308 | f0_frames: B x n_frames x 1 309 | ''' 310 | 311 | # combtooth exciter signal 312 | combtooth = self.fast_source_gen(f0_frames) 313 | combtooth_frames = combtooth.unfold(1, self.block_size, self.block_size) 314 | 315 | # noise exciter signal 316 | noise = torch.randn_like(combtooth) 317 | noise_frames = noise.unfold(1, self.block_size, self.block_size) 318 | 319 | # parameter prediction 320 | ctrls = self.mel2ctrl(mel_frames, combtooth_frames, noise_frames) 321 | if self.mean_kernel_size > 1: 322 | ctrls['harmonic_magnitude'] = mean_filter(ctrls['harmonic_magnitude'], self.mean_kernel_size) 323 | 
ctrls['harmonic_phase'] = mean_filter(ctrls['harmonic_phase'], self.mean_kernel_size) 324 | 325 | src_allpass = torch.exp(1.j * np.pi * ctrls['harmonic_phase']) 326 | src_allpass = torch.cat((src_allpass, src_allpass[:,-1:,:]), 1) 327 | src_param = torch.exp(ctrls['harmonic_magnitude']) 328 | noise_param = torch.exp(ctrls['noise_magnitude'] + 1.j * np.pi * ctrls['noise_phase']) / 128 329 | 330 | # harmonic part filter (using dynamic-windowed LTV-FIR) 331 | if infer and output_f0_frames is not None: 332 | f0_frames = output_f0_frames 333 | combtooth = self.fast_source_gen(output_f0_frames) 334 | harmonic = frequency_filter( 335 | combtooth, 336 | torch.complex(src_param, torch.zeros_like(src_param)), 337 | hann_window = True, 338 | half_width_frames = 1.5 * self.sampling_rate / (f0_frames + 1e-3)) 339 | 340 | # harmonic part filter (all pass) 341 | harmonic_spec = torch.stft( 342 | harmonic, 343 | n_fft = self.win_length, 344 | win_length = self.win_length, 345 | hop_length = self.block_size, 346 | window = self.window, 347 | center = True, 348 | return_complex = True) 349 | harmonic_spec = harmonic_spec * src_allpass.permute(0, 2, 1) 350 | harmonic = torch.istft( 351 | harmonic_spec, 352 | n_fft = self.win_length, 353 | win_length = self.win_length, 354 | hop_length = self.block_size, 355 | window = self.window, 356 | center = True) 357 | 358 | # noise part filter (using constant-windowed LTV-FIR) 359 | noise = frequency_filter( 360 | noise, 361 | noise_param) 362 | 363 | signal = harmonic + noise 364 | 365 | return signal, combtooth, (harmonic, noise) --------------------------------------------------------------------------------
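A minimal end-to-end inference sketch assembled only from the modules above; it is not part of the repository. The checkpoint path, the input/output file names, and the constant 220 Hz f0 track are placeholders for illustration; a real f0 curve should come from the same extractor configured for preprocessing ('parselmouth', 'dio' or 'harvest') and be frame-aligned with the mel.

import librosa
import soundfile as sf
import torch

from ddsp.vocoder import load_model, Audio2Mel

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# hypothetical checkpoint path; use the file written by train.py under env.expdir
model, args = load_model('exp/combsub-test/model_best.pt', device=device)

# waveform -> log10 mel spectrogram, using the same front end as training
audio, _ = librosa.load('input.wav', sr=args.data.sampling_rate)
audio = torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(0).to(device)  # B x C x T
mel_extractor = Audio2Mel(
    hop_length=args.data.block_size,
    sampling_rate=args.data.sampling_rate,
    n_mel_channels=args.data.n_mels,
    win_length=args.data.win_length,
    n_fft=args.data.n_fft,
    mel_fmin=args.data.mel_fmin,
    mel_fmax=args.data.mel_fmax).to(device)
mel = mel_extractor(audio)  # B x n_frames x n_mels

# dummy frame-aligned f0 (B x n_frames x 1); replace with a real pitch track
f0 = torch.full((1, mel.shape[1], 1), 220.0, device=device)

with torch.no_grad():
    signal, exciter, (harmonic, noise) = model(mel, f0, infer=True)

sf.write('output.wav', signal.squeeze(0).cpu().numpy(), args.data.sampling_rate)

For pitch-shifted re-synthesis, both synthesisers also accept output_f0_frames so the exciter can be regenerated from a different pitch curve than the one that produced the mel, and Audio2Mel.forward exposes keyshift and speed arguments for transposed or time-scaled analysis.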