├── exp └── gitkeep ├── ddsp ├── __init__.py ├── mel2control.py ├── loss.py ├── model_conformer_naive.py ├── core.py └── vocoder.py ├── logger ├── __init__.py ├── saver.py └── utils.py ├── data ├── val │ └── audio │ │ └── gitkeep └── train │ └── audio │ └── gitkeep ├── requirements.txt ├── LICENSE ├── configs ├── sins.yaml └── combsub.yaml ├── DiffSinger.md ├── export.py ├── train.py ├── main.py ├── README.md ├── solver.py ├── preprocess.py └── data_loaders.py /exp/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ddsp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/val/audio/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/train/audio/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gin 2 | gin_config 3 | librosa 4 | numpy 5 | praat-parselmouth 6 | pyworld 7 | PyYAML 8 | SoundFile 9 | tqdm 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 yxlllc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /configs/sins.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | f0_extractor: 'parselmouth' # 'parselmouth' (singing) or 'dio' (speech) or 'harvest' (speech) 3 | f0_min: 65 # about C2 4 | f0_max: 800 # about G5 5 | sampling_rate: 44100 6 | n_fft: 2048 7 | win_length: 2048 8 | block_size: 512 # Equal to hop_length 9 | n_mels: 128 10 | mel_fmin: 40 11 | mel_fmax: 16000 # <= sampling_rate / 2 12 | duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip 13 | train_path: data/train # Create a folder named "audio" under this path and put the audio clip in it 14 | valid_path: data/val # Create a folder named "audio" under this path and put the audio clip in it 15 | model: 16 | type: 'Sins' 17 | win_length: 2048 18 | use_mean_filter: true 19 | n_harmonics: 128 20 | n_mag_noise: 256 21 | loss: 22 | fft_min: 256 23 | fft_max: 2048 24 | n_scale: 4 # rss kernel numbers 25 | lambda_uv: 1.0 # uv regularization 26 | uv_tolerance: 0.05 # set it to a large value or try other f0 extractors if val_loss_uv is much higher than train_loss_uv 27 | detach_uv_step: 2000 28 | device: cuda 29 | env: 30 | expdir: exp/sins-test 31 | gpu_id: 0 32 | train: 33 | num_workers: 2 # if your cpu and gpu are both very strong, set to 0 may be faster! 34 | batch_size: 24 35 | cache_all_data: true # Save Internal-Memory if it is false, but may be slow 36 | epochs: 100000 37 | interval_log: 10 38 | interval_val: 2000 39 | lr: 0.0005 40 | weight_decay: 0 41 | save_opt: false 42 | -------------------------------------------------------------------------------- /configs/combsub.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | f0_extractor: 'parselmouth' # 'parselmouth' (singing) or 'dio' (speech) or 'harvest' (speech) 3 | f0_min: 65 # about C2 4 | f0_max: 800 # about G5 5 | sampling_rate: 44100 6 | n_fft: 2048 7 | win_length: 2048 8 | block_size: 512 # Equal to hop_length 9 | n_mels: 128 10 | mel_fmin: 40 11 | mel_fmax: 16000 # <= sampling_rate / 2 12 | duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip 13 | train_path: data/train # Create a folder named "audio" under this path and put the audio clip in it 14 | valid_path: data/val # Create a folder named "audio" under this path and put the audio clip in it 15 | model: 16 | type: 'CombSub' 17 | win_length: 2048 18 | use_mean_filter: true 19 | n_mag_harmonic: 512 20 | n_mag_noise: 256 21 | loss: 22 | fft_min: 256 23 | fft_max: 2048 24 | n_scale: 4 # rss kernel numbers 25 | lambda_uv: 1.0 # uv regularization 26 | uv_tolerance: 0.05 # set it to a large value or try other f0 extractors if val_loss_uv is much higher than train_loss_uv 27 | detach_uv_step: 2000 28 | device: cuda 29 | env: 30 | expdir: exp/combsub-test 31 | gpu_id: 0 32 | train: 33 | num_workers: 2 # if your cpu and gpu are both very strong, set to 0 may be faster! 
34 | batch_size: 24 35 | cache_all_data: true # Save Internal-Memory if it is false, but may be slow 36 | epochs: 100000 37 | interval_log: 10 38 | interval_val: 2000 39 | lr: 0.0005 40 | weight_decay: 0 41 | save_opt: false -------------------------------------------------------------------------------- /DiffSinger.md: -------------------------------------------------------------------------------- 1 | # Use DDSP Vocoders in DiffSinger (OpenVPI version) 2 | Suppose you have already trained a model called `exp/combsub-test/model_100000.pt` using the code in this repository, run 3 | ```bash 4 | python export.py -m exp/combsub-test/model_100000.pt --traced 5 | ``` 6 | This will create a `.jit` format model file in the same directory. 7 | 8 | Then, move this `.jit` model file and the `config.yaml` together to the `checkpoints/ddsp` directory of the [**DiffSinger**](https://github.com/openvpi/DiffSinger) repository. 9 | 10 | Finally, edit the [**`configs/acoustic.yaml`**](https://github.com/openvpi/DiffSinger/blob/main/configs/acoustic.yaml) file in the [**DiffSinger**](https://github.com/openvpi/DiffSinger) repository to enable the DDSP vocoder. the details are: 11 | 1. Set the `vocoder` option to `DDSP`. 12 | 2. Set the `vocoder_ckpt` option to the path of the `.jit` model. An example may be `checkpoints/ddsp/model_100000-traced-torch1.9.1.jit` 13 | 3. Check whether other mel related parameters match the parameters in the `checkpoints/ddsp/config.yaml` file. For the details, the `audio_sample_rate`,`audio_num_mel_bins`,`hop_size`,`fft_size`,`win_size`,`fmin` and `fmax` in the [**`configs/acoustic.yaml`**](https://github.com/openvpi/DiffSinger/blob/main/configs/acoustic.yaml) need to match `sampling_rate`, `n_mels`, `block_size`, `n_fft`, `win_length`,`mel_fmin` and `mel_fmax` in the `checkpoints/ddsp/config.yaml`, respectively. 14 | 15 | After doing all this, [**DiffSinger**](https://github.com/openvpi/DiffSinger)'s default NSF-HiFiGAN vocoder has been replaced by your own trained DDSP vocoder, and you can perform preprocessing, training or inference normally. 
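A quick way to verify the parameter mapping above is to load both YAML files and compare the paired keys. Below is a minimal sketch (not part of this repository), assuming both files are readable from the DiffSinger working directory and that the mel-related keys of `configs/acoustic.yaml` sit at the top level; adjust the paths to your setup.

```python
# Sketch: check that DiffSinger's mel settings match the DDSP vocoder config.
import yaml

with open('checkpoints/ddsp/config.yaml') as f:
    ddsp = yaml.safe_load(f)['data']  # 'data' section of the DDSP training config
with open('configs/acoustic.yaml') as f:
    acoustic = yaml.safe_load(f)      # DiffSinger acoustic config (assumed flat)

# (DiffSinger key, DDSP key) pairs, following the mapping listed above
pairs = [
    ('audio_sample_rate', 'sampling_rate'),
    ('audio_num_mel_bins', 'n_mels'),
    ('hop_size', 'block_size'),
    ('fft_size', 'n_fft'),
    ('win_size', 'win_length'),
    ('fmin', 'mel_fmin'),
    ('fmax', 'mel_fmax'),
]
for ds_key, ddsp_key in pairs:
    if acoustic.get(ds_key) != ddsp.get(ddsp_key):
        print(f'Mismatch: {ds_key}={acoustic.get(ds_key)} vs {ddsp_key}={ddsp.get(ddsp_key)}')
```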
16 | -------------------------------------------------------------------------------- /ddsp/mel2control.py: -------------------------------------------------------------------------------- 1 | import gin 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.utils import weight_norm 7 | 8 | from .model_conformer_naive import ConformerNaiveEncoder 9 | 10 | 11 | def split_to_dict(tensor, tensor_splits): 12 | """Split a tensor into a dictionary of multiple tensors.""" 13 | labels = [] 14 | sizes = [] 15 | 16 | for k, v in tensor_splits.items(): 17 | labels.append(k) 18 | sizes.append(v) 19 | 20 | tensors = torch.split(tensor, sizes, dim=-1) 21 | return dict(zip(labels, tensors)) 22 | 23 | 24 | class Mel2Control(nn.Module): 25 | def __init__( 26 | self, 27 | n_mels, 28 | block_size, 29 | output_splits): 30 | super().__init__() 31 | self.output_splits = output_splits 32 | self.mel_emb = nn.Linear(n_mels, 256) 33 | self.stack = nn.Sequential( 34 | weight_norm(nn.Conv1d(2 * block_size, 512, 3, 1, 1)), 35 | nn.PReLU(num_parameters=512), 36 | weight_norm(nn.Conv1d(512, 256, 3, 1, 1))) 37 | self.decoder = ConformerNaiveEncoder( 38 | num_layers=3, 39 | num_heads=8, 40 | dim_model=256, 41 | use_norm=False, 42 | conv_only=True, 43 | conv_dropout=0, 44 | atten_dropout=0.1) 45 | self.norm = nn.LayerNorm(256) 46 | self.n_out = sum([v for k, v in output_splits.items()]) 47 | self.dense_out = weight_norm(nn.Linear(256, self.n_out)) 48 | 49 | def forward(self, mel, source, noise): 50 | 51 | ''' 52 | input: 53 | B x n_frames x n_mels 54 | return: 55 | dict of B x n_frames x feat 56 | ''' 57 | exciter = torch.cat((source, noise), dim=-1).transpose(1,2) 58 | x = self.mel_emb(mel) + self.stack(exciter).transpose(1,2) 59 | x = self.decoder(x) 60 | x = self.norm(x) 61 | e = self.dense_out(x) 62 | controls = split_to_dict(e, self.output_splits) 63 | 64 | return controls 65 | 66 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import torch 5 | 6 | from ddsp.vocoder import load_model 7 | 8 | 9 | class DDSPWrapper(torch.nn.Module): 10 | def __init__(self, module, device): 11 | super().__init__() 12 | self.model = module 13 | self.to(device) 14 | 15 | def forward(self, mel, f0): 16 | f0 = f0[..., None] 17 | signal, _, (s_h, s_n) = self.model(mel, f0) 18 | return signal, s_h, s_n 19 | 20 | 21 | def parse_args(args=None, namespace=None): 22 | parser = argparse.ArgumentParser( 23 | description='Export model to standalone PyTorch traced module or ONNX format' 24 | ) 25 | parser.add_argument( 26 | '-m', 27 | '--model_path', 28 | type=str, 29 | required=True, 30 | help='path to model file' 31 | ) 32 | parser.add_argument( 33 | '--traced', 34 | required=False, 35 | action='store_true', 36 | help='export to traced module format' 37 | ) 38 | parser.add_argument( 39 | '--onnx', 40 | required=False, 41 | action='store_true', 42 | help='export to ONNX format' 43 | ) 44 | cmd = parser.parse_args(args=args, namespace=namespace) 45 | if not cmd.traced and not cmd.onnx: 46 | parser.error('either --traced or --onnx should be specified.') 47 | return cmd 48 | 49 | 50 | def main(): 51 | device = 'cpu' 52 | # parse commands 53 | cmd = parse_args() 54 | 55 | # load model 56 | model, args = load_model(cmd.model_path, device=device) 57 | #model = DDSPWrapper(model, device) 58 | 59 | # extract model dirname and filename 60 | directory = 
os.path.dirname(os.path.abspath(cmd.model_path)) 61 | name = os.path.basename(cmd.model_path).rsplit('.', maxsplit=1)[0] 62 | 63 | # load input 64 | n_mel_channels = args.data.n_mels 65 | n_frames = 10 66 | mel = torch.randn((1, n_frames, n_mel_channels), dtype=torch.float32, device=device) 67 | f0 = torch.FloatTensor([[440.] * n_frames]).to(device) 68 | f0 = f0[..., None] 69 | 70 | # export model 71 | with torch.no_grad(): 72 | if cmd.traced: 73 | torch_version = torch.version.__version__.rsplit('+', maxsplit=1)[0] 74 | export_path = os.path.join(directory, f'{name}-traced-torch{torch_version}.jit') 75 | print(f' [Tracing] {cmd.model_path} => {export_path}') 76 | model = torch.jit.trace( 77 | model, 78 | ( 79 | mel, 80 | f0 81 | ), 82 | check_trace=False 83 | ) 84 | torch.jit.save(model, export_path) 85 | 86 | if cmd.onnx: 87 | raise NotImplementedError('Exporting to ONNX format is not supported yet.') 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | 5 | from logger import utils 6 | from data_loaders import get_data_loaders 7 | from solver import train 8 | from ddsp.vocoder import Sins, CombSub 9 | from ddsp.loss import HybridLoss 10 | 11 | 12 | def parse_args(args=None, namespace=None): 13 | """Parse command-line arguments.""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "-c", 17 | "--config", 18 | type=str, 19 | required=True, 20 | help="path to the config file") 21 | return parser.parse_args(args=args, namespace=namespace) 22 | 23 | 24 | if __name__ == '__main__': 25 | # parse commands 26 | cmd = parse_args() 27 | 28 | # load config 29 | args = utils.load_config(cmd.config) 30 | print(' > config:', cmd.config) 31 | print(' > exp:', args.env.expdir) 32 | 33 | # load model 34 | model = None 35 | 36 | if args.model.type == 'Sins': 37 | model = Sins( 38 | sampling_rate=args.data.sampling_rate, 39 | block_size=args.data.block_size, 40 | win_length=args.model.win_length, 41 | use_mean_filter=args.model.use_mean_filter, 42 | n_harmonics=args.model.n_harmonics, 43 | n_mag_noise=args.model.n_mag_noise, 44 | n_mels=args.data.n_mels) 45 | 46 | elif args.model.type == 'CombSub': 47 | model = CombSub( 48 | sampling_rate=args.data.sampling_rate, 49 | block_size=args.data.block_size, 50 | win_length=args.model.win_length, 51 | use_mean_filter=args.model.use_mean_filter, 52 | n_mag_harmonic=args.model.n_mag_harmonic, 53 | n_mag_noise=args.model.n_mag_noise, 54 | n_mels=args.data.n_mels) 55 | 56 | else: 57 | raise ValueError(f" [x] Unknown Model: {args.model.type}") 58 | 59 | # load parameters 60 | optimizer = torch.optim.AdamW(model.parameters()) 61 | initial_global_step, model, optimizer = utils.load_model(args.env.expdir, model, optimizer, device=args.device) 62 | for param_group in optimizer.param_groups: 63 | param_group['lr'] = args.train.lr 64 | param_group['weight_decay'] = args.train.weight_decay 65 | 66 | # loss 67 | loss_func = HybridLoss(args.data.block_size, args.loss.fft_min, args.loss.fft_max, args.loss.n_scale, args.loss.lambda_uv, args.device) 68 | 69 | # device 70 | if args.device == 'cuda': 71 | torch.cuda.set_device(args.env.gpu_id) 72 | model.to(args.device) 73 | 74 | for state in optimizer.state.values(): 75 | for k, v in state.items(): 76 | if torch.is_tensor(v): 77 | state[k] = v.to(args.device) 78 | 79 | 
loss_func.to(args.device) 80 | 81 | # datas 82 | loader_train, loader_valid = get_data_loaders(args, whole_audio=False) 83 | 84 | # run 85 | train(args, initial_global_step, model, optimizer, loss_func, loader_train, loader_valid) 86 | 87 | -------------------------------------------------------------------------------- /logger/saver.py: -------------------------------------------------------------------------------- 1 | ''' 2 | author: wayn391@mastertones 3 | ''' 4 | 5 | import os 6 | import json 7 | import time 8 | import yaml 9 | import datetime 10 | import torch 11 | 12 | from . import utils 13 | from torch.utils.tensorboard import SummaryWriter 14 | 15 | class Saver(object): 16 | def __init__( 17 | self, 18 | args, 19 | initial_global_step=-1): 20 | 21 | self.expdir = args.env.expdir 22 | self.sample_rate = args.data.sampling_rate 23 | 24 | # cold start 25 | self.global_step = initial_global_step 26 | self.init_time = time.time() 27 | self.last_time = time.time() 28 | 29 | # makedirs 30 | os.makedirs(self.expdir, exist_ok=True) 31 | 32 | # path 33 | self.path_log_info = os.path.join(self.expdir, 'log_info.txt') 34 | 35 | # ckpt 36 | os.makedirs(self.expdir, exist_ok=True) 37 | 38 | # writer 39 | self.writer = SummaryWriter(os.path.join(self.expdir, 'logs')) 40 | 41 | # save config 42 | path_config = os.path.join(self.expdir, 'config.yaml') 43 | with open(path_config, "w") as out_config: 44 | yaml.dump(dict(args), out_config) 45 | 46 | 47 | def log_info(self, msg): 48 | '''log method''' 49 | if isinstance(msg, dict): 50 | msg_list = [] 51 | for k, v in msg.items(): 52 | tmp_str = '' 53 | if isinstance(v, int): 54 | tmp_str = '{}: {:,}'.format(k, v) 55 | else: 56 | tmp_str = '{}: {}'.format(k, v) 57 | 58 | msg_list.append(tmp_str) 59 | msg_str = '\n'.join(msg_list) 60 | else: 61 | msg_str = msg 62 | 63 | # dsplay 64 | print(msg_str) 65 | 66 | # save 67 | with open(self.path_log_info, 'a') as fp: 68 | fp.write(msg_str+'\n') 69 | 70 | def log_value(self, dict): 71 | for k, v in dict.items(): 72 | self.writer.add_scalar(k, v, self.global_step) 73 | 74 | def log_audio(self, dict): 75 | for k, v in dict.items(): 76 | self.writer.add_audio(k, v, global_step=self.global_step, sample_rate=self.sample_rate) 77 | 78 | def get_interval_time(self, update=True): 79 | cur_time = time.time() 80 | time_interval = cur_time - self.last_time 81 | if update: 82 | self.last_time = cur_time 83 | return time_interval 84 | 85 | def get_total_time(self, to_str=True): 86 | total_time = time.time() - self.init_time 87 | if to_str: 88 | total_time = str(datetime.timedelta( 89 | seconds=total_time))[:-5] 90 | return total_time 91 | 92 | def save_model( 93 | self, 94 | model, 95 | optimizer, 96 | name='model', 97 | postfix='', 98 | to_json=False): 99 | # path 100 | if postfix: 101 | postfix = '_' + postfix 102 | path_pt = os.path.join( 103 | self.expdir , name+postfix+'.pt') 104 | 105 | # check 106 | print(' [*] model checkpoint saved: {}'.format(path_pt)) 107 | 108 | # save 109 | if optimizer is not None: 110 | torch.save({ 111 | 'global_step': self.global_step, 112 | 'model': model.state_dict(), 113 | 'optimizer': optimizer.state_dict()}, path_pt) 114 | else: 115 | torch.save({ 116 | 'global_step': self.global_step, 117 | 'model': model.state_dict()}, path_pt) 118 | 119 | # to json 120 | if to_json: 121 | path_json = os.path.join( 122 | self.expdir , name+'.json') 123 | utils.to_json(path_params, path_json) 124 | 125 | def global_step_increment(self): 126 | self.global_step += 1 127 | 128 | 129 | 
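The `Saver` above is normally driven by `solver.train()`. As a rough usage sketch (illustration only, with a hypothetical `model` and `optimizer` already built as in `train.py`), it writes `<expdir>/config.yaml`, a tensorboard log directory under `<expdir>/logs`, and checkpoints named `model_<step>.pt`:

```python
# Usage sketch for logger.saver.Saver (not part of saver.py itself).
from logger import utils
from logger.saver import Saver

args = utils.load_config('configs/combsub.yaml')
saver = Saver(args, initial_global_step=0)  # creates expdir, config.yaml, tensorboard writer
saver.log_info({'batch_size': args.train.batch_size, 'lr': args.train.lr})

# inside a training loop (model/optimizer assumed to exist):
# saver.global_step_increment()
# saver.log_value({'train/loss': 0.123})
# saver.save_model(model, optimizer, postfix=f'{saver.global_step}')
```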
-------------------------------------------------------------------------------- /logger/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | import pickle 5 | import torch 6 | 7 | def traverse_dir( 8 | root_dir, 9 | extension, 10 | amount=None, 11 | str_include=None, 12 | str_exclude=None, 13 | is_pure=False, 14 | is_sort=False, 15 | is_ext=True): 16 | 17 | file_list = [] 18 | cnt = 0 19 | for root, _, files in os.walk(root_dir): 20 | for file in files: 21 | if file.endswith(extension): 22 | # path 23 | mix_path = os.path.join(root, file) 24 | pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path 25 | 26 | # amount 27 | if (amount is not None) and (cnt == amount): 28 | if is_sort: 29 | file_list.sort() 30 | return file_list 31 | 32 | # check string 33 | if (str_include is not None) and (str_include not in pure_path): 34 | continue 35 | if (str_exclude is not None) and (str_exclude in pure_path): 36 | continue 37 | 38 | if not is_ext: 39 | ext = pure_path.split('.')[-1] 40 | pure_path = pure_path[:-(len(ext)+1)] 41 | file_list.append(pure_path) 42 | cnt += 1 43 | if is_sort: 44 | file_list.sort() 45 | return file_list 46 | 47 | 48 | 49 | class DotDict(dict): 50 | def __getattr__(*args): 51 | val = dict.get(*args) 52 | return DotDict(val) if type(val) is dict else val 53 | 54 | __setattr__ = dict.__setitem__ 55 | __delattr__ = dict.__delitem__ 56 | 57 | 58 | def get_network_paras_amount(model_dict): 59 | info = dict() 60 | for model_name, model in model_dict.items(): 61 | # all_params = sum(p.numel() for p in model.parameters()) 62 | trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 63 | 64 | info[model_name] = trainable_params 65 | return info 66 | 67 | 68 | def load_config(path_config): 69 | with open(path_config, "r") as config: 70 | args = yaml.safe_load(config) 71 | args = DotDict(args) 72 | # print(args) 73 | return args 74 | 75 | 76 | def to_json(path_params, path_json): 77 | params = torch.load(path_params, map_location=torch.device('cpu')) 78 | raw_state_dict = {} 79 | for k, v in params.items(): 80 | val = v.flatten().numpy().tolist() 81 | raw_state_dict[k] = val 82 | 83 | with open(path_json, 'w') as outfile: 84 | json.dump(raw_state_dict, outfile,indent= "\t") 85 | 86 | 87 | def convert_tensor_to_numpy(tensor, is_squeeze=True): 88 | if is_squeeze: 89 | tensor = tensor.squeeze() 90 | if tensor.requires_grad: 91 | tensor = tensor.detach() 92 | if tensor.is_cuda: 93 | tensor = tensor.cpu() 94 | return tensor.numpy() 95 | 96 | 97 | def load_model( 98 | expdir, 99 | model, 100 | optimizer, 101 | name='model', 102 | postfix='', 103 | device='cpu'): 104 | if postfix == '': 105 | postfix = '_' + postfix 106 | path = os.path.join(expdir, name+postfix) 107 | path_pt = traverse_dir(expdir, '.pt', is_ext=False) 108 | global_step = 0 109 | if len(path_pt) > 0: 110 | steps = [s[len(path):] for s in path_pt] 111 | maxstep = max([int(s) if s.isdigit() else 0 for s in steps]) 112 | if maxstep > 0: 113 | path_pt = path+str(maxstep)+'.pt' 114 | else: 115 | path_pt = path+'best.pt' 116 | print(' [*] restoring model from', path_pt) 117 | ckpt = torch.load(path_pt, map_location=torch.device(device)) 118 | global_step = ckpt['global_step'] 119 | model.load_state_dict(ckpt['model']) 120 | if ckpt.get('optimizer') != None: 121 | optimizer.load_state_dict(ckpt['optimizer']) 122 | return global_step, model, optimizer 123 | 
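For reference, the two helpers most scripts rely on are `load_config`, which wraps the YAML into a `DotDict` so nested keys are reachable as attributes, and `load_model`, which scans `expdir` for `model_<step>.pt` files and restores the one with the highest step (falling back to `model_best.pt`). A minimal sketch, assuming a checkpoint directory produced by training:

```python
# Sketch: attribute-style config access and checkpoint resumption (illustration only).
from logger import utils

args = utils.load_config('configs/combsub.yaml')
print(args.data.sampling_rate, args.train.batch_size)  # e.g. 44100 24

# model and optimizer must already be constructed, as in train.py:
# initial_global_step, model, optimizer = utils.load_model(
#     args.env.expdir, model, optimizer, device=args.device)
```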
-------------------------------------------------------------------------------- /ddsp/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchaudio 6 | from torch.nn import functional as F 7 | from .core import upsample 8 | 9 | class HybridLoss(nn.Module): 10 | def __init__(self, block_size, fft_min, fft_max, n_scale, lambda_uv, device): 11 | super().__init__() 12 | self.loss_rss_func = RSSLoss(fft_min, fft_max, n_scale, device = device) 13 | self.loss_uv_func = UVLoss(block_size) 14 | self.lambda_uv = lambda_uv 15 | 16 | def forward(self, signal, s_h, x_true, uv_true, detach_uv=False, uv_tolerance=0.05, prefix='train/'): 17 | loss_rss = self.loss_rss_func(signal, x_true) 18 | loss_uv = self.loss_uv_func(signal, s_h, uv_true) 19 | if detach_uv or loss_uv < uv_tolerance: 20 | loss_uv = loss_uv.detach() 21 | loss = loss_rss + self.lambda_uv * loss_uv 22 | loss_dict = {prefix+'loss': loss.item(), prefix+'loss_rss': loss_rss.item(), prefix+'loss_uv': loss_uv.item()} 23 | return loss, loss_dict 24 | 25 | class UVLoss(nn.Module): 26 | def __init__(self, block_size, eps = 1e-5): 27 | super().__init__() 28 | self.block_size = block_size 29 | self.eps = eps 30 | 31 | def forward(self, signal, s_h, uv_true): 32 | uv_mask = upsample(uv_true.unsqueeze(-1), self.block_size).squeeze(-1) 33 | loss = torch.mean(torch.linalg.norm(s_h * uv_mask, dim = 1) / (torch.linalg.norm(signal * uv_mask , dim = 1) + self.eps)) 34 | return loss 35 | 36 | class SSSLoss(nn.Module): 37 | """ 38 | Single-scale Spectral Loss. 39 | """ 40 | 41 | def __init__(self, n_fft=111, alpha=1.0, overlap=0, eps=1e-7): 42 | super().__init__() 43 | self.n_fft = n_fft 44 | self.alpha = alpha 45 | self.eps = eps 46 | self.hop_length = int(n_fft * (1 - overlap)) # 25% of the length 47 | self.spec = torchaudio.transforms.Spectrogram(n_fft=self.n_fft, hop_length=self.hop_length, power=1, normalized=True, center=False) 48 | 49 | def forward(self, x_true, x_pred): 50 | S_true = self.spec(x_true) + self.eps 51 | S_pred = self.spec(x_pred) + self.eps 52 | 53 | converge_term = torch.mean(torch.linalg.norm(S_true - S_pred, dim = (1, 2)) / torch.linalg.norm(S_true + S_pred, dim = (1, 2))) 54 | 55 | log_term = F.l1_loss(S_true.log(), S_pred.log()) 56 | 57 | loss = converge_term + self.alpha * log_term 58 | return loss 59 | 60 | 61 | class MSSLoss(nn.Module): 62 | """ 63 | Multi-scale Spectral Loss. 64 | Usage :: 65 | mssloss = MSSLoss([2048, 1024, 512, 256], alpha=1.0, overlap=0.75) 66 | mssloss(y_pred, y_gt) 67 | input(y_pred, y_gt) : two of torch.tensor w/ shape(batch, 1d-wave) 68 | output(loss) : torch.tensor(scalar) 69 | 70 | 48k: n_ffts=[2048, 1024, 512, 256] 71 | 24k: n_ffts=[1024, 512, 256, 128] 72 | """ 73 | 74 | def __init__(self, n_ffts, alpha=1.0, overlap=0.75, eps=1e-7): 75 | super().__init__() 76 | self.losses = nn.ModuleList([SSSLoss(n_fft, alpha, overlap, eps) for n_fft in n_ffts]) 77 | 78 | def forward(self, x_pred, x_true): 79 | x_pred = x_pred[..., :x_true.shape[-1]] 80 | value = 0. 81 | for loss in self.losses: 82 | value += loss(x_true, x_pred) 83 | return value 84 | 85 | class RSSLoss(nn.Module): 86 | ''' 87 | Random-scale Spectral Loss. 
88 | ''' 89 | 90 | def __init__(self, fft_min, fft_max, n_scale, alpha=1.0, overlap=0, eps=1e-7, device='cuda'): 91 | super().__init__() 92 | self.fft_min = fft_min 93 | self.fft_max = fft_max 94 | self.n_scale = n_scale 95 | self.lossdict = {} 96 | for n_fft in range(fft_min, fft_max): 97 | self.lossdict[n_fft] = SSSLoss(n_fft, alpha, overlap, eps).to(device) 98 | 99 | def forward(self, x_pred, x_true): 100 | value = 0. 101 | n_ffts = torch.randint(self.fft_min, self.fft_max, (self.n_scale,)) 102 | for n_fft in n_ffts: 103 | loss_func = self.lossdict[int(n_fft)] 104 | value += loss_func(x_true, x_pred) 105 | return value / self.n_scale 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import librosa 4 | import argparse 5 | import numpy as np 6 | import soundfile as sf 7 | import pyworld as pw 8 | import parselmouth 9 | from ddsp.vocoder import load_model, Audio2Mel 10 | 11 | def parse_args(args=None, namespace=None): 12 | """Parse command-line arguments.""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-m", 16 | "--model_path", 17 | type=str, 18 | required=True, 19 | help="path to the model file", 20 | ) 21 | parser.add_argument( 22 | "-i", 23 | "--input", 24 | type=str, 25 | required=True, 26 | help="path to the input audio file", 27 | ) 28 | parser.add_argument( 29 | "-o", 30 | "--output", 31 | type=str, 32 | required=True, 33 | help="path to the output audio file", 34 | ) 35 | parser.add_argument( 36 | "-k", 37 | "--key", 38 | type=str, 39 | required=False, 40 | default=0, 41 | help="key changed (number of semitones)", 42 | ) 43 | return parser.parse_args(args=args, namespace=namespace) 44 | 45 | if __name__ == '__main__': 46 | 47 | # cpu inference is fast enough! 
48 | device = 'cpu' 49 | #device = 'cuda' if torch.cuda.is_available() else 'cpu' 50 | 51 | # parse commands 52 | cmd = parse_args() 53 | 54 | # load model 55 | model, args = load_model(cmd.model_path, device=device) 56 | 57 | sampling_rate = args.data.sampling_rate 58 | hop_length = args.data.block_size 59 | win_length = args.data.win_length 60 | n_fft = args.data.n_fft 61 | n_mel_channels = args.data.n_mels 62 | mel_fmin = args.data.mel_fmin 63 | mel_fmax = args.data.mel_fmax 64 | 65 | # load input 66 | x, _ = librosa.load(cmd.input, sr=sampling_rate) 67 | x_t = torch.from_numpy(x).float().to(device) 68 | x_t = x_t.unsqueeze(0).unsqueeze(0) # (T,) --> (1, 1, T) 69 | 70 | # mel analysis 71 | mel_extractor = Audio2Mel( 72 | hop_length=hop_length, 73 | sampling_rate=sampling_rate, 74 | n_mel_channels=n_mel_channels, 75 | win_length=win_length, 76 | n_fft=n_fft, 77 | mel_fmin=mel_fmin, 78 | mel_fmax=mel_fmax).to(device) 79 | 80 | mel = mel_extractor(x_t) 81 | 82 | # f0 analysis using dio 83 | ''' 84 | _f0, t = pw.dio( 85 | x.astype('double'), 86 | sampling_rate, 87 | f0_floor=65.0, 88 | f0_ceil=1047.0, 89 | channels_in_octave=2, 90 | frame_period=(1000*hop_length / sampling_rate)) 91 | f0 = pw.stonemask(x.astype('double'), _f0, t, sampling_rate) 92 | f0 = f0.astype('float') 93 | ''' 94 | 95 | # f0 analysis using parselmouth (faster) 96 | pitch_floor = 65 97 | l_pad = int(np.ceil(1.5 / pitch_floor * sampling_rate)) 98 | r_pad = hop_length * ((len(x) - 1) // hop_length + 1) - len(x) + l_pad + 1 99 | s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), sampling_rate).to_pitch_ac( 100 | time_step=hop_length / sampling_rate, voicing_threshold=0.6, 101 | pitch_floor=pitch_floor, pitch_ceiling=1100) 102 | assert np.abs(s.t1 - 1.5 / pitch_floor) < 0.001 103 | f0 = s.selected_array['frequency'] 104 | if len(f0) < mel.size(1): 105 | f0 = np.pad(f0, (0, mel.size(1) - len(f0))) 106 | f0 = f0[: mel.size(1)] 107 | 108 | # interpolate the unvoiced f0 109 | uv = f0 == 0 110 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 111 | f0 = torch.from_numpy(f0).float().to(device).unsqueeze(-1).unsqueeze(0) 112 | 113 | # key change 114 | key_change = float(cmd.key) 115 | if key_change != 0: 116 | output_f0 = f0 * 2 ** (key_change / 12) 117 | else: 118 | output_f0 = None 119 | 120 | # forward and save the output 121 | with torch.no_grad(): 122 | if output_f0 is None: 123 | signal, _, (s_h, s_n) = model(mel, f0) 124 | else: 125 | signal, _, (s_h, s_n) = model(mel, f0, output_f0) 126 | signal = signal.squeeze().cpu().numpy() 127 | sf.write(cmd.output, signal, args.data.sampling_rate) 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pitch Controllable DDSP Vocoders 2 | 3 | 4 | 5 | 6 | 7 | In order to achieve high-quality and stable singing voice synthesis, compared with the above repositories, this repository has applied many algorithm improvements, including but not limited to volume augmentation, random-scaled STFT loss, UV regularization and phase prediction. 8 | 9 | There are currently two models in the repository , "Sins" is a classic additive synthesis model based on sine wave excitation, and "CombSub" is a new subtractive synthesis model proposed by me, which is based on combtooth wave excitation. The "Sins" model changes the formant when a pitch shift is applied, while the "CombSub" model does not. 
In other words, the "CombSub" model does not change the timbre of the vocal. 10 | 11 | To use the DDSP vocoders in [DiffSinger (OpenVPI version)](https://github.com/openvpi/DiffSinger), see [DiffSinger.md](https://github.com/yxlllc/pc-ddsp/blob/master/DiffSinger.md). 12 | 13 | UPDATE (2023.6.7): Both the 'CombSub' model and the 'Sins' model have been upgraded, and they now have better sound quality for copy synthesis (including use in an SVS system) and pitch shifting, so the old version is not compatible. 14 | 15 | UPDATE (2023.10.15): Improved the phase filter, so the old version is not compatible. 16 | 17 | UPDATE (2024.5.4): Improved the model and refactored the code, so the old version is not compatible. 18 | 19 | ## 1. Installing the dependencies 20 | 21 | We recommend first installing PyTorch from the [official website](https://pytorch.org/), then run: 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | UPDATE: Python 3.8 (Windows) + CUDA 11.8 + torch 2.0.0 + torchaudio 2.0.1 works, and training is faster. 28 | 29 | ## 2. Preprocessing 30 | 31 | Put all the training data (.wav format audio clips) in the directory `data/train/audio`, and all the validation data (.wav format audio clips) in the directory `data/val/audio`. Then run 32 | 33 | ```bash 34 | python preprocess.py -c configs/combsub.yaml 35 | ``` 36 | 37 | for a combtooth subtractive synthesiser model, or run 38 | 39 | ```bash 40 | python preprocess.py -c configs/sins.yaml 41 | ``` 42 | 43 | for a sinusoidal additive synthesiser model. 44 | 45 | You can modify the corresponding configuration file in `configs/` before preprocessing. The default configuration is suitable for training a 44.1 kHz high-sampling-rate vocoder on a GTX 1660 graphics card. 46 | 47 | NOTE 1: Please keep the sampling rate of all audio clips consistent with the sampling rate in the yaml configuration file! If they are not consistent, the program can still run safely, but resampling during training will be very slow. 48 | 49 | NOTE 2: About 1000 audio clips are recommended for the training dataset. Especially long clips can be cut into short segments, which will speed up training, but no clip should be shorter than 2 seconds. If there are too many audio clips, you will need a large amount of memory, or you can set the 'cache_all_data' option to false in the configuration file. 50 | 51 | NOTE 3: About 10 audio clips are recommended for the validation dataset. Please don't use too many, or validation will be very slow. 52 | 53 | ## 3. Training 54 | 55 | ```bash 56 | # train a combsub model as an example 57 | python train.py -c configs/combsub.yaml 58 | ``` 59 | 60 | The command line for training other models is similar. 61 | 62 | You can safely interrupt training; running the same command line again will resume it. 63 | 64 | You can also finetune the model: interrupt training, re-preprocess the new dataset or change the training parameters (batch size, learning rate, etc.), and then run the same command line. 65 | 66 | ## 4. Visualization 67 | 68 | ```bash 69 | # check the training status using tensorboard 70 | tensorboard --logdir=exp 71 | ``` 72 | 73 | ## 5. 
Copy-synthesising or pitch-shifting test 74 | 75 | ```bash 76 | # Copy-synthesising test 77 | # wav -> mel, f0 -> wav 78 | python main.py -i -m -o 79 | ``` 80 | 81 | ```bash 82 | # Pitch-shifting test 83 | # wav -> mel, f0 -> mel (unchaned), f0 (shifted) -> wav 84 | python main.py -i -m -o -k 85 | ``` 86 | 87 | ## 6. Some suggestions for the model choice 88 | 89 | It is recommended to try the "CombSub" model first, which generally has a low random-scaled STFT loss and relatively good quality when applying a pitch shift. 90 | 91 | However, this loss sometimes cannot reflect the subjective sense of hearing. 92 | 93 | If the "CombSub" model does not work well, it is recommended to switch to the "Sins" model. 94 | 95 | The "Sins" model works also well when applying copy synthesis, but it changes the formant when applying a pitch shift, which changes the timbre. 96 | 97 | ## 7. Comments on the sound quality 98 | 99 | The sound quality of a well-trained DDSP vocoder (seen speaker) will be better than that of the world vocoder or griffin-lim vocoder, and it can also compete with the generative model-based vocoders (such as HifiGAN) when the total amount of training data is relatively small. But for a large amount of training data, the upper limit of sound quality will be lower than that of generative model based vocoders. 100 | 101 | Compared with high quality live recordings, the main defect of the current DDSP vocoder is the metallic noise, which may be due to the distortion of phase prediction based on a non-generative model, and the STFT loss overemphasizes the periodic components in the signal, resulting in too many high frequency band harmonics. 102 | -------------------------------------------------------------------------------- /solver.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | 6 | from logger.saver import Saver 7 | from logger import utils 8 | 9 | def test(args, model, loss_func, loader_test, saver): 10 | print(' [*] testing...') 11 | model.eval() 12 | 13 | # intialization 14 | num_batches = len(loader_test) 15 | rtf_all = [] 16 | test_loss_dict = {} 17 | 18 | # run 19 | with torch.no_grad(): 20 | for bidx, data in enumerate(loader_test): 21 | fn = data['name'][0] 22 | print('--------') 23 | print('{}/{} - {}'.format(bidx, num_batches, fn)) 24 | 25 | # unpack data 26 | for k in data.keys(): 27 | if k != 'name': 28 | data[k] = data[k].to(args.device).float() 29 | print('>>', data['name'][0]) 30 | 31 | # forward 32 | st_time = time.time() 33 | signal, _, (s_h, s_n) = model(data['mel'], data['f0']) 34 | ed_time = time.time() 35 | 36 | # crop 37 | min_len = np.min([signal.shape[1], data['audio'].shape[1]]) 38 | signal = signal[:,:min_len] 39 | data['audio'] = data['audio'][:,:min_len] 40 | 41 | # RTF 42 | run_time = ed_time - st_time 43 | song_time = data['audio'].shape[-1] / args.data.sampling_rate 44 | rtf = run_time / song_time 45 | print('RTF: {} | {} / {}'.format(rtf, run_time, song_time)) 46 | rtf_all.append(rtf) 47 | 48 | # loss 49 | loss, loss_dict = loss_func(signal, s_h, data['audio'], data['uv'], prefix='validation/') 50 | 51 | if test_loss_dict == {}: 52 | for key, value in loss_dict.items(): 53 | test_loss_dict[key] = value / num_batches 54 | else: 55 | for key, value in loss_dict.items(): 56 | test_loss_dict[key] += value / num_batches 57 | 58 | # log 59 | saver.log_audio({fn+'/gt.wav': data['audio'], fn+'/pred.wav': signal}) 60 | 61 | # report 62 | print(' 
[test_loss] test_loss:', test_loss_dict['validation/loss']) 63 | print(' [test_loss] test_loss_rss:', test_loss_dict['validation/loss_rss']) 64 | print(' Real Time Factor', np.mean(rtf_all)) 65 | return test_loss_dict 66 | 67 | 68 | def train(args, initial_global_step, model, optimizer, loss_func, loader_train, loader_test): 69 | # saver 70 | saver = Saver(args, initial_global_step=initial_global_step) 71 | 72 | # model size 73 | params_count = utils.get_network_paras_amount({'model': model}) 74 | saver.log_info('--- model size ---') 75 | saver.log_info(params_count) 76 | 77 | # run 78 | best_loss = np.inf 79 | num_batches = len(loader_train) 80 | model.train() 81 | saver.log_info('======= start training =======') 82 | for epoch in range(args.train.epochs): 83 | for batch_idx, data in enumerate(loader_train): 84 | saver.global_step_increment() 85 | optimizer.zero_grad() 86 | 87 | # unpack data 88 | for k in data.keys(): 89 | if k != 'name': 90 | data[k] = data[k].to(args.device) 91 | 92 | # forward 93 | signal, _, (s_h, s_n) = model(data['mel'], data['f0'], infer=False) 94 | 95 | # loss 96 | detach_uv = False 97 | if saver.global_step < args.loss.detach_uv_step: 98 | detach_uv = True 99 | loss, loss_dict = loss_func( 100 | signal, 101 | s_h, 102 | data['audio'], 103 | data['uv'], 104 | detach_uv = detach_uv, 105 | uv_tolerance = args.loss.uv_tolerance, 106 | prefix = 'train/') 107 | 108 | # handle nan loss 109 | if torch.isnan(loss): 110 | raise ValueError(' [x] nan loss ') 111 | else: 112 | # backpropagate 113 | loss.backward() 114 | optimizer.step() 115 | 116 | # log loss 117 | if saver.global_step % args.train.interval_log == 0: 118 | saver.log_info( 119 | 'epoch: {} | {:3d}/{:3d} | {} | batch/s: {:.2f} | loss: {:.3f} | rss: {:.3f} | time: {} | step: {}'.format( 120 | epoch, 121 | batch_idx, 122 | num_batches, 123 | args.env.expdir, 124 | args.train.interval_log/saver.get_interval_time(), 125 | loss_dict['train/loss'], 126 | loss_dict['train/loss_rss'], 127 | saver.get_total_time(), 128 | saver.global_step 129 | ) 130 | ) 131 | saver.log_value(loss_dict) 132 | 133 | # validation 134 | if saver.global_step % args.train.interval_val == 0: 135 | optimizer_save = optimizer if args.train.save_opt else None 136 | 137 | # save latest 138 | saver.save_model(model, optimizer_save, postfix=f'{saver.global_step}') 139 | 140 | # run testing set 141 | test_loss_dict = test(args, model, loss_func, loader_test, saver) 142 | 143 | saver.log_info( 144 | ' --- --- \nloss: {:.3f} | rss: {:.3f}. '.format( 145 | test_loss_dict['validation/loss'], 146 | test_loss_dict['validation/loss_rss'] 147 | ) 148 | ) 149 | saver.log_value(test_loss_dict) 150 | model.train() 151 | 152 | 153 | -------------------------------------------------------------------------------- /ddsp/model_conformer_naive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | # From https://github.com/CNChTu/Diffusion-SVC/ by CNChTu 5 | # License: MIT 6 | 7 | 8 | class ConformerNaiveEncoder(nn.Module): 9 | """ 10 | Conformer Naive Encoder 11 | 12 | Args: 13 | dim_model (int): Dimension of model 14 | num_layers (int): Number of layers 15 | num_heads (int): Number of heads 16 | use_norm (bool): Whether to use norm for FastAttention, only True can use bf16/fp16, default False 17 | conv_only (bool): Whether to use only conv module without attention, default False 18 | conv_dropout (float): Dropout rate of conv module, default 0. 
19 | atten_dropout (float): Dropout rate of attention module, default 0. 20 | """ 21 | 22 | def __init__(self, 23 | num_layers: int, 24 | num_heads: int, 25 | dim_model: int, 26 | use_norm: bool = False, 27 | conv_only: bool = False, 28 | conv_dropout: float = 0., 29 | atten_dropout: float = 0. 30 | ): 31 | super().__init__() 32 | self.num_layers = num_layers 33 | self.num_heads = num_heads 34 | self.dim_model = dim_model 35 | self.use_norm = use_norm 36 | self.residual_dropout = 0.1 # deprecated; kept only for backward compatibility 37 | self.attention_dropout = 0.1 # deprecated; kept only for backward compatibility 38 | 39 | self.encoder_layers = nn.ModuleList( 40 | [ 41 | CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) 42 | for _ in range(num_layers) 43 | ] 44 | ) 45 | 46 | def forward(self, x, mask=None) -> torch.Tensor: 47 | """ 48 | Args: 49 | x (torch.Tensor): Input tensor (#batch, length, dim_model) 50 | mask (torch.Tensor): Mask tensor, default None 51 | return: 52 | torch.Tensor: Output tensor (#batch, length, dim_model) 53 | """ 54 | 55 | for (i, layer) in enumerate(self.encoder_layers): 56 | x = layer(x, mask) 57 | return x # (#batch, length, dim_model) 58 | 59 | 60 | class CFNEncoderLayer(nn.Module): 61 | """ 62 | Conformer Naive Encoder Layer 63 | 64 | Args: 65 | dim_model (int): Dimension of model 66 | num_heads (int): Number of heads 67 | use_norm (bool): Whether to use norm for FastAttention; only True can use bf16/fp16, default False 68 | conv_only (bool): Whether to use only conv module without attention, default False 69 | conv_dropout (float): Dropout rate of conv module, default 0. 70 | atten_dropout (float): Dropout rate of attention module, default 0.1 71 | """ 72 | 73 | def __init__(self, 74 | dim_model: int, 75 | num_heads: int = 8, 76 | use_norm: bool = False, 77 | conv_only: bool = False, 78 | conv_dropout: float = 0., 79 | atten_dropout: float = 0.1 80 | ): 81 | super().__init__() 82 | 83 | self.conformer = ConformerConvModule(dim_model, use_norm=use_norm, dropout=conv_dropout) 84 | 85 | self.norm = nn.LayerNorm(dim_model) 86 | 87 | self.dropout = nn.Dropout(0.1) # deprecated; kept only for backward compatibility 88 | 89 | # selfatt -> fastatt: performer! 
90 | if not conv_only: 91 | self.attn = nn.TransformerEncoderLayer( 92 | d_model=dim_model, 93 | nhead=num_heads, 94 | dim_feedforward=dim_model * 4, 95 | dropout=atten_dropout, 96 | activation='gelu' 97 | ) 98 | else: 99 | self.attn = None 100 | 101 | def forward(self, x, mask=None) -> torch.Tensor: 102 | """ 103 | Args: 104 | x (torch.Tensor): Input tensor (#batch, length, dim_model) 105 | mask (torch.Tensor): Mask tensor, default None 106 | return: 107 | torch.Tensor: Output tensor (#batch, length, dim_model) 108 | """ 109 | if self.attn is not None: 110 | x = x + (self.attn(self.norm(x), mask=mask)) 111 | 112 | x = x + (self.conformer(x)) 113 | 114 | return x # (#batch, length, dim_model) 115 | 116 | 117 | class ConformerConvModule(nn.Module): 118 | def __init__( 119 | self, 120 | dim, 121 | expansion_factor=2, 122 | kernel_size=31, 123 | dropout=0., 124 | use_norm=False, 125 | conv_model_type='mode1' 126 | ): 127 | super().__init__() 128 | 129 | inner_dim = dim * expansion_factor 130 | padding = calc_same_padding(kernel_size) 131 | 132 | if conv_model_type == 'mode1': 133 | self.net = nn.Sequential( 134 | nn.LayerNorm(dim) if use_norm else nn.Identity(), 135 | Transpose((1, 2)), 136 | nn.Conv1d(dim, inner_dim * 2, 1), 137 | nn.GLU(dim=1), 138 | nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding[0], groups=inner_dim), 139 | nn.PReLU(num_parameters=inner_dim), 140 | nn.Conv1d(inner_dim, dim, 1), 141 | Transpose((1, 2)), 142 | nn.Dropout(dropout) 143 | ) 144 | elif conv_model_type == 'mode2': 145 | raise NotImplementedError('mode2 not implemented yet') 146 | else: 147 | raise ValueError(f'{conv_model_type} is not a valid conv_model_type') 148 | 149 | def forward(self, x): 150 | return self.net(x) 151 | 152 | 153 | def calc_same_padding(kernel_size): 154 | pad = kernel_size // 2 155 | return (pad, pad - (kernel_size + 1) % 2) 156 | 157 | 158 | class Transpose(nn.Module): 159 | def __init__(self, dims): 160 | super().__init__() 161 | assert len(dims) == 2, 'dims must be a tuple of two dimensions' 162 | self.dims = dims 163 | 164 | def forward(self, x): 165 | return x.transpose(*self.dims) 166 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import torch 5 | import pyworld as pw 6 | import parselmouth 7 | import argparse 8 | import shutil 9 | from logger import utils 10 | from tqdm import tqdm 11 | from ddsp.vocoder import Audio2Mel 12 | from librosa.filters import mel as librosa_mel_fn 13 | from logger.utils import traverse_dir 14 | import concurrent.futures 15 | 16 | def parse_args(args=None, namespace=None): 17 | """Parse command-line arguments.""" 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "-c", 21 | "--config", 22 | type=str, 23 | required=True, 24 | help="path to the config file") 25 | return parser.parse_args(args=args, namespace=namespace) 26 | 27 | def preprocess( 28 | path_srcdir, 29 | path_meldir, 30 | path_f0dir, 31 | path_uvdir, 32 | path_skipdir, 33 | device, 34 | f0_extractor, 35 | f0_min, 36 | f0_max, 37 | sampling_rate, 38 | hop_length, 39 | win_length, 40 | n_fft, 41 | n_mel_channels, 42 | mel_fmin, 43 | mel_fmax): 44 | 45 | # list files 46 | filelist = traverse_dir( 47 | path_srcdir, 48 | extension='wav', 49 | is_pure=True, 50 | is_sort=True, 51 | is_ext=True) 52 | 53 | # initilize extractor 54 | mel_extractor = Audio2Mel( 55 | 
hop_length=hop_length, 56 | sampling_rate=sampling_rate, 57 | n_mel_channels=n_mel_channels, 58 | win_length=win_length, 59 | n_fft=n_fft, 60 | mel_fmin=mel_fmin, 61 | mel_fmax=mel_fmax, 62 | clamp=1e-6).to(device) 63 | 64 | # run 65 | 66 | def process(file): 67 | ext = file.split('.')[-1] 68 | binfile = file[:-(len(ext)+1)]+'.npy' 69 | path_srcfile = os.path.join(path_srcdir, file) 70 | path_melfile = os.path.join(path_meldir, binfile) 71 | path_f0file = os.path.join(path_f0dir, binfile) 72 | path_uvfile = os.path.join(path_uvdir, binfile) 73 | 74 | # load audio 75 | x, _ = librosa.load(path_srcfile, sr=sampling_rate) 76 | x_t = torch.from_numpy(x).float().to(device) 77 | x_t = x_t.unsqueeze(0).unsqueeze(0) # (T,) --> (1, 1, T) 78 | 79 | # extract mel 80 | m_t = mel_extractor(x_t) 81 | mel = m_t.squeeze().to('cpu').numpy() 82 | 83 | # extract f0 using parselmouth 84 | if f0_extractor == 'parselmouth': 85 | l_pad = int(np.ceil(1.5 / f0_min * sampling_rate)) 86 | r_pad = hop_length * ((len(x) - 1) // hop_length + 1) - len(x) + l_pad + 1 87 | s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), sampling_rate).to_pitch_ac( 88 | time_step=hop_length / sampling_rate, voicing_threshold=0.6, 89 | pitch_floor=f0_min, pitch_ceiling=f0_max) 90 | assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 91 | f0 = s.selected_array['frequency'] 92 | if len(f0) < len(mel): 93 | f0 = np.pad(f0, (0, len(mel) - len(f0))) 94 | f0 = f0[: len(mel)] 95 | 96 | # extract f0 using dio 97 | elif f0_extractor == 'dio': 98 | _f0, t = pw.dio( 99 | x.astype('double'), 100 | sampling_rate, 101 | f0_floor=f0_min, 102 | f0_ceil=f0_max, 103 | channels_in_octave=2, 104 | frame_period=(1000*hop_length / sampling_rate)) 105 | f0 = pw.stonemask(x.astype('double'), _f0, t, sampling_rate) 106 | f0 = f0.astype('float')[:len(mel)] 107 | 108 | # extract f0 using harvest 109 | elif f0_extractor == 'harvest': 110 | f0, _ = pw.harvest( 111 | x.astype('double'), 112 | sampling_rate, 113 | f0_floor=f0_min, 114 | f0_ceil=f0_max, 115 | frame_period=(1000*hop_length / sampling_rate)) 116 | f0 = f0.astype('float')[:len(mel)] 117 | 118 | else: 119 | raise ValueError(f" [x] Unknown f0 extractor: {f0_extractor}") 120 | 121 | uv = f0 == 0 122 | if len(f0[~uv]) > 0: 123 | # interpolate the unvoiced f0 124 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 125 | uv = uv.astype('float') 126 | uv = np.min(np.array([uv[:-2],uv[1:-1],uv[2:]]),axis=0) 127 | uv = np.pad(uv, (1, 1), constant_values=(uv[0], uv[-1])) 128 | # save npy 129 | os.makedirs(os.path.dirname(path_melfile), exist_ok=True) 130 | np.save(path_melfile, mel) 131 | os.makedirs(os.path.dirname(path_f0file), exist_ok=True) 132 | np.save(path_f0file, f0) 133 | os.makedirs(os.path.dirname(path_uvfile), exist_ok=True) 134 | np.save(path_uvfile, uv) 135 | else: 136 | print('\n[Error] F0 extraction failed: ' + path_srcfile) 137 | os.makedirs(path_skipdir, exist_ok=True) 138 | shutil.move(path_srcfile, path_skipdir) 139 | print('This file has been moved to ' + os.path.join(path_skipdir, file)) 140 | print('Preprocess the audio clips in :', path_srcdir) 141 | 142 | # single process 143 | for file in tqdm(filelist, total=len(filelist)): 144 | process(file) 145 | 146 | # multi-process (have bugs) 147 | ''' 148 | with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: 149 | list(tqdm(executor.map(process, filelist), total=len(filelist))) 150 | ''' 151 | 152 | if __name__ == '__main__': 153 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 154 | 155 | # parse commands 156 | cmd 
= parse_args() 157 | 158 | # load config 159 | args = utils.load_config(cmd.config) 160 | f0_extractor = args.data.f0_extractor 161 | f0_min = args.data.f0_min 162 | f0_max = args.data.f0_max 163 | sampling_rate = args.data.sampling_rate 164 | hop_length = args.data.block_size 165 | win_length = args.data.win_length 166 | n_fft = args.data.n_fft 167 | n_mel_channels = args.data.n_mels 168 | mel_fmin = args.data.mel_fmin 169 | mel_fmax = args.data.mel_fmax 170 | train_path = args.data.train_path 171 | valid_path = args.data.valid_path 172 | 173 | # run 174 | for path in [train_path, valid_path]: 175 | path_srcdir = os.path.join(path, 'audio') 176 | path_meldir = os.path.join(path, 'mel') 177 | path_f0dir = os.path.join(path, 'f0') 178 | path_uvdir = os.path.join(path, 'uv') 179 | path_skipdir = os.path.join(path, 'skip') 180 | preprocess( 181 | path_srcdir, 182 | path_meldir, 183 | path_f0dir, 184 | path_uvdir, 185 | path_skipdir, 186 | device, 187 | f0_extractor, 188 | f0_min, 189 | f0_max, 190 | sampling_rate, 191 | hop_length, 192 | win_length, 193 | n_fft, 194 | n_mel_channels, 195 | mel_fmin, 196 | mel_fmax) 197 | 198 | -------------------------------------------------------------------------------- /ddsp/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | import math 6 | import numpy as np 7 | 8 | def get_fft_size(frame_size: int, ir_size: int, power_of_2: bool = True): 9 | """Calculate final size for efficient FFT. 10 | Args: 11 | frame_size: Size of the audio frame. 12 | ir_size: Size of the convolving impulse response. 13 | power_of_2: Constrain to be a power of 2. If False, allow other 5-smooth 14 | numbers. TPU requires power of 2, while GPU is more flexible. 15 | Returns: 16 | fft_size: Size for efficient FFT. 17 | """ 18 | convolved_frame_size = ir_size + frame_size - 1 19 | if power_of_2: 20 | # Next power of 2. 21 | fft_size = int(2**np.ceil(np.log2(convolved_frame_size))) 22 | else: 23 | fft_size = convolved_frame_size 24 | return fft_size 25 | 26 | 27 | def mean_filter(signal, kernel_size): 28 | signal = signal.permute(0, 2, 1) 29 | signal = F.pad(signal, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect") 30 | ones_kernel = torch.ones(signal.size(1), 1, kernel_size, device=signal.device) 31 | signal = F.conv1d(signal, ones_kernel, stride=1, padding=0, groups=signal.size(1)) 32 | signal = signal / kernel_size 33 | return signal.permute(0, 2, 1) 34 | 35 | 36 | def upsample(signal, factor): 37 | signal = signal.permute(0, 2, 1) 38 | signal = nn.functional.interpolate(torch.cat((signal,signal[:,:,-1:]),2), size=signal.shape[-1] * factor + 1, mode='linear', align_corners=True) 39 | signal = signal[:,:,:-1] 40 | return signal.permute(0, 2, 1) 41 | 42 | 43 | def crop_and_compensate_delay(audio, audio_size, ir_size, 44 | padding = 'same', 45 | delay_compensation = -1): 46 | """Crop audio output from convolution to compensate for group delay. 47 | Args: 48 | audio: Audio after convolution. Tensor of shape [batch, time_steps]. 49 | audio_size: Initial size of the audio before convolution. 50 | ir_size: Size of the convolving impulse response. 51 | padding: Either 'valid' or 'same'. For 'same' the final output to be the 52 | same size as the input audio (audio_timesteps). For 'valid' the audio is 53 | extended to include the tail of the impulse response (audio_timesteps + 54 | ir_timesteps - 1). 
55 | delay_compensation: Samples to crop from start of output audio to compensate 56 | for group delay of the impulse response. If delay_compensation < 0 it 57 | defaults to automatically calculating a constant group delay of the 58 | windowed linear phase filter from frequency_impulse_response(). 59 | Returns: 60 | Tensor of cropped and shifted audio. 61 | Raises: 62 | ValueError: If padding is not either 'valid' or 'same'. 63 | """ 64 | # Crop the output. 65 | if padding == 'valid': 66 | crop_size = ir_size + audio_size - 1 67 | elif padding == 'same': 68 | crop_size = audio_size 69 | else: 70 | raise ValueError('Padding must be \'valid\' or \'same\', instead ' 71 | 'of {}.'.format(padding)) 72 | 73 | # Compensate for the group delay of the filter by trimming the front. 74 | # For an impulse response produced by frequency_impulse_response(), 75 | # the group delay is constant because the filter is linear phase. 76 | total_size = audio.size(-1) 77 | crop = total_size - crop_size 78 | start = (ir_size // 2 if delay_compensation < 0 else delay_compensation) 79 | end = crop - start 80 | return audio[:, start:-end] 81 | 82 | 83 | def fft_convolve(audio, 84 | impulse_response): # B, n_frames, 2*(n_mags-1) 85 | """Filter audio with frames of time-varying impulse responses. 86 | Time-varying filter. Given audio [batch, n_samples], and a series of impulse 87 | responses [batch, n_frames, n_impulse_response], splits the audio into frames, 88 | applies filters, and then overlap-and-adds audio back together. 89 | Applies non-windowed non-overlapping STFT/ISTFT to efficiently compute 90 | convolution for large impulse response sizes. 91 | Args: 92 | audio: Input audio. Tensor of shape [batch, audio_timesteps]. 93 | impulse_response: Finite impulse response to convolve. Can either be a 2-D 94 | Tensor of shape [batch, ir_size], or a 3-D Tensor of shape [batch, 95 | ir_frames, ir_size]. A 2-D tensor will apply a single linear 96 | time-invariant filter to the audio. A 3-D Tensor will apply a linear 97 | time-varying filter. Automatically chops the audio into equally shaped 98 | blocks to match ir_frames. 99 | Returns: 100 | audio_out: Convolved audio. Tensor of shape 101 | [batch, audio_timesteps]. 102 | """ 103 | # Add a frame dimension to impulse response if it doesn't have one. 104 | ir_shape = impulse_response.size() 105 | if len(ir_shape) == 2: 106 | impulse_response = impulse_response.unsqueeze(1) 107 | ir_shape = impulse_response.size() 108 | 109 | # Get shapes of audio and impulse response. 110 | batch_size_ir, n_ir_frames, ir_size = ir_shape 111 | batch_size, audio_size = audio.size() # B, T 112 | 113 | # Cut audio into 50% overlapped frames (center padding). 114 | hop_size = audio_size // n_ir_frames 115 | frame_size = 2 * hop_size 116 | audio_frames = F.pad(audio, (hop_size, hop_size)).unfold(1, frame_size, hop_size) # B, n_frames+1, 2*hop_size 117 | 118 | # Apply Bartlett (triangular) window 119 | window = torch.bartlett_window(frame_size, device=audio_frames.device) 120 | audio_frames = audio_frames * window 121 | 122 | # Pad and FFT the audio and impulse responses. 123 | fft_size = get_fft_size(frame_size, ir_size, power_of_2=False) 124 | audio_fft = torch.fft.rfft(audio_frames, fft_size) 125 | ir_fft = torch.fft.rfft(torch.cat((impulse_response,impulse_response[:,-1:,:]),1), fft_size) 126 | 127 | # Multiply the FFTs (same as convolution in time). 128 | audio_ir_fft = torch.multiply(audio_fft, ir_fft) 129 | 130 | # Take the IFFT to resynthesize audio. 
131 | audio_frames_out = torch.fft.irfft(audio_ir_fft, fft_size) 132 | 133 | # Overlap Add 134 | batch_size, n_audio_frames, frame_size = audio_frames_out.size() # # B, n_frames+1, 2*(hop_size+n_mags-1)-1 135 | fold = torch.nn.Fold(output_size=(1, (n_audio_frames - 1) * hop_size + frame_size),kernel_size=(1, frame_size),stride=(1, hop_size)) 136 | output_signal = fold(audio_frames_out.transpose(1, 2)).squeeze(1).squeeze(1) 137 | 138 | # Crop and shift the output audio. 139 | output_signal = crop_and_compensate_delay(output_signal[:,hop_size:], audio_size, ir_size) 140 | return output_signal 141 | 142 | 143 | def frequency_impulse_response(magnitudes, 144 | hann_window = True, 145 | half_width_frames = None): 146 | 147 | # Get the IR 148 | impulse_response = torch.fft.irfft(magnitudes) # B, n_frames, 2*(n_mags-1) 149 | ir_size = impulse_response.size(-1) 150 | impulse_response = impulse_response.roll(int(ir_size // 2), -1) 151 | 152 | # Window and put in causal form. 153 | if hann_window: 154 | if half_width_frames is None: 155 | window = torch.hann_window(ir_size, device=impulse_response.device) 156 | else: 157 | window = torch.arange(-(ir_size // 2), (ir_size + 1) // 2, device=impulse_response.device) / half_width_frames 158 | window = torch.clamp(window, min=-1, max=1) 159 | window = (1 + torch.cos(np.pi * window)) / 2 # B, n_frames, 2*(n_mag -1) or 2*n_mag-1 160 | impulse_response *= window 161 | 162 | return impulse_response 163 | 164 | 165 | def frequency_filter(audio, 166 | magnitudes, 167 | hann_window=True, 168 | half_width_frames=None): 169 | 170 | impulse_response = frequency_impulse_response(magnitudes, hann_window, half_width_frames) 171 | 172 | return fft_convolve(audio, impulse_response) 173 | 174 | -------------------------------------------------------------------------------- /data_loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import librosa 5 | import torch 6 | import random 7 | from tqdm import tqdm 8 | from torch.utils.data import Dataset 9 | 10 | def traverse_dir( 11 | root_dir, 12 | extension, 13 | amount=None, 14 | str_include=None, 15 | str_exclude=None, 16 | is_pure=False, 17 | is_sort=False, 18 | is_ext=True): 19 | 20 | file_list = [] 21 | cnt = 0 22 | for root, _, files in os.walk(root_dir): 23 | for file in files: 24 | if file.endswith(extension): 25 | # path 26 | mix_path = os.path.join(root, file) 27 | pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path 28 | 29 | # amount 30 | if (amount is not None) and (cnt == amount): 31 | if is_sort: 32 | file_list.sort() 33 | return file_list 34 | 35 | # check string 36 | if (str_include is not None) and (str_include not in pure_path): 37 | continue 38 | if (str_exclude is not None) and (str_exclude in pure_path): 39 | continue 40 | 41 | if not is_ext: 42 | ext = pure_path.split('.')[-1] 43 | pure_path = pure_path[:-(len(ext)+1)] 44 | file_list.append(pure_path) 45 | cnt += 1 46 | if is_sort: 47 | file_list.sort() 48 | return file_list 49 | 50 | 51 | def get_data_loaders(args, whole_audio=False): 52 | data_train = AudioDataset( 53 | args.data.train_path, 54 | waveform_sec=args.data.duration, 55 | hop_size=args.data.block_size, 56 | sample_rate=args.data.sampling_rate, 57 | load_all_data=args.train.cache_all_data, 58 | whole_audio=whole_audio, 59 | volume_aug=True) 60 | loader_train = torch.utils.data.DataLoader( 61 | data_train , 62 | batch_size=args.train.batch_size if not whole_audio else 1, 63 | 
shuffle=True, 64 | num_workers=args.train.num_workers, 65 | persistent_workers=(args.train.num_workers > 0), 66 | pin_memory=True 67 | ) 68 | data_valid = AudioDataset( 69 | args.data.valid_path, 70 | waveform_sec=args.data.duration, 71 | hop_size=args.data.block_size, 72 | sample_rate=args.data.sampling_rate, 73 | load_all_data=args.train.cache_all_data, 74 | whole_audio=True, 75 | volume_aug=False) 76 | loader_valid = torch.utils.data.DataLoader( 77 | data_valid, 78 | batch_size=1, 79 | shuffle=False, 80 | num_workers=0, 81 | pin_memory=True 82 | ) 83 | return loader_train, loader_valid 84 | 85 | 86 | class AudioDataset(Dataset): 87 | def __init__( 88 | self, 89 | path_root, 90 | waveform_sec, 91 | hop_size, 92 | sample_rate, 93 | load_all_data=True, 94 | whole_audio=False, 95 | volume_aug=False 96 | ): 97 | super().__init__() 98 | 99 | self.waveform_sec = waveform_sec 100 | self.sample_rate = sample_rate 101 | self.hop_size = hop_size 102 | self.path_root = path_root 103 | self.paths = traverse_dir( 104 | os.path.join(path_root, 'audio'), 105 | extension='wav', 106 | is_pure=True, 107 | is_sort=True, 108 | is_ext=False 109 | ) 110 | self.whole_audio = whole_audio 111 | self.volume_aug = volume_aug 112 | self.data_buffer={} 113 | if load_all_data: 114 | print('Load all the data from :', path_root) 115 | else: 116 | print('Load the f0, uv data from :', path_root) 117 | for name in tqdm(self.paths, total=len(self.paths)): 118 | path_audio = os.path.join(self.path_root, 'audio', name) + '.wav' 119 | duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate) 120 | 121 | path_f0 = os.path.join(self.path_root, 'f0', name) + '.npy' 122 | f0 = np.load(path_f0) 123 | f0 = torch.from_numpy(f0).float().unsqueeze(-1) 124 | 125 | path_uv = os.path.join(self.path_root, 'uv', name) + '.npy' 126 | uv = np.load(path_uv) 127 | uv = torch.from_numpy(uv).float() 128 | 129 | if load_all_data: 130 | audio, sr = librosa.load(path_audio, sr=self.sample_rate) 131 | audio = torch.from_numpy(audio).float() 132 | 133 | path_mel = os.path.join(self.path_root, 'mel', name) + '.npy' 134 | audio_mel = np.load(path_mel) 135 | audio_mel = torch.from_numpy(audio_mel).float() 136 | 137 | self.data_buffer[name] = { 138 | 'duration': duration, 139 | 'audio': audio, 140 | 'audio_mel': audio_mel, 141 | 'f0': f0, 142 | 'uv': uv 143 | } 144 | else: 145 | self.data_buffer[name] = { 146 | 'duration': duration, 147 | 'f0': f0, 148 | 'uv': uv 149 | } 150 | 151 | 152 | def __getitem__(self, file_idx): 153 | name = self.paths[file_idx] 154 | data_buffer = self.data_buffer[name] 155 | # check duration. 
if too short, then skip 156 | if data_buffer['duration'] < (self.waveform_sec + 0.1): 157 | return self.__getitem__( (file_idx + 1) % len(self.paths)) 158 | 159 | # get item 160 | return self.get_data(name, data_buffer) 161 | 162 | def get_data(self, name, data_buffer): 163 | frame_resolution = self.hop_size / self.sample_rate 164 | duration = data_buffer['duration'] 165 | waveform_sec = duration if self.whole_audio else self.waveform_sec 166 | 167 | # load audio 168 | idx_from = 0 if self.whole_audio else random.uniform(0, duration - waveform_sec - 0.1) 169 | start_frame = int(idx_from / frame_resolution) 170 | mel_frame_len = int(waveform_sec / frame_resolution) 171 | audio = data_buffer.get('audio') 172 | if audio is None: 173 | path_audio = os.path.join(self.path_root, 'audio', name) + '.wav' 174 | audio, sr = librosa.load( 175 | path_audio, 176 | sr = self.sample_rate, 177 | offset = start_frame * frame_resolution, 178 | duration = waveform_sec) 179 | # clip audio into N seconds 180 | audio = audio[..., : audio.shape[-1] // self.hop_size * self.hop_size] 181 | audio = torch.from_numpy(audio).float() 182 | else: 183 | audio = audio[..., start_frame * self.hop_size : (start_frame + mel_frame_len) * self.hop_size].clone() 184 | 185 | # load mel 186 | audio_mel = data_buffer.get('audio_mel') 187 | if audio_mel is None: 188 | path_mel = os.path.join(self.path_root, 'mel', name) + '.npy' 189 | audio_mel = np.load(path_mel) 190 | audio_mel = audio_mel[start_frame : start_frame + mel_frame_len] 191 | audio_mel = torch.from_numpy(audio_mel).float() 192 | else: 193 | audio_mel = audio_mel[start_frame : start_frame + mel_frame_len].clone() 194 | 195 | # load f0 196 | f0 = data_buffer.get('f0') 197 | f0_frames = f0[start_frame : start_frame + mel_frame_len] 198 | 199 | # load uv 200 | uv = data_buffer.get('uv') 201 | uv_frames = uv[start_frame : start_frame + mel_frame_len] 202 | 203 | # volume augmentation 204 | if self.volume_aug: 205 | max_amp = float(torch.max(torch.abs(audio))) + 1e-5 206 | max_shift = min(1, np.log10(1/max_amp)) 207 | log10_mel_shift = random.uniform(-1, max_shift) 208 | audio *= (10 ** log10_mel_shift) 209 | audio_mel += log10_mel_shift 210 | audio_mel = torch.clamp(audio_mel, min=-5) 211 | 212 | return dict(audio=audio, f0=f0_frames, uv=uv_frames, mel=audio_mel, name=name) 213 | 214 | def __len__(self): 215 | return len(self.paths) 216 | -------------------------------------------------------------------------------- /ddsp/vocoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import yaml 4 | import torch 5 | import torch.nn.functional as F 6 | from librosa.filters import mel as librosa_mel_fn 7 | from .mel2control import Mel2Control 8 | from .core import frequency_filter, mean_filter, upsample 9 | 10 | class DotDict(dict): 11 | def __getattr__(*args): 12 | val = dict.get(*args) 13 | return DotDict(val) if type(val) is dict else val 14 | 15 | __setattr__ = dict.__setitem__ 16 | __delattr__ = dict.__delitem__ 17 | 18 | 19 | def load_model( 20 | model_path, 21 | device='cpu'): 22 | config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml') 23 | with open(config_file, "r") as config: 24 | args = yaml.safe_load(config) 25 | args = DotDict(args) 26 | 27 | # load model 28 | print(' [Loading] ' + model_path) 29 | if model_path.split('.')[-1] == 'jit': 30 | model = torch.jit.load(model_path, map_location=torch.device(device)) 31 | else: 32 | if args.model.type == 'Sins': 33 | model = Sins( 
34 | sampling_rate=args.data.sampling_rate, 35 | block_size=args.data.block_size, 36 | win_length=args.model.win_length, 37 | use_mean_filter=args.model.use_mean_filter, 38 | n_harmonics=args.model.n_harmonics, 39 | n_mag_noise=args.model.n_mag_noise, 40 | n_mels=args.data.n_mels) 41 | 42 | elif args.model.type == 'CombSub': 43 | model = CombSub( 44 | sampling_rate=args.data.sampling_rate, 45 | block_size=args.data.block_size, 46 | win_length=args.model.win_length, 47 | use_mean_filter=args.model.use_mean_filter, 48 | n_mag_harmonic=args.model.n_mag_harmonic, 49 | n_mag_noise=args.model.n_mag_noise, 50 | n_mels=args.data.n_mels) 51 | 52 | else: 53 | raise ValueError(f" [x] Unknown Model: {args.model.type}") 54 | model.to(device) 55 | ckpt = torch.load(model_path, map_location=torch.device(device)) 56 | model.load_state_dict(ckpt['model']) 57 | model.eval() 58 | return model, args 59 | 60 | 61 | class Audio2Mel(torch.nn.Module): 62 | def __init__( 63 | self, 64 | hop_length, 65 | sampling_rate, 66 | n_mel_channels, 67 | win_length, 68 | n_fft=None, 69 | mel_fmin=0, 70 | mel_fmax=None, 71 | clamp = 1e-5 72 | ): 73 | super().__init__() 74 | n_fft = win_length if n_fft is None else n_fft 75 | self.hann_window = {} 76 | mel_basis = librosa_mel_fn( 77 | sr=sampling_rate, 78 | n_fft=n_fft, 79 | n_mels=n_mel_channels, 80 | fmin=mel_fmin, 81 | fmax=mel_fmax) 82 | mel_basis = torch.from_numpy(mel_basis).float() 83 | self.register_buffer("mel_basis", mel_basis) 84 | self.n_fft = n_fft 85 | self.hop_length = hop_length 86 | self.win_length = win_length 87 | self.sampling_rate = sampling_rate 88 | self.n_mel_channels = n_mel_channels 89 | self.clamp = clamp 90 | 91 | def forward(self, audio, keyshift=0, speed=1): 92 | ''' 93 | audio: B x C x T 94 | log_mel_spec: B x T_ x C x n_mel 95 | ''' 96 | factor = 2 ** (keyshift / 12) 97 | n_fft_new = int(np.round(self.n_fft * factor)) 98 | win_length_new = int(np.round(self.win_length * factor)) 99 | hop_length_new = int(np.round(self.hop_length * speed)) 100 | 101 | keyshift_key = str(keyshift)+'_'+str(audio.device) 102 | if keyshift_key not in self.hann_window: 103 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 104 | 105 | B, C, T = audio.shape 106 | audio = audio.reshape(B * C, T) 107 | fft = torch.stft( 108 | audio, 109 | n_fft=n_fft_new, 110 | hop_length=hop_length_new, 111 | win_length=win_length_new, 112 | window=self.hann_window[keyshift_key], 113 | center=True, 114 | return_complex=True) 115 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 116 | 117 | if keyshift != 0: 118 | size = self.n_fft // 2 + 1 119 | resize = magnitude.size(1) 120 | if resize < size: 121 | magnitude = F.pad(magnitude, (0, 0, 0, size-resize)) 122 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 123 | 124 | mel_output = torch.matmul(self.mel_basis, magnitude) 125 | log_mel_spec = torch.log10(torch.clamp(mel_output, min=self.clamp)) 126 | 127 | # log_mel_spec: B x C, M, T 128 | T_ = log_mel_spec.shape[-1] 129 | log_mel_spec = log_mel_spec.reshape(B, C, self.n_mel_channels ,T_) 130 | log_mel_spec = log_mel_spec.permute(0, 3, 1, 2) 131 | 132 | # print('og_mel_spec:', log_mel_spec.shape) 133 | log_mel_spec = log_mel_spec.squeeze(2) # mono 134 | return log_mel_spec 135 | 136 | 137 | class Sins(torch.nn.Module): 138 | def __init__(self, 139 | sampling_rate, 140 | block_size, 141 | win_length, 142 | use_mean_filter, 143 | n_harmonics, 144 | n_mag_noise, 145 | n_mels=80): 146 | super().__init__() 147 | 148 | print(' 
[DDSP Model] Sinusoids Additive Synthesiser') 149 | 150 | # params 151 | self.register_buffer("sampling_rate", torch.tensor(sampling_rate)) 152 | self.register_buffer("block_size", torch.tensor(block_size)) 153 | self.register_buffer("win_length", torch.tensor(win_length)) 154 | self.register_buffer("window", torch.hann_window(win_length)) 155 | # Mel2Control 156 | split_map = { 157 | 'amplitudes': n_harmonics, 158 | 'harmonic_phase': win_length // 2 + 1, 159 | 'noise_magnitude': n_mag_noise, 160 | 'noise_phase': n_mag_noise, 161 | } 162 | self.mel2ctrl = Mel2Control(n_mels, block_size, split_map) 163 | # mean filter kernel size 164 | if use_mean_filter: 165 | self.mean_kernel_size = win_length // block_size 166 | else: 167 | self.mean_kernel_size = 1 168 | 169 | def fast_phase_gen(self, f0_frames): 170 | n = torch.arange(self.block_size, device=f0_frames.device) 171 | s0 = f0_frames / self.sampling_rate 172 | ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) 173 | rad = s0 * (n + 1) + 0.5 * ds0 * n * (n + 1) / self.block_size 174 | rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 175 | rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0_frames) 176 | rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) 177 | phase = 2 * np.pi * rad.reshape(f0_frames.shape[0], -1, 1) 178 | return phase 179 | 180 | def forward(self, 181 | mel_frames, 182 | f0_frames, 183 | output_f0_frames=None, 184 | infer=True, 185 | max_upsample_dim=32): 186 | ''' 187 | mel_frames: B x n_frames x n_mels 188 | f0_frames: B x n_frames x 1 189 | ''' 190 | # exciter phase 191 | phase = self.fast_phase_gen(f0_frames) 192 | 193 | # sinusoid exciter signal 194 | sinusoid = torch.sin(phase).squeeze(-1) 195 | sinusoid_frames = sinusoid.unfold(1, self.block_size, self.block_size) 196 | 197 | # noise exciter signal 198 | noise = torch.randn_like(sinusoid) 199 | noise_frames = noise.unfold(1, self.block_size, self.block_size) 200 | 201 | # parameter prediction 202 | ctrls = self.mel2ctrl(mel_frames, sinusoid_frames, noise_frames) 203 | if self.mean_kernel_size > 1: 204 | ctrls['amplitudes'] = mean_filter(ctrls['amplitudes'], self.mean_kernel_size) 205 | ctrls['harmonic_phase'] = mean_filter(ctrls['harmonic_phase'], self.mean_kernel_size) 206 | 207 | src_allpass = torch.exp(1.j * np.pi * ctrls['harmonic_phase']) 208 | src_allpass = torch.cat((src_allpass, src_allpass[:,-1:,:]), 1) 209 | amplitudes_frames = torch.exp(ctrls['amplitudes'])/ 128 210 | noise_param = torch.exp(ctrls['noise_magnitude'] + 1.j * np.pi * ctrls['noise_phase']) / 128 211 | 212 | # harmonic additive synthesis 213 | if infer and output_f0_frames is not None: 214 | f0_frames = output_f0_frames 215 | phase = self.fast_phase_gen(output_f0_frames) 216 | n_harmonic = amplitudes_frames.shape[-1] 217 | level_harmonic = torch.arange(1, n_harmonic + 1, device=phase.device) 218 | mask = (f0_frames * level_harmonic < self.sampling_rate / 2).float() + 1e-7 219 | amplitudes_frames *= mask 220 | sinusoids = 0. 
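# Note on the loop below: the additive synthesis of the harmonic part is chunked
# over the harmonic axis, handling at most max_upsample_dim harmonics per
# iteration so that a full [batch, n_samples, n_harmonics] tensor is never
# materialized at audio rate. For each chunk, the k-th harmonic phase is k times
# the fundamental phase, and the frame-rate amplitudes are linearly upsampled
# (core.upsample) to sample rate before the sinusoids are summed into the
# `sinusoids` accumulator initialized above.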
221 | for n in range(( n_harmonic - 1) // max_upsample_dim + 1): 222 | start = n * max_upsample_dim 223 | end = (n + 1) * max_upsample_dim 224 | phases = phase * level_harmonic[start:end] 225 | amplitudes = upsample(amplitudes_frames[:,:,start:end], self.block_size) 226 | sinusoids += (torch.sin(phases) * amplitudes).sum(-1) 227 | 228 | # harmonic part filter (all pass) 229 | harmonic_spec = torch.stft( 230 | sinusoids, 231 | n_fft = self.win_length, 232 | win_length = self.win_length, 233 | hop_length = self.block_size, 234 | window = self.window, 235 | center = True, 236 | return_complex = True) 237 | harmonic_spec = harmonic_spec * src_allpass.permute(0, 2, 1) 238 | harmonic = torch.istft( 239 | harmonic_spec, 240 | n_fft = self.win_length, 241 | win_length = self.win_length, 242 | hop_length = self.block_size, 243 | window = self.window, 244 | center = True) 245 | 246 | # noise part filter (using constant-windowed LTV-FIR) 247 | noise = frequency_filter( 248 | noise, 249 | noise_param) 250 | 251 | signal = harmonic + noise 252 | 253 | return signal, sinusoids, (harmonic, noise) 254 | 255 | 256 | class CombSub(torch.nn.Module): 257 | def __init__(self, 258 | sampling_rate, 259 | block_size, 260 | win_length, 261 | use_mean_filter, 262 | n_mag_harmonic, 263 | n_mag_noise, 264 | n_mels=80): 265 | super().__init__() 266 | 267 | print(' [DDSP Model] Combtooth Subtractive Synthesiser') 268 | # params 269 | self.register_buffer("sampling_rate", torch.tensor(sampling_rate)) 270 | self.register_buffer("block_size", torch.tensor(block_size)) 271 | self.register_buffer("win_length", torch.tensor(win_length)) 272 | self.register_buffer("window", torch.hann_window(win_length)) 273 | # Mel2Control 274 | split_map = { 275 | 'harmonic_magnitude': n_mag_harmonic, 276 | 'harmonic_phase': win_length // 2 + 1, 277 | 'noise_magnitude': n_mag_noise, 278 | 'noise_phase': n_mag_noise, 279 | } 280 | self.mel2ctrl = Mel2Control(n_mels, block_size, split_map) 281 | # mean filter kernel size 282 | if use_mean_filter: 283 | self.mean_kernel_size = win_length // block_size 284 | else: 285 | self.mean_kernel_size = 1 286 | 287 | def fast_source_gen(self, f0_frames): 288 | n = torch.arange(self.block_size, device=f0_frames.device) 289 | s0 = f0_frames / self.sampling_rate 290 | ds0 = F.pad(s0[:, 1:, :] - s0[:, :-1, :], (0, 0, 0, 1)) 291 | rad = s0 * (n + 1) + 0.5 * ds0 * n * (n + 1) / self.block_size 292 | s0 = s0 + ds0 * n / self.block_size 293 | rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5 294 | rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0_frames) 295 | rad += F.pad(rad_acc[:, :-1, :], (0, 0, 1, 0)) 296 | rad -= torch.round(rad) 297 | combtooth = torch.sinc(rad / (s0 + 1e-5)).reshape(f0_frames.shape[0], -1) 298 | return combtooth 299 | 300 | def forward(self, 301 | mel_frames, 302 | f0_frames, 303 | output_f0_frames=None, 304 | infer=True, 305 | **kwargs): 306 | ''' 307 | mel_frames: B x n_frames x n_mels 308 | f0_frames: B x n_frames x 1 309 | ''' 310 | 311 | # combtooth exciter signal 312 | combtooth = self.fast_source_gen(f0_frames) 313 | combtooth_frames = combtooth.unfold(1, self.block_size, self.block_size) 314 | 315 | # noise exciter signal 316 | noise = torch.randn_like(combtooth) 317 | noise_frames = noise.unfold(1, self.block_size, self.block_size) 318 | 319 | # parameter prediction 320 | ctrls = self.mel2ctrl(mel_frames, combtooth_frames, noise_frames) 321 | if self.mean_kernel_size > 1: 322 | ctrls['harmonic_magnitude'] = mean_filter(ctrls['harmonic_magnitude'], self.mean_kernel_size) 323 | 
ctrls['harmonic_phase'] = mean_filter(ctrls['harmonic_phase'], self.mean_kernel_size) 324 | 325 | src_allpass = torch.exp(1.j * np.pi * ctrls['harmonic_phase']) 326 | src_allpass = torch.cat((src_allpass, src_allpass[:,-1:,:]), 1) 327 | src_param = torch.exp(ctrls['harmonic_magnitude']) 328 | noise_param = torch.exp(ctrls['noise_magnitude'] + 1.j * np.pi * ctrls['noise_phase']) / 128 329 | 330 | # harmonic part filter (using dynamic-windowed LTV-FIR) 331 | if infer and output_f0_frames is not None: 332 | f0_frames = output_f0_frames 333 | combtooth = self.fast_source_gen(output_f0_frames) 334 | harmonic = frequency_filter( 335 | combtooth, 336 | torch.complex(src_param, torch.zeros_like(src_param)), 337 | hann_window = True, 338 | half_width_frames = 1.5 * self.sampling_rate / (f0_frames + 1e-3)) 339 | 340 | # harmonic part filter (all pass) 341 | harmonic_spec = torch.stft( 342 | harmonic, 343 | n_fft = self.win_length, 344 | win_length = self.win_length, 345 | hop_length = self.block_size, 346 | window = self.window, 347 | center = True, 348 | return_complex = True) 349 | harmonic_spec = harmonic_spec * src_allpass.permute(0, 2, 1) 350 | harmonic = torch.istft( 351 | harmonic_spec, 352 | n_fft = self.win_length, 353 | win_length = self.win_length, 354 | hop_length = self.block_size, 355 | window = self.window, 356 | center = True) 357 | 358 | # noise part filter (using constant-windowed LTV-FIR) 359 | noise = frequency_filter( 360 | noise, 361 | noise_param) 362 | 363 | signal = harmonic + noise 364 | 365 | return signal, combtooth, (harmonic, noise) --------------------------------------------------------------------------------
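A minimal end-to-end inference sketch assembled only from the modules above; it is not part of the repository. The checkpoint path, the input/output file names, and the constant 220 Hz f0 track are placeholders for illustration; a real f0 curve should come from the same extractor configured for preprocessing ('parselmouth', 'dio' or 'harvest') and be frame-aligned with the mel.

import librosa
import soundfile as sf
import torch

from ddsp.vocoder import load_model, Audio2Mel

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# hypothetical checkpoint path; use the file written by train.py under env.expdir
model, args = load_model('exp/combsub-test/model_best.pt', device=device)

# waveform -> log10 mel spectrogram, using the same front end as training
audio, _ = librosa.load('input.wav', sr=args.data.sampling_rate)
audio = torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(0).to(device)  # B x C x T
mel_extractor = Audio2Mel(
    hop_length=args.data.block_size,
    sampling_rate=args.data.sampling_rate,
    n_mel_channels=args.data.n_mels,
    win_length=args.data.win_length,
    n_fft=args.data.n_fft,
    mel_fmin=args.data.mel_fmin,
    mel_fmax=args.data.mel_fmax).to(device)
mel = mel_extractor(audio)  # B x n_frames x n_mels

# dummy frame-aligned f0 (B x n_frames x 1); replace with a real pitch track
f0 = torch.full((1, mel.shape[1], 1), 220.0, device=device)

with torch.no_grad():
    signal, exciter, (harmonic, noise) = model(mel, f0, infer=True)

sf.write('output.wav', signal.squeeze(0).cpu().numpy(), args.data.sampling_rate)

For pitch-shifted re-synthesis, both synthesisers also accept output_f0_frames so the exciter can be regenerated from a different pitch curve than the one that produced the mel, and Audio2Mel.forward exposes keyshift and speed arguments for transposed or time-scaled analysis.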