├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── checkpoints └── .gitkeep ├── configs ├── config_base.yaml ├── singing │ ├── base.yaml │ └── fs2.yaml └── tts │ ├── base.yaml │ ├── base_zh.yaml │ ├── fs2.yaml │ ├── hifigan.yaml │ ├── lj │ ├── base_mel2wav.yaml │ ├── base_text2mel.yaml │ ├── fs2.yaml │ ├── hifigan.yaml │ └── pwg.yaml │ └── pwg.yaml ├── data └── processed │ └── ljspeech │ ├── dict.txt │ ├── metadata_phone.csv │ ├── mfa_dict.txt │ └── phone_set.json ├── data_gen ├── singing │ └── binarize.py └── tts │ ├── base_binarizer.py │ ├── bin │ └── binarize.py │ ├── binarizer_zh.py │ ├── data_gen_utils.py │ └── txt_processors │ ├── base_text_processor.py │ ├── en.py │ ├── zh.py │ └── zh_g2pM.py ├── docs ├── README-SVS-opencpop-cascade.md ├── README-SVS-opencpop-e2e.md ├── README-SVS-opencpop-pndm.md ├── README-SVS-popcs.md ├── README-SVS.md ├── README-TTS-pndm.md └── README-TTS.md ├── inference └── svs │ ├── base_svs_infer.py │ ├── ds_cascade.py │ ├── ds_e2e.py │ ├── gradio │ ├── gradio_settings.yaml │ └── infer.py │ └── opencpop │ ├── cpop_pinyin2ph.txt │ └── map.py ├── modules ├── __init__.py ├── commons │ ├── common_layers.py │ ├── espnet_positional_embedding.py │ └── ssim.py ├── diffsinger_midi │ └── fs2.py ├── fastspeech │ ├── fs2.py │ ├── pe.py │ └── tts_modules.py ├── hifigan │ ├── hifigan.py │ └── mel_utils.py └── parallel_wavegan │ ├── __init__.py │ ├── layers │ ├── __init__.py │ ├── causal_conv.py │ ├── pqmf.py │ ├── residual_block.py │ ├── residual_stack.py │ ├── tf_layers.py │ └── upsample.py │ ├── losses │ ├── __init__.py │ └── stft_loss.py │ ├── models │ ├── __init__.py │ ├── melgan.py │ ├── parallel_wavegan.py │ └── source.py │ ├── optimizers │ ├── __init__.py │ └── radam.py │ ├── stft_loss.py │ └── utils │ ├── __init__.py │ └── utils.py ├── requirements.txt ├── requirements_2080.txt ├── requirements_3090.txt ├── resources ├── apply_form.md ├── diffspeech-fs2-1.png ├── diffspeech-fs2-2.png ├── diffspeech-fs2.png ├── model_a.png ├── model_b.png └── tfb.png ├── tasks ├── base_task.py ├── run.py └── tts │ ├── fs2.py │ ├── fs2_utils.py │ ├── pe.py │ └── tts.py ├── usr ├── .gitkeep ├── __init__.py ├── configs │ ├── base.yaml │ ├── lj_ds_beta6.yaml │ ├── lj_ds_pndm.yaml │ ├── midi │ │ ├── cascade │ │ │ └── opencs │ │ │ │ ├── aux_rel.yaml │ │ │ │ ├── ds60_rel.yaml │ │ │ │ └── opencpop_statis.yaml │ │ ├── e2e │ │ │ ├── opencpop │ │ │ │ ├── ds1000.yaml │ │ │ │ └── ds100_adj_rel.yaml │ │ │ └── popcs │ │ │ │ └── ds100_adj_rel.yaml │ │ └── pe.yaml │ ├── popcs_ds_beta6.yaml │ ├── popcs_ds_beta6_offline.yaml │ └── popcs_fs2.yaml ├── diff │ ├── candidate_decoder.py │ ├── diffusion.py │ ├── net.py │ └── shallow_diffusion_tts.py ├── diffsinger_task.py ├── diffspeech_task.py └── task.py ├── utils ├── __init__.py ├── audio.py ├── cwt.py ├── hparams.py ├── indexed_datasets.py ├── multiprocess_utils.py ├── pitch_utils.py ├── pl_utils.py ├── plot.py ├── text_encoder.py ├── text_norm.py ├── training_utils.py └── tts_utils.py └── vocoders ├── __init__.py ├── base_vocoder.py ├── hifigan.py ├── pwg.py └── vocoder_utils.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: RayeRen # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a 
single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | __pycache__/ 4 | *.sh 5 | local_tools/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jinglin Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/checkpoints/.gitkeep -------------------------------------------------------------------------------- /configs/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 
4 | infer: false # infer 5 | seed: 1234 6 | debug: false 7 | save_codes: 8 | - configs 9 | - modules 10 | - tasks 11 | - utils 12 | - usr 13 | 14 | ############# 15 | # dataset 16 | ############# 17 | ds_workers: 1 18 | test_num: 100 19 | valid_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | load_ckpt: '' 27 | save_ckpt: true 28 | save_best: false 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | max_epochs: 1000 37 | max_updates: 160000 38 | max_tokens: 31250 39 | max_sentences: 100000 40 | max_eval_tokens: -1 41 | max_eval_sentences: -1 42 | test_input_dir: '' 43 | -------------------------------------------------------------------------------- /configs/singing/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/base.yaml 3 | - configs/tts/base_zh.yaml 4 | 5 | 6 | datasets: [] 7 | test_prefixes: [] 8 | test_num: 0 9 | valid_num: 0 10 | 11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign 12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer 13 | pre_align_args: 14 | use_tone: false # for ZH 15 | forced_align: mfa 16 | use_sox: true 17 | hop_size: 128 # Hop size. 18 | fft_size: 512 # FFT size. 19 | win_size: 512 # FFT size. 20 | max_frames: 8000 21 | fmin: 50 # Minimum freq in mel basis calculation. 22 | fmax: 11025 # Maximum frequency in mel basis calculation. 23 | pitch_type: frame 24 | 25 | hidden_size: 256 26 | mel_loss: "ssim:0.5|l1:0.5" 27 | lambda_f0: 0.0 28 | lambda_uv: 0.0 29 | lambda_energy: 0.0 30 | lambda_ph_dur: 0.0 31 | lambda_sent_dur: 0.0 32 | lambda_word_dur: 0.0 33 | predictor_grad: 0.0 34 | use_spk_embed: true 35 | use_spk_id: false 36 | 37 | max_tokens: 20000 38 | max_updates: 400000 39 | num_spk: 100 40 | save_f0: true 41 | use_gt_dur: true 42 | use_gt_f0: true 43 | -------------------------------------------------------------------------------- /configs/singing/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | -------------------------------------------------------------------------------- /configs/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: configs/config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | use_tone: true # for ZH 15 | forced_align: mfa 16 | use_sox: false 17 | txt_processor: en 18 | allow_no_txt: false 19 | denoise: false 20 | binarization_args: 21 | shuffle: false 22 | with_txt: true 23 | with_wav: false 24 | with_align: true 25 | with_spk_embed: true 26 | with_f0: true 27 | with_f0cwt: true 28 | 29 | loud_norm: false 30 | endless_ds: true 31 | reset_phone_dict: true 32 | 33 | test_num: 100 34 | valid_num: 100 35 | max_frames: 1550 36 | max_input_tokens: 1550 37 | audio_num_mel_bins: 80 38 | audio_sample_rate: 22050 39 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 40 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 41 
| fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 42 | fmax: 7600 # To be increased/reduced depending on data. 43 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 44 | min_level_db: -100 45 | num_spk: 1 46 | mel_vmin: -6 47 | mel_vmax: 1.5 48 | ds_workers: 4 49 | 50 | ######### 51 | # model 52 | ######### 53 | dropout: 0.1 54 | enc_layers: 4 55 | dec_layers: 4 56 | hidden_size: 384 57 | num_heads: 2 58 | prenet_dropout: 0.5 59 | prenet_hidden_size: 256 60 | stop_token_weight: 5.0 61 | enc_ffn_kernel_size: 9 62 | dec_ffn_kernel_size: 9 63 | ffn_act: gelu 64 | ffn_padding: 'SAME' 65 | 66 | 67 | ########### 68 | # optimization 69 | ########### 70 | lr: 2.0 71 | warmup_updates: 8000 72 | optimizer_adam_beta1: 0.9 73 | optimizer_adam_beta2: 0.98 74 | weight_decay: 0 75 | clip_grad_norm: 1 76 | 77 | 78 | ########### 79 | # train and eval 80 | ########### 81 | max_tokens: 30000 82 | max_sentences: 100000 83 | max_eval_sentences: 1 84 | max_eval_tokens: 60000 85 | train_set_name: 'train' 86 | valid_set_name: 'valid' 87 | test_set_name: 'test' 88 | vocoder: pwg 89 | vocoder_ckpt: '' 90 | profile_infer: false 91 | out_wav_norm: false 92 | save_gt: false 93 | save_f0: false 94 | gen_dir_name: '' 95 | use_denoise: false 96 | -------------------------------------------------------------------------------- /configs/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | pre_align_args: 2 | txt_processor: zh_g2pM 3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer -------------------------------------------------------------------------------- /configs/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | encoder_type: fft # fft|tacotron|tacotron2|conformer 8 | encoder_K: 8 # for tacotron encoder 9 | decoder_type: fft # fft|rnn|conv|conformer 10 | use_pos_embed: true 11 | 12 | # duration 13 | predictor_hidden: -1 14 | predictor_kernel: 5 15 | predictor_layers: 2 16 | dur_predictor_kernel: 3 17 | dur_predictor_layers: 2 18 | predictor_dropout: 0.5 19 | 20 | # pitch and energy 21 | use_pitch_embed: true 22 | pitch_type: ph # frame|ph|cwt 23 | use_uv: true 24 | cwt_hidden_size: 128 25 | cwt_layers: 2 26 | cwt_loss: l1 27 | cwt_add_f0_loss: false 28 | cwt_std_scale: 0.8 29 | 30 | pitch_ar: false 31 | #pitch_embed_type: 0q 32 | pitch_loss: 'l1' # l1|l2|ssim 33 | pitch_norm: log 34 | use_energy_embed: false 35 | 36 | # reference encoder and speaker embedding 37 | use_spk_id: false 38 | use_split_spk_id: false 39 | use_spk_embed: false 40 | use_var_enc: false 41 | lambda_commit: 0.25 42 | ref_norm_layer: bn 43 | pitch_enc_hidden_stride_kernel: 44 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 45 | - 0,2,5 46 | - 0,2,5 47 | dur_enc_hidden_stride_kernel: 48 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. 
conv_hidden_size=0: use hidden_size 49 | - 0,2,3 50 | - 0,1,3 51 | 52 | 53 | # mel 54 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 55 | 56 | # loss lambda 57 | lambda_f0: 1.0 58 | lambda_uv: 1.0 59 | lambda_energy: 0.1 60 | lambda_ph_dur: 1.0 61 | lambda_sent_dur: 1.0 62 | lambda_word_dur: 1.0 63 | predictor_grad: 0.1 64 | 65 | # train and eval 66 | pretrain_fs_ckpt: '' 67 | warmup_updates: 2000 68 | max_tokens: 32000 69 | max_sentences: 100000 70 | max_eval_sentences: 1 71 | max_updates: 120000 72 | num_valid_plots: 5 73 | num_test_samples: 0 74 | test_ids: [] 75 | use_gt_dur: false 76 | use_gt_f0: false 77 | 78 | # exp 79 | dur_loss: mse # huber|mol 80 | norm_type: gn -------------------------------------------------------------------------------- /configs/tts/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/pwg.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 128 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | lambda_mel: 45.0 13 | 14 | max_samples: 8192 15 | max_sentences: 16 16 | 17 | generator_params: 18 | lr: 0.0002 # Generator's learning rate. 19 | aux_context_window: 0 # Context window size for auxiliary feature. 20 | discriminator_optimizer_params: 21 | lr: 0.0002 # Discriminator's learning rate. -------------------------------------------------------------------------------- /configs/tts/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | -------------------------------------------------------------------------------- /configs/tts/lj/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech' 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign 5 | 6 | pitch_type: cwt 7 | mel_loss: l1 8 | num_test_samples: 20 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294, 10 | 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] 11 | use_energy_embed: false 12 | test_num: 523 13 | valid_num: 348 -------------------------------------------------------------------------------- /configs/tts/lj/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/tts/lj/base_text2mel.yaml -------------------------------------------------------------------------------- /configs/tts/lj/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/hifigan.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /configs/tts/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/pwg.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /configs/tts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 
configs/tts/base.yaml 2 | task_cls: tasks.vocoder.pwg.PwgTask 3 | 4 | binarization_args: 5 | with_wav: true 6 | with_spk_embed: false 7 | with_align: false 8 | test_input_dir: '' 9 | 10 | ########### 11 | # train and eval 12 | ########### 13 | max_samples: 25600 14 | max_sentences: 5 15 | max_eval_sentences: 1 16 | max_updates: 1000000 17 | val_check_interval: 2000 18 | 19 | 20 | ########################################################### 21 | # FEATURE EXTRACTION SETTING # 22 | ########################################################### 23 | sampling_rate: 22050 # Sampling rate. 24 | fft_size: 1024 # FFT size. 25 | hop_size: 256 # Hop size. 26 | win_length: null # Window length. 27 | # If set to null, it will be the same as fft_size. 28 | window: "hann" # Window function. 29 | num_mels: 80 # Number of mel basis. 30 | fmin: 80 # Minimum freq in mel basis calculation. 31 | fmax: 7600 # Maximum frequency in mel basis calculation. 32 | format: "hdf5" # Feature file format. "npy" or "hdf5" is supported. 33 | 34 | ########################################################### 35 | # GENERATOR NETWORK ARCHITECTURE SETTING # 36 | ########################################################### 37 | generator_params: 38 | in_channels: 1 # Number of input channels. 39 | out_channels: 1 # Number of output channels. 40 | kernel_size: 3 # Kernel size of dilated convolution. 41 | layers: 30 # Number of residual block layers. 42 | stacks: 3 # Number of stacks i.e., dilation cycles. 43 | residual_channels: 64 # Number of channels in residual conv. 44 | gate_channels: 128 # Number of channels in gated conv. 45 | skip_channels: 64 # Number of channels in skip conv. 46 | aux_channels: 80 # Number of channels for auxiliary feature conv. 47 | # Must be the same as num_mels. 48 | aux_context_window: 2 # Context window size for auxiliary feature. 49 | # If set to 2, previous 2 and future 2 frames will be considered. 50 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 51 | use_weight_norm: true # Whether to use weight norm. 52 | # If set to true, it will be applied to all of the conv layers. 53 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 54 | upsample_params: # Upsampling network parameters. 55 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size. 56 | use_pitch_embed: false 57 | 58 | ########################################################### 59 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 60 | ########################################################### 61 | discriminator_params: 62 | in_channels: 1 # Number of input channels. 63 | out_channels: 1 # Number of output channels. 64 | kernel_size: 3 # Number of output channels. 65 | layers: 10 # Number of conv layers. 66 | conv_channels: 64 # Number of chnn layers. 67 | bias: true # Whether to use bias parameter in conv. 68 | use_weight_norm: true # Whether to use weight norm. 69 | # If set to true, it will be applied to all of the conv layers. 70 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 71 | nonlinear_activation_params: # Nonlinear function parameters 72 | negative_slope: 0.2 # Alpha in LeakyReLU. 73 | 74 | ########################################################### 75 | # STFT LOSS SETTING # 76 | ########################################################### 77 | stft_loss_params: 78 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
79 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 80 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 81 | window: "hann_window" # Window function for STFT-based loss 82 | use_mel_loss: false 83 | 84 | ########################################################### 85 | # ADVERSARIAL LOSS SETTING # 86 | ########################################################### 87 | lambda_adv: 4.0 # Loss balancing coefficient. 88 | 89 | ########################################################### 90 | # OPTIMIZER & SCHEDULER SETTING # 91 | ########################################################### 92 | generator_optimizer_params: 93 | lr: 0.0001 # Generator's learning rate. 94 | eps: 1.0e-6 # Generator's epsilon. 95 | weight_decay: 0.0 # Generator's weight decay coefficient. 96 | generator_scheduler_params: 97 | step_size: 200000 # Generator's scheduler step size. 98 | gamma: 0.5 # Generator's scheduler gamma. 99 | # At each step size, lr will be multiplied by this parameter. 100 | generator_grad_norm: 10 # Generator's gradient norm. 101 | discriminator_optimizer_params: 102 | lr: 0.00005 # Discriminator's learning rate. 103 | eps: 1.0e-6 # Discriminator's epsilon. 104 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 105 | discriminator_scheduler_params: 106 | step_size: 200000 # Discriminator's scheduler step size. 107 | gamma: 0.5 # Discriminator's scheduler gamma. 108 | # At each step size, lr will be multiplied by this parameter. 109 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 110 | disc_start_steps: 40000 # Number of steps to start to train discriminator. 111 | -------------------------------------------------------------------------------- /data/processed/ljspeech/dict.txt: -------------------------------------------------------------------------------- 1 | ! ! 2 | , , 3 | . . 4 | ; ; 5 | 6 | 7 | ? ? 
8 | AA0 AA0 9 | AA1 AA1 10 | AA2 AA2 11 | AE0 AE0 12 | AE1 AE1 13 | AE2 AE2 14 | AH0 AH0 15 | AH1 AH1 16 | AH2 AH2 17 | AO0 AO0 18 | AO1 AO1 19 | AO2 AO2 20 | AW0 AW0 21 | AW1 AW1 22 | AW2 AW2 23 | AY0 AY0 24 | AY1 AY1 25 | AY2 AY2 26 | B B 27 | CH CH 28 | D D 29 | DH DH 30 | EH0 EH0 31 | EH1 EH1 32 | EH2 EH2 33 | ER0 ER0 34 | ER1 ER1 35 | ER2 ER2 36 | EY0 EY0 37 | EY1 EY1 38 | EY2 EY2 39 | F F 40 | G G 41 | HH HH 42 | IH0 IH0 43 | IH1 IH1 44 | IH2 IH2 45 | IY0 IY0 46 | IY1 IY1 47 | IY2 IY2 48 | JH JH 49 | K K 50 | L L 51 | M M 52 | N N 53 | NG NG 54 | OW0 OW0 55 | OW1 OW1 56 | OW2 OW2 57 | OY0 OY0 58 | OY1 OY1 59 | OY2 OY2 60 | P P 61 | R R 62 | S S 63 | SH SH 64 | T T 65 | TH TH 66 | UH0 UH0 67 | UH1 UH1 68 | UH2 UH2 69 | UW0 UW0 70 | UW1 UW1 71 | UW2 UW2 72 | V V 73 | W W 74 | Y Y 75 | Z Z 76 | ZH ZH 77 | | | 78 | -------------------------------------------------------------------------------- /data/processed/ljspeech/phone_set.json: -------------------------------------------------------------------------------- 1 | ["!", ",", ".", ";", "", "", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"] -------------------------------------------------------------------------------- /data_gen/tts/bin/binarize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | import importlib 6 | from utils.hparams import set_hparams, hparams 7 | 8 | 9 | def binarize(): 10 | binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer') 11 | pkg = ".".join(binarizer_cls.split(".")[:-1]) 12 | cls_name = binarizer_cls.split(".")[-1] 13 | binarizer_cls = getattr(importlib.import_module(pkg), cls_name) 14 | print("| Binarizer: ", binarizer_cls) 15 | binarizer_cls().process() 16 | 17 | 18 | if __name__ == '__main__': 19 | set_hparams() 20 | binarize() 21 | -------------------------------------------------------------------------------- /data_gen/tts/binarizer_zh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU 6 | from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError 7 | from data_gen.tts.data_gen_utils import get_mel2ph 8 | from utils.hparams import set_hparams, hparams 9 | import numpy as np 10 | 11 | 12 | class ZhBinarizer(BaseBinarizer): 13 | @staticmethod 14 | def get_align(tg_fn, ph, mel, phone_encoded, res): 15 | if tg_fn is not None and os.path.exists(tg_fn): 16 | _, dur = get_mel2ph(tg_fn, ph, mel, hparams) 17 | else: 18 | raise BinarizationError(f"Align not found") 19 | ph_list = ph.split(" ") 20 | assert len(dur) == len(ph_list) 21 | mel2ph = [] 22 | # 分隔符的时长分配给韵母 23 | dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0) 24 | for i in range(len(dur)): 25 | p = ph_list[i] 26 | if p[0] != '<' and not p[0].isalpha(): 27 | uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0 28 | j = 0 29 | while j < len(uv_) and not uv_[j]: 30 | j += 1 31 | dur[i - 1] += j 32 | dur[i] -= j 33 | if 
dur[i] < 100: 34 | dur[i - 1] += dur[i] 35 | dur[i] = 0 36 | # 声母和韵母等长 37 | for i in range(len(dur)): 38 | p = ph_list[i] 39 | if p in ALL_SHENMU: 40 | p_next = ph_list[i + 1] 41 | if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU): 42 | print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, " 43 | f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.") 44 | continue 45 | total = dur[i + 1] + dur[i] 46 | dur[i] = total // 2 47 | dur[i + 1] = total - dur[i] 48 | for i in range(len(dur)): 49 | mel2ph += [i + 1] * dur[i] 50 | mel2ph = np.array(mel2ph) 51 | if mel2ph.max() - 1 >= len(phone_encoded): 52 | raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}") 53 | res['mel2ph'] = mel2ph 54 | res['dur'] = dur 55 | 56 | 57 | if __name__ == "__main__": 58 | set_hparams() 59 | ZhBinarizer().process() 60 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/base_text_processor.py: -------------------------------------------------------------------------------- 1 | class BaseTxtProcessor: 2 | @staticmethod 3 | def sp_phonemes(): 4 | return ['|'] 5 | 6 | @classmethod 7 | def process(cls, txt, pre_align_args): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/en.py: -------------------------------------------------------------------------------- 1 | import re 2 | from data_gen.tts.data_gen_utils import PUNCS 3 | from g2p_en import G2p 4 | import unicodedata 5 | from g2p_en.expand import normalize_numbers 6 | from nltk import pos_tag 7 | from nltk.tokenize import TweetTokenizer 8 | 9 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 10 | 11 | 12 | class EnG2p(G2p): 13 | word_tokenize = TweetTokenizer().tokenize 14 | 15 | def __call__(self, text): 16 | # preprocessing 17 | words = EnG2p.word_tokenize(text) 18 | tokens = pos_tag(words) # tuples of (word, tag) 19 | 20 | # steps 21 | prons = [] 22 | for word, pos in tokens: 23 | if re.search("[a-z]", word) is None: 24 | pron = [word] 25 | 26 | elif word in self.homograph2features: # Check homograph 27 | pron1, pron2, pos1 = self.homograph2features[word] 28 | if pos.startswith(pos1): 29 | pron = pron1 30 | else: 31 | pron = pron2 32 | elif word in self.cmu: # lookup CMU dict 33 | pron = self.cmu[word][0] 34 | else: # predict for oov 35 | pron = self.predict(word) 36 | 37 | prons.extend(pron) 38 | prons.extend([" "]) 39 | 40 | return prons[:-1] 41 | 42 | 43 | class TxtProcessor(BaseTxtProcessor): 44 | g2p = EnG2p() 45 | 46 | @staticmethod 47 | def preprocess_text(text): 48 | text = normalize_numbers(text) 49 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 50 | if unicodedata.category(char) != 'Mn') # Strip accents 51 | text = text.lower() 52 | text = re.sub("[\'\"()]+", "", text) 53 | text = re.sub("[-]+", " ", text) 54 | text = re.sub(f"[^ a-z{PUNCS}]", "", text) 55 | text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> ! 56 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 
57 | text = text.replace("i.e.", "that is") 58 | text = text.replace("i.e.", "that is") 59 | text = text.replace("etc.", "etc") 60 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 61 | text = re.sub(rf"\s+", r" ", text) 62 | return text 63 | 64 | @classmethod 65 | def process(cls, txt, pre_align_args): 66 | txt = cls.preprocess_text(txt).strip() 67 | phs = cls.g2p(txt) 68 | phs_ = [] 69 | n_word_sep = 0 70 | for p in phs: 71 | if p.strip() == '': 72 | phs_ += ['|'] 73 | n_word_sep += 1 74 | else: 75 | phs_ += p.split(" ") 76 | phs = phs_ 77 | assert n_word_sep + 1 == len(txt.split(" ")), (phs, f"\"{txt}\"") 78 | return phs, txt 79 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/zh.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pypinyin import pinyin, Style 3 | from data_gen.tts.data_gen_utils import PUNCS 4 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 5 | from utils.text_norm import NSWNormalizer 6 | 7 | 8 | class TxtProcessor(BaseTxtProcessor): 9 | table = {ord(f): ord(t) for f, t in zip( 10 | u':,。!?【】()%#@&1234567890', 11 | u':,.!?[]()%#@&1234567890')} 12 | 13 | @staticmethod 14 | def preprocess_text(text): 15 | text = text.translate(TxtProcessor.table) 16 | text = NSWNormalizer(text).normalize(remove_punc=False) 17 | text = re.sub("[\'\"()]+", "", text) 18 | text = re.sub("[-]+", " ", text) 19 | text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text) 20 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 21 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 22 | text = re.sub(rf"\s+", r"", text) 23 | return text 24 | 25 | @classmethod 26 | def process(cls, txt, pre_align_args): 27 | txt = cls.preprocess_text(txt) 28 | shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403 29 | yunmu_finals = pinyin(txt, style=Style.FINALS) 30 | yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3) 31 | yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \ 32 | if pre_align_args['use_tone'] else yunmu_finals 33 | 34 | assert len(shengmu) == len(yunmu) 35 | phs = ["|"] 36 | for a, b, c in zip(shengmu, yunmu, yunmu_finals): 37 | if a[0] == c[0]: 38 | phs += [a[0], "|"] 39 | else: 40 | phs += [a[0], b[0], "|"] 41 | return phs, txt 42 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/zh_g2pM.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors import zh 6 | from g2pM import G2pM 7 | 8 | ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 9 | 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'] 10 | ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 11 | 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou', 12 | 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] 13 | 14 | 15 | class TxtProcessor(zh.TxtProcessor): 16 | model = G2pM() 17 | 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|', '#'] 21 | 22 | @classmethod 23 | def process(cls, txt, pre_align_args): 24 | txt = cls.preprocess_text(txt) 25 | ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True) 26 | seg_list = 
'#'.join(jieba.cut(txt)) 27 | assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list) 28 | 29 | # 加入词边界'#' 30 | ph_list_ = [] 31 | seg_idx = 0 32 | for p in ph_list: 33 | p = p.replace("u:", "v") 34 | if seg_list[seg_idx] == '#': 35 | ph_list_.append('#') 36 | seg_idx += 1 37 | else: 38 | ph_list_.append("|") 39 | seg_idx += 1 40 | if re.findall('[\u4e00-\u9fff]', p): 41 | if pre_align_args['use_tone']: 42 | p = pinyin(p, style=Style.TONE3, strict=True)[0][0] 43 | if p[-1] not in ['1', '2', '3', '4', '5']: 44 | p = p + '5' 45 | else: 46 | p = pinyin(p, style=Style.NORMAL, strict=True)[0][0] 47 | 48 | finished = False 49 | if len([c.isalpha() for c in p]) > 1: 50 | for shenmu in ALL_SHENMU: 51 | if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric(): 52 | ph_list_ += [shenmu, p.lstrip(shenmu)] 53 | finished = True 54 | break 55 | if not finished: 56 | ph_list_.append(p) 57 | 58 | ph_list = ph_list_ 59 | 60 | # 去除静音符号周围的词边界标记 [..., '#', ',', '#', ...] 61 | sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes() 62 | ph_list_ = [] 63 | for i in range(0, len(ph_list), 1): 64 | if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes): 65 | ph_list_.append(ph_list[i]) 66 | ph_list = ph_list_ 67 | return ph_list, txt 68 | 69 | 70 | if __name__ == '__main__': 71 | phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True}) 72 | print(phs) 73 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-cascade.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | 6 | ## DiffSinger (MIDI SVS | A version) 7 | ### 0. Data Acquirement 8 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 9 | 10 | The pipeline below is designed for Opencpop dataset: 11 | 12 | ### 1. Preparation 13 | 14 | #### Data Preparation 15 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 16 | 17 | b) Run the following scripts to pack the dataset for training/inference. 18 | 19 | ```sh 20 | export PYTHONPATH=. 21 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 22 | 23 | # `data/binary/opencpop-midi-dp` will be generated. 24 | ``` 25 | 26 | #### Vocoder Preparation 27 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 28 | Please unzip this file into `checkpoints` before training your acoustic model. 
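For reference, a minimal sketch of this step (it assumes the zip was downloaded to the repository root and unpacks into a `0109_hifigan_bigpopcs_hop128/` folder, matching the directory layout shown below; adjust paths to your setup):
```sh
# Illustrative only: place the pre-trained singing vocoder under `checkpoints`.
mkdir -p checkpoints
unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
ls checkpoints/0109_hifigan_bigpopcs_hop128  # expected: the vocoder ckpt and its config.yaml
```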
29 | 30 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 31 | 32 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 33 | 34 | #### Exp Name Preparation 35 | ```bash 36 | export MY_FS_EXP_NAME=0302_opencpop_fs_midi 37 | export MY_DS_EXP_NAME=0303_opencpop_ds58_midi 38 | ``` 39 | 40 | ``` 41 | . 42 | |--data 43 | |--raw 44 | |--opencpop 45 | |--segments 46 | |--transcriptions.txt 47 | |--wavs 48 | |--checkpoints 49 | |--MY_FS_EXP_NAME (optional) 50 | |--MY_DS_EXP_NAME (optional) 51 | |--0109_hifigan_bigpopcs_hop128 52 | |--model_ckpt_steps_1512000.ckpt 53 | |--config.yaml 54 | ``` 55 | 56 | ### 2. Training Example 57 | First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch, run: 58 | ```sh 59 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset 60 | ``` 61 | 62 | Then, to train DiffSinger, run: 63 | 64 | ```sh 65 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset 66 | ``` 67 | 68 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path. 69 | 70 | ### 3. Inference from packed test set 71 | ```sh 72 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 73 | ``` 74 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 75 | 76 | We also provide: 77 | - the pre-trained model of DiffSinger; 78 | - the pre-trained model of FFT-Singer; 79 | 80 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip). 81 | 82 | Remember to put the pre-trained models in `checkpoints` directory. 83 | 84 | ### 4. 
Inference from raw inputs 85 | ```sh 86 | python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME 87 | ``` 88 | Raw inputs: 89 | ``` 90 | inp = { 91 | 'text': '小酒窝长睫毛AP是你最美的记号', 92 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 93 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 94 | 'input_type': 'word' 95 | } # user input: Chinese characters 96 | or, 97 | inp = { 98 | 'text': '小酒窝长睫毛AP是你最美的记号', 99 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 100 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 101 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 102 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 103 | 'input_type': 'phoneme' 104 | } # input like Opencpop dataset. 105 | ``` 106 | Here the inference results will be saved in `./infer_out` by default. 107 | 108 | ### 5. Some issues. 109 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 110 | 111 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[F0+ph_dur]) to predict F0 contour and phoneme duration. 112 | 113 | c) generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip). 114 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-e2e.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=SVSDemo)](https://huggingface.co/spaces/Silentlin/DiffSinger) 6 | 7 | Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust. 8 | **By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.** 9 | 10 | 简而言之,把F0曲线的动态性交给生成式模型去捕捉,而不再是以前那样用MSE约束对数域F0。 11 | 12 | ## DiffSinger (MIDI SVS | B version) 13 | ### 0. 
Data Acquirement 14 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 15 | 16 | The pipeline below is designed for Opencpop dataset: 17 | 18 | ### 1. Preparation 19 | 20 | #### Data Preparation 21 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 22 | 23 | b) Run the following scripts to pack the dataset for training/inference. 24 | 25 | ```sh 26 | export PYTHONPATH=. 27 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 28 | 29 | # `data/binary/opencpop-midi-dp` will be generated. 30 | ``` 31 | 32 | #### Vocoder Preparation 33 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 34 | 35 | Also, please unzip pre-trained vocoder and [this pendant for vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model. 36 | 37 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 38 | 39 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 40 | 41 | #### Exp Name Preparation 42 | ```bash 43 | export MY_DS_EXP_NAME=0228_opencpop_ds100_rel 44 | ``` 45 | 46 | ``` 47 | . 48 | |--data 49 | |--raw 50 | |--opencpop 51 | |--segments 52 | |--transcriptions.txt 53 | |--wavs 54 | |--checkpoints 55 | |--MY_DS_EXP_NAME (optional) 56 | |--0109_hifigan_bigpopcs_hop128 (vocoder) 57 | |--model_ckpt_steps_1512000.ckpt 58 | |--config.yaml 59 | ``` 60 | 61 | ### 2. Training Example 62 | ```sh 63 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset 64 | ``` 65 | 66 | ### 3. Inference from packed test set 67 | ```sh 68 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 69 | ``` 70 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 71 | 72 | We also provide: 73 | - the pre-trained model of DiffSinger; 74 | 75 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip). 76 | 77 | Remember to put the pre-trained models in `checkpoints` directory. 78 | 79 | ### 4. 
Inference from raw inputs 80 | ```sh 81 | python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME 82 | ``` 83 | Raw inputs: 84 | ``` 85 | inp = { 86 | 'text': '小酒窝长睫毛AP是你最美的记号', 87 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 88 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 89 | 'input_type': 'word' 90 | } # user input: Chinese characters 91 | or, 92 | inp = { 93 | 'text': '小酒窝长睫毛AP是你最美的记号', 94 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 95 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 96 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 97 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 98 | 'input_type': 'phoneme' 99 | } # input like Opencpop dataset. 100 | ``` 101 | Here the inference results will be saved in `./infer_out` by default. 102 | ### 5. Some issues. 103 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 104 | 105 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme duration. F0 curve is implicitly predicted together with mel-spectrogram. 106 | 107 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-pndm.md: -------------------------------------------------------------------------------- 1 | # DiffSinger-PNDM 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | 6 | Highlights: 7 | 8 | Training diffusion model: 1000 steps 9 | 10 | Default pndm_speedup: 40 11 | 12 | Inference diffusion model: (1000 / pndm_speedup) steps = 25 steps 13 | 14 | You can freely control the inference steps, by adding these arguments in your experiment scripts : 15 | --hparams="pndm_speedup=40" or --hparams="pndm_speedup=20" or --hparams="pndm_speedup=10". 16 | 17 | Contributed by @luping-liu . 18 | 19 | ## DiffSinger (MIDI SVS | B version | +PNDM) 20 | ### 0. Data Acquirement 21 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 22 | 23 | The pipeline below is designed for Opencpop dataset: 24 | 25 | ### 1. 
Preparation 26 | 27 | #### Data Preparation 28 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 29 | 30 | b) Run the following scripts to pack the dataset for training/inference. 31 | 32 | ```sh 33 | export PYTHONPATH=. 34 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 35 | 36 | # `data/binary/opencpop-midi-dp` will be generated. 37 | ``` 38 | 39 | #### Vocoder Preparation 40 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 41 | 42 | Also, please unzip pre-trained vocoder and [this pendant for vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model. 43 | 44 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 45 | 46 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 47 | 48 | #### Exp Name Preparation 49 | ```bash 50 | export MY_DS_EXP_NAME=0831_opencpop_ds1000 51 | ``` 52 | 53 | ``` 54 | . 55 | |--data 56 | |--raw 57 | |--opencpop 58 | |--segments 59 | |--transcriptions.txt 60 | |--wavs 61 | |--checkpoints 62 | |--MY_DS_EXP_NAME (optional) 63 | |--0109_hifigan_bigpopcs_hop128 (vocoder) 64 | |--model_ckpt_steps_1512000.ckpt 65 | |--config.yaml 66 | ``` 67 | 68 | ### 2. Training Example 69 | ```sh 70 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME --reset 71 | ``` 72 | 73 | ### 3. Inference from packed test set 74 | ```sh 75 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 76 | ``` 77 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 78 | 79 | We also provide: 80 | - the pre-trained model of DiffSinger; 81 | 82 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0831_opencpop_ds1000.zip). 83 | 84 | Remember to put the pre-trained models in `checkpoints` directory. 85 | 86 | ### 4. 
Inference from raw inputs 87 | ```sh 88 | python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME 89 | ``` 90 | Raw inputs: 91 | ``` 92 | inp = { 93 | 'text': '小酒窝长睫毛AP是你最美的记号', 94 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 95 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 96 | 'input_type': 'word' 97 | } # user input: Chinese characters 98 | or, 99 | inp = { 100 | 'text': '小酒窝长睫毛AP是你最美的记号', 101 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 102 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 103 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 104 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 105 | 'input_type': 'phoneme' 106 | } # input like Opencpop dataset. 107 | ``` 108 | Here the inference results will be saved in `./infer_out` by default. 109 | ### 5. Some issues. 110 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 111 | 112 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme duration. F0 curve is implicitly predicted together with mel-spectrogram. 113 | -------------------------------------------------------------------------------- /docs/README-SVS-popcs.md: -------------------------------------------------------------------------------- 1 | ## DiffSinger (SVS version) 2 | 3 | ### 0. Data Acquirement 4 | - [Download link](https://drive.google.com/file/d/1uFJmPEUWbzguGBdiuupYvYbBEjopN-Xq/view?usp=sharing). 5 | - Please note that, if you are using PopCS, it means that you have accepted the terms in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). 6 | 7 | ### 1. Preparation 8 | #### Data Preparation 9 | a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs` 10 | 11 | b) Run the following scripts to pack the dataset for training/inference. 12 | ```sh 13 | export PYTHONPATH=. 14 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml 15 | # `data/binary/popcs-pmf0` will be generated. 16 | ``` 17 | 18 | #### Vocoder Preparation 19 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 20 | Please unzip this file into `checkpoints` before training your acoustic model. 
21 | 22 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 23 | 24 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 25 | 26 | ### 2. Training Example 27 | First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch, run: 28 | 29 | ```sh 30 | # First, train fft-singer; 31 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset 32 | # Then, infer fft-singer; 33 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer 34 | ``` 35 | 36 | Then, to train DiffSinger, run: 37 | ```sh 38 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset 39 | ``` 40 | 41 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path. 42 | 43 | ### 3. Inference Example 44 | ```sh 45 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer 46 | ``` 47 | 48 | We also provide: 49 | - the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip); 50 | - the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger; 51 | 52 | Remember to put the pre-trained models in `checkpoints` directory. 53 | 54 | *Note that:* 55 | 56 | - *the original PWG version vocoder in the paper we used has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.* 57 | - *we assume the ground-truth F0 to be given as the pitch information following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms(like [MIDI-B-version](README-SVS-opencpop-e2e.md)).* 58 | 59 | [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020. 60 | 61 | [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020. 62 | 63 | [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020. 
64 | -------------------------------------------------------------------------------- /docs/README-SVS.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=SVSDemo)](https://huggingface.co/spaces/Silentlin/DiffSinger) 6 | 7 | ## DiffSinger (SVS) 8 | 9 | ### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md) 10 | In PART1, we only focus on spectrum modeling (acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information following these papers [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART2. 11 | 12 | Thus, the pipeline of this part can be summarized as: 13 | 14 | ``` 15 | [lyrics] -> [linguistic representation] (Frontend) 16 | [linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model) 17 | [mel-spectrogram] + [GT F0] -> [waveform] (Vocoder) 18 | ``` 19 | 20 | 21 | [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020. 22 | 23 | [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020. 24 | 25 | [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020. 26 | 27 | Click here for detailed instructions: [link](README-SVS-popcs.md). 28 | 29 | 30 | ### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md) 31 | Thanks [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI label, **Jan.20, 2022** (after we published our paper). 32 | 33 | Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend. 34 | 35 | #### 2.A 36 | Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as: 37 | 38 | ``` 39 | [lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend) 40 | [linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model) 41 | [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder) 42 | ``` 43 | 44 | Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md). 45 | 46 | #### 2.B 47 | In 2.1, we find that if we predict F0 explicitly in the melody frontend, there will be many bad cases of uv/v prediction. Then, we abandon the explicit prediction of the F0 curve in the melody frontend and make a joint prediction with spectrograms. 
48 | 49 | Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as: 50 | ``` 51 | [lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend) 52 | [linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model) 53 | [mel-spectrogram] -> [predicted F0] (Pitch extractor) 54 | [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder) 55 | ``` 56 | 57 | Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md). 58 | 59 | ### FAQ 60 | Q1: Why do I need F0 in vocoders? 61 | 62 | A1: See the vocoder parts of HiFiSinger, DiffSinger or SingGAN; this is common practice now. 63 | 64 | Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset? 65 | 66 | A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon. 67 | 68 | Q3: Why "'HifiGAN' object has no attribute 'model'"? 69 | 70 | A3: Please put the pretrained vocoders in your `checkpoints` directory. 71 | 72 | Q4: How can I check whether GT information or predicted information is used during inference from the packed test set? 73 | 74 | A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343). 75 | 76 | ... -------------------------------------------------------------------------------- /docs/README-TTS-pndm.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=TTSDemo)](https://huggingface.co/spaces/NATSpeech/DiffSpeech) 6 | 7 | ## DiffSpeech (TTS) 8 | ### 1. Preparation 9 | 10 | #### Data Preparation 11 | a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/` 12 | 13 | b) Download and unzip the [ground-truth duration](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/` 14 | 15 | c) Run the following script to pack the dataset for training/inference. 16 | 17 | ```sh 18 | export PYTHONPATH=. 19 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml 20 | 21 | # `data/binary/ljspeech` will be generated. 22 | ``` 23 | 24 | #### Vocoder Preparation 25 | We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder. 26 | Please unzip this file into `checkpoints` before training your acoustic model. 27 | 28 | ### 2.
Training Example 29 | 30 | ```sh 31 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_pndm.yaml --exp_name ds_pndm_lj_1 --reset 32 | ``` 33 | 34 | ### 3. Inference Example 35 | 36 | ```sh 37 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_pndm.yaml --exp_name ds_pndm_lj_1 --reset --infer 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/README-TTS.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=TTSDemo)](https://huggingface.co/spaces/NATSpeech/DiffSpeech) 6 | 7 | ## DiffSpeech (TTS) 8 | ### 1. Preparation 9 | 10 | #### Data Preparation 11 | a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/` 12 | 13 | b) Download and unzip the [ground-truth duration](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/` 14 | 15 | c) Run the following script to pack the dataset for training/inference. 16 | 17 | ```sh 18 | export PYTHONPATH=. 19 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml 20 | 21 | # `data/binary/ljspeech` will be generated. 22 | ``` 23 | 24 | #### Vocoder Preparation 25 | We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder. 26 | Please unzip this file into `checkpoints` before training your acoustic model. 27 | 28 | ### 2. Training Example 29 | 30 | First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running: 31 | ```sh 32 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset 33 | ``` 34 | Then, to train DiffSpeech, run: 35 | ```sh 36 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset 37 | ``` 38 | 39 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path. 40 | 41 | ### 3.
Inference Example 42 | 43 | ```sh 44 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer 45 | ``` 46 | 47 | We also provide: 48 | - the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip); 49 | - the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech; 50 | 51 | Remember to put the pre-trained models in the `checkpoints` directory. 52 | 53 | ## Mel Visualization 54 | Along the vertical axis, DiffSpeech occupies mel bins [0-80] and FastSpeech 2 occupies [80-160]. 55 | *(Table of mel-spectrogram comparison figures captioned "DiffSpeech vs. FastSpeech 2", with three DiffSpeech-vs-FastSpeech2 images.)*
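A comparison figure like the ones referenced above can be reproduced by stacking the two 80-bin mel-spectrograms along the frequency axis, matching the [0-80]/[80-160] layout just described. The following is an illustrative sketch only, not part of the repository; the `.npy` file names are placeholders, and it assumes you have saved both predicted mels as arrays of shape [T, 80].

```python
# plot_mel_comparison.py -- illustrative sketch, not part of the DiffSinger codebase
import numpy as np
import matplotlib.pyplot as plt

# Placeholder paths: predicted mel-spectrograms saved as [T, 80] arrays.
mel_diffspeech = np.load("diffspeech_mel.npy")
mel_fs2 = np.load("fastspeech2_mel.npy")

T = min(len(mel_diffspeech), len(mel_fs2))
# Mel bins 0-80: DiffSpeech; mel bins 80-160: FastSpeech 2 (the layout described above).
stacked = np.concatenate([mel_diffspeech[:T], mel_fs2[:T]], axis=1)

plt.figure(figsize=(12, 6))
plt.imshow(stacked.T, origin="lower", aspect="auto", interpolation="none")
plt.axhline(80, color="white", linewidth=0.8)  # boundary between the two models
plt.xlabel("frames")
plt.ylabel("mel bins (DiffSpeech: 0-80, FastSpeech 2: 80-160)")
plt.tight_layout()
plt.savefig("diffspeech_vs_fs2.png")
```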
-------------------------------------------------------------------------------- /inference/svs/ds_cascade.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from inference.tts.fs import FastSpeechInfer 3 | # from modules.tts.fs2_orig import FastSpeech2Orig 4 | from inference.svs.base_svs_infer import BaseSVSInfer 5 | from utils import load_ckpt 6 | from utils.hparams import hparams 7 | from usr.diff.shallow_diffusion_tts import GaussianDiffusion 8 | from usr.diffsinger_task import DIFF_DECODERS 9 | 10 | class DiffSingerCascadeInfer(BaseSVSInfer): 11 | def build_model(self): 12 | model = GaussianDiffusion( 13 | phone_encoder=self.ph_encoder, 14 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 15 | timesteps=hparams['timesteps'], 16 | K_step=hparams['K_step'], 17 | loss_type=hparams['diff_loss_type'], 18 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 19 | ) 20 | model.eval() 21 | load_ckpt(model, hparams['work_dir'], 'model') 22 | return model 23 | 24 | def forward_model(self, inp): 25 | sample = self.input_to_batch(inp) 26 | txt_tokens = sample['txt_tokens'] # [B, T_t] 27 | spk_id = sample.get('spk_ids') 28 | with torch.no_grad(): 29 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 30 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 31 | is_slur=sample['is_slur']) 32 | mel_out = output['mel_out'] # [B, T,80] 33 | f0_pred = output['f0_denorm'] 34 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 35 | wav_out = wav_out.cpu().numpy() 36 | return wav_out[0] 37 | 38 | 39 | if __name__ == '__main__': 40 | inp = { 41 | 'text': '小酒窝长睫毛AP是你最美的记号', 42 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 43 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 44 | 'input_type': 'word' 45 | } # user input: Chinese characters 46 | c = { 47 | 'text': '小酒窝长睫毛AP是你最美的记号', 48 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 49 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 50 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 51 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 52 | 'input_type': 'phoneme' 53 | } # input like Opencpop dataset. 
54 | DiffSingerCascadeInfer.example_run(inp) 55 | -------------------------------------------------------------------------------- /inference/svs/ds_e2e.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from inference.tts.fs import FastSpeechInfer 3 | # from modules.tts.fs2_orig import FastSpeech2Orig 4 | from inference.svs.base_svs_infer import BaseSVSInfer 5 | from utils import load_ckpt 6 | from utils.hparams import hparams 7 | from usr.diff.shallow_diffusion_tts import GaussianDiffusion 8 | from usr.diffsinger_task import DIFF_DECODERS 9 | from modules.fastspeech.pe import PitchExtractor 10 | import utils 11 | 12 | 13 | class DiffSingerE2EInfer(BaseSVSInfer): 14 | def build_model(self): 15 | model = GaussianDiffusion( 16 | phone_encoder=self.ph_encoder, 17 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 18 | timesteps=hparams['timesteps'], 19 | K_step=hparams['K_step'], 20 | loss_type=hparams['diff_loss_type'], 21 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 22 | ) 23 | model.eval() 24 | load_ckpt(model, hparams['work_dir'], 'model') 25 | 26 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 27 | self.pe = PitchExtractor().to(self.device) 28 | utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True) 29 | self.pe.eval() 30 | return model 31 | 32 | def forward_model(self, inp): 33 | sample = self.input_to_batch(inp) 34 | txt_tokens = sample['txt_tokens'] # [B, T_t] 35 | spk_id = sample.get('spk_ids') 36 | with torch.no_grad(): 37 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 38 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 39 | is_slur=sample['is_slur']) 40 | mel_out = output['mel_out'] # [B, T,80] 41 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 42 | f0_pred = self.pe(mel_out)['f0_denorm_pred'] # pe predict from Pred mel 43 | else: 44 | f0_pred = output['f0_denorm'] 45 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 46 | wav_out = wav_out.cpu().numpy() 47 | return wav_out[0] 48 | 49 | if __name__ == '__main__': 50 | inp = { 51 | 'text': '小酒窝长睫毛AP是你最美的记号', 52 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 53 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 54 | 'input_type': 'word' 55 | } # user input: Chinese characters 56 | c = { 57 | 'text': '小酒窝长睫毛AP是你最美的记号', 58 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 59 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 60 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 61 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 62 | 'input_type': 'phoneme' 63 | } # input like Opencpop dataset. 
64 | DiffSingerE2EInfer.example_run(inp) 65 | 66 | 67 | # python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/gradio/gradio_settings.yaml: -------------------------------------------------------------------------------- 1 | title: 'DiffSinger' 2 | description: | 3 | This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。) 4 | 5 | Please assign pitch and duration values to each Chinese character. The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。) 6 | 7 | You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。) 8 | 9 | Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。) 10 | 11 | article: | 12 | Link to Github REPO 13 | example_inputs: 14 | - |- 15 | 你 说 你 不 SP 懂 为 何 在 这 时 牵 手 APD#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590 16 | - |- 17 | 小酒窝长睫毛AP是你最美的记号C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db40.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340 18 | - |- 19 | 我真的SP爱你SP句句不轻易D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A40.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9 20 | - |- 21 | 好冷啊 AP 我在东北玩泥巴F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D40.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4 22 | 23 | #inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer 24 | #exp_name: 0303_opencpop_ds58_midi 25 | 26 | inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer 27 | exp_name: 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/gradio/infer.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import re 3 | 4 | import gradio as gr 5 | import yaml 6 | from gradio.inputs import Textbox 7 | 8 | from inference.svs.base_svs_infer import BaseSVSInfer 9 | from utils.hparams import set_hparams 10 | from utils.hparams import hparams as hp 11 | import numpy as np 12 | 13 | 14 | class GradioInfer: 15 | def __init__(self, exp_name, inference_cls, title, description, article, example_inputs): 16 | self.exp_name = exp_name 17 | self.title = title 18 | self.description = description 19 | self.article = article 20 | self.example_inputs = example_inputs 21 | pkg = ".".join(inference_cls.split(".")[:-1]) 22 | cls_name = inference_cls.split(".")[-1] 23 | self.inference_cls = getattr(importlib.import_module(pkg), cls_name) 24 | 25 | def greet(self, text, notes, notes_duration): 26 | PUNCS = '。?;:' 27 | sents = re.split(rf'([{PUNCS}])', text.replace('\n', ',')) 28 | sents_notes = re.split(rf'([{PUNCS}])', 
notes.replace('\n', ',')) 29 | sents_notes_dur = re.split(rf'([{PUNCS}])', notes_duration.replace('\n', ',')) 30 | 31 | if sents[-1] not in list(PUNCS): 32 | sents = sents + [''] 33 | sents_notes = sents_notes + [''] 34 | sents_notes_dur = sents_notes_dur + [''] 35 | 36 | audio_outs = [] 37 | s, n, n_dur = "", "", "" 38 | for i in range(0, len(sents), 2): 39 | if len(sents[i]) > 0: 40 | s += sents[i] + sents[i + 1] 41 | n += sents_notes[i] + sents_notes[i+1] 42 | n_dur += sents_notes_dur[i] + sents_notes_dur[i+1] 43 | if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0): 44 | audio_out = self.infer_ins.infer_once({ 45 | 'text': s, 46 | 'notes': n, 47 | 'notes_duration': n_dur, 48 | }) 49 | audio_out = audio_out * 32767 50 | audio_out = audio_out.astype(np.int16) 51 | audio_outs.append(audio_out) 52 | audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16)) 53 | s = "" 54 | n = "" 55 | audio_outs = np.concatenate(audio_outs) 56 | return hp['audio_sample_rate'], audio_outs 57 | 58 | def run(self): 59 | set_hparams(exp_name=self.exp_name, print_hparams=False) 60 | infer_cls = self.inference_cls 61 | self.infer_ins: BaseSVSInfer = infer_cls(hp) 62 | example_inputs = self.example_inputs 63 | for i in range(len(example_inputs)): 64 | text, notes, notes_dur = example_inputs[i].split('') 65 | example_inputs[i] = [text, notes, notes_dur] 66 | 67 | iface = gr.Interface(fn=self.greet, 68 | inputs=[ 69 | Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"), 70 | Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"), 71 | Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration")] 72 | , 73 | outputs="audio", 74 | allow_flagging="never", 75 | title=self.title, 76 | description=self.description, 77 | article=self.article, 78 | examples=example_inputs, 79 | enable_queue=True) 80 | iface.launch(share=True,)# cache_examples=True) 81 | 82 | 83 | if __name__ == '__main__': 84 | gradio_config = yaml.safe_load(open('inference/svs/gradio/gradio_settings.yaml')) 85 | g = GradioInfer(**gradio_config) 86 | g.run() 87 | 88 | 89 | # python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi 90 | # python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi 91 | # CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/opencpop/map.py: -------------------------------------------------------------------------------- 1 | def cpop_pinyin2ph_func(): 2 | # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table" 3 | pinyin2phs = {'AP': 'AP', 'SP': 'SP'} 4 | with open('inference/svs/opencpop/cpop_pinyin2ph.txt') as rf: 5 | for line in rf.readlines(): 6 | elements = [x.strip() for x in line.split('|') if x.strip() != ''] 7 | pinyin2phs[elements[0]] = elements[1] 8 | return pinyin2phs -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/modules/__init__.py 
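As a usage note for `cpop_pinyin2ph_func` in `inference/svs/opencpop/map.py` above: every non-empty line of `cpop_pinyin2ph.txt` holds a pinyin syllable and its phoneme string separated by `|`, and the special `AP`/`SP` tokens map to themselves. The sketch below is illustrative only; it assumes it is run from the repository root (the path inside `map.py` is relative), and the sample syllables are placeholders whose presence depends on the mapping file.

```python
# Illustrative usage of the Opencpop pinyin-to-phoneme table; run from the repository root.
from inference.svs.opencpop.map import cpop_pinyin2ph_func

pinyin2phs = cpop_pinyin2ph_func()

# Placeholder syllables; anything missing from the table falls back to '?'.
example = ['xiao', 'jiu', 'wo', 'AP']
phs = [pinyin2phs.get(p, '?') for p in example]
print(list(zip(example, phs)))
```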
-------------------------------------------------------------------------------- /modules/commons/espnet_positional_embedding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | class PositionalEncoding(torch.nn.Module): 6 | """Positional encoding. 7 | Args: 8 | d_model (int): Embedding dimension. 9 | dropout_rate (float): Dropout rate. 10 | max_len (int): Maximum input length. 11 | reverse (bool): Whether to reverse the input position. 12 | """ 13 | 14 | def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): 15 | """Construct an PositionalEncoding object.""" 16 | super(PositionalEncoding, self).__init__() 17 | self.d_model = d_model 18 | self.reverse = reverse 19 | self.xscale = math.sqrt(self.d_model) 20 | self.dropout = torch.nn.Dropout(p=dropout_rate) 21 | self.pe = None 22 | self.extend_pe(torch.tensor(0.0).expand(1, max_len)) 23 | 24 | def extend_pe(self, x): 25 | """Reset the positional encodings.""" 26 | if self.pe is not None: 27 | if self.pe.size(1) >= x.size(1): 28 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 29 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 30 | return 31 | pe = torch.zeros(x.size(1), self.d_model) 32 | if self.reverse: 33 | position = torch.arange( 34 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 35 | ).unsqueeze(1) 36 | else: 37 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 38 | div_term = torch.exp( 39 | torch.arange(0, self.d_model, 2, dtype=torch.float32) 40 | * -(math.log(10000.0) / self.d_model) 41 | ) 42 | pe[:, 0::2] = torch.sin(position * div_term) 43 | pe[:, 1::2] = torch.cos(position * div_term) 44 | pe = pe.unsqueeze(0) 45 | self.pe = pe.to(device=x.device, dtype=x.dtype) 46 | 47 | def forward(self, x: torch.Tensor): 48 | """Add positional encoding. 49 | Args: 50 | x (torch.Tensor): Input tensor (batch, time, `*`). 51 | Returns: 52 | torch.Tensor: Encoded tensor (batch, time, `*`). 53 | """ 54 | self.extend_pe(x) 55 | x = x * self.xscale + self.pe[:, : x.size(1)] 56 | return self.dropout(x) 57 | 58 | 59 | class ScaledPositionalEncoding(PositionalEncoding): 60 | """Scaled positional encoding module. 61 | See Sec. 3.2 https://arxiv.org/abs/1809.08895 62 | Args: 63 | d_model (int): Embedding dimension. 64 | dropout_rate (float): Dropout rate. 65 | max_len (int): Maximum input length. 66 | """ 67 | 68 | def __init__(self, d_model, dropout_rate, max_len=5000): 69 | """Initialize class.""" 70 | super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len) 71 | self.alpha = torch.nn.Parameter(torch.tensor(1.0)) 72 | 73 | def reset_parameters(self): 74 | """Reset parameters.""" 75 | self.alpha.data = torch.tensor(1.0) 76 | 77 | def forward(self, x): 78 | """Add positional encoding. 79 | Args: 80 | x (torch.Tensor): Input tensor (batch, time, `*`). 81 | Returns: 82 | torch.Tensor: Encoded tensor (batch, time, `*`). 83 | """ 84 | self.extend_pe(x) 85 | x = x + self.alpha * self.pe[:, : x.size(1)] 86 | return self.dropout(x) 87 | 88 | 89 | class RelPositionalEncoding(PositionalEncoding): 90 | """Relative positional encoding module. 91 | See : Appendix B in https://arxiv.org/abs/1901.02860 92 | Args: 93 | d_model (int): Embedding dimension. 94 | dropout_rate (float): Dropout rate. 95 | max_len (int): Maximum input length. 
96 | """ 97 | 98 | def __init__(self, d_model, dropout_rate, max_len=5000): 99 | """Initialize class.""" 100 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 101 | 102 | def forward(self, x): 103 | """Compute positional encoding. 104 | Args: 105 | x (torch.Tensor): Input tensor (batch, time, `*`). 106 | Returns: 107 | torch.Tensor: Encoded tensor (batch, time, `*`). 108 | torch.Tensor: Positional embedding tensor (1, time, `*`). 109 | """ 110 | self.extend_pe(x) 111 | x = x * self.xscale 112 | pos_emb = self.pe[:, : x.size(1)] 113 | return self.dropout(x) + self.dropout(pos_emb) -------------------------------------------------------------------------------- /modules/diffsinger_midi/fs2.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | from modules.commons.common_layers import Embedding 3 | from modules.fastspeech.tts_modules import FastspeechDecoder, DurationPredictor, LengthRegulator, PitchPredictor, \ 4 | EnergyPredictor, FastspeechEncoder 5 | from utils.cwt import cwt2f0 6 | from utils.hparams import hparams 7 | from utils.pitch_utils import f0_to_coarse, denorm_f0, norm_f0 8 | from modules.fastspeech.fs2 import FastSpeech2 9 | 10 | 11 | class FastspeechMIDIEncoder(FastspeechEncoder): 12 | def forward_embedding(self, txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding): 13 | # embed tokens and positions 14 | x = self.embed_scale * self.embed_tokens(txt_tokens) 15 | x = x + midi_embedding + midi_dur_embedding + slur_embedding 16 | if hparams['use_pos_embed']: 17 | if hparams.get('rel_pos') is not None and hparams['rel_pos']: 18 | x = self.embed_positions(x) 19 | else: 20 | positions = self.embed_positions(txt_tokens) 21 | x = x + positions 22 | x = F.dropout(x, p=self.dropout, training=self.training) 23 | return x 24 | 25 | def forward(self, txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding): 26 | """ 27 | 28 | :param txt_tokens: [B, T] 29 | :return: { 30 | 'encoder_out': [T x B x C] 31 | } 32 | """ 33 | encoder_padding_mask = txt_tokens.eq(self.padding_idx).data 34 | x = self.forward_embedding(txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding) # [B, T, H] 35 | x = super(FastspeechEncoder, self).forward(x, encoder_padding_mask) 36 | return x 37 | 38 | 39 | FS_ENCODERS = { 40 | 'fft': lambda hp, embed_tokens, d: FastspeechMIDIEncoder( 41 | embed_tokens, hp['hidden_size'], hp['enc_layers'], hp['enc_ffn_kernel_size'], 42 | num_heads=hp['num_heads']), 43 | } 44 | 45 | 46 | class FastSpeech2MIDI(FastSpeech2): 47 | def __init__(self, dictionary, out_dims=None): 48 | super().__init__(dictionary, out_dims) 49 | del self.encoder 50 | self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams, self.encoder_embed_tokens, self.dictionary) 51 | self.midi_embed = Embedding(300, self.hidden_size, self.padding_idx) 52 | self.midi_dur_layer = Linear(1, self.hidden_size) 53 | self.is_slur_embed = Embedding(2, self.hidden_size) 54 | 55 | def forward(self, txt_tokens, mel2ph=None, spk_embed=None, 56 | ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=False, 57 | spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs): 58 | ret = {} 59 | 60 | midi_embedding = self.midi_embed(kwargs['pitch_midi']) 61 | midi_dur_embedding, slur_embedding = 0, 0 62 | if kwargs.get('midi_dur') is not None: 63 | midi_dur_embedding = self.midi_dur_layer(kwargs['midi_dur'][:, :, None]) # [B, T, 1] -> [B, T, H] 64 | if kwargs.get('is_slur') is not None: 65 | slur_embedding 
= self.is_slur_embed(kwargs['is_slur']) 66 | encoder_out = self.encoder(txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding) # [B, T, C] 67 | src_nonpadding = (txt_tokens > 0).float()[:, :, None] 68 | 69 | # add ref style embed 70 | # Not implemented 71 | # variance encoder 72 | var_embed = 0 73 | 74 | # encoder_out_dur denotes encoder outputs for duration predictor 75 | # in speech adaptation, duration predictor use old speaker embedding 76 | if hparams['use_spk_embed']: 77 | spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :] 78 | elif hparams['use_spk_id']: 79 | spk_embed_id = spk_embed 80 | if spk_embed_dur_id is None: 81 | spk_embed_dur_id = spk_embed_id 82 | if spk_embed_f0_id is None: 83 | spk_embed_f0_id = spk_embed_id 84 | spk_embed = self.spk_embed_proj(spk_embed_id)[:, None, :] 85 | spk_embed_dur = spk_embed_f0 = spk_embed 86 | if hparams['use_split_spk_id']: 87 | spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :] 88 | spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :] 89 | else: 90 | spk_embed_dur = spk_embed_f0 = spk_embed = 0 91 | 92 | # add dur 93 | dur_inp = (encoder_out + var_embed + spk_embed_dur) * src_nonpadding 94 | 95 | mel2ph = self.add_dur(dur_inp, mel2ph, txt_tokens, ret) 96 | 97 | decoder_inp = F.pad(encoder_out, [0, 0, 1, 0]) 98 | 99 | mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) 100 | decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H] 101 | 102 | tgt_nonpadding = (mel2ph > 0).float()[:, :, None] 103 | 104 | # add pitch and energy embed 105 | pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding 106 | if hparams['use_pitch_embed']: 107 | pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding 108 | decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph) 109 | if hparams['use_energy_embed']: 110 | decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret) 111 | 112 | ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding 113 | 114 | if skip_decoder: 115 | return ret 116 | ret['mel_out'] = self.run_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs) 117 | 118 | return ret 119 | -------------------------------------------------------------------------------- /modules/fastspeech/pe.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | from utils.hparams import hparams 3 | from modules.fastspeech.tts_modules import PitchPredictor 4 | from utils.pitch_utils import denorm_f0 5 | 6 | 7 | class Prenet(nn.Module): 8 | def __init__(self, in_dim=80, out_dim=256, kernel=5, n_layers=3, strides=None): 9 | super(Prenet, self).__init__() 10 | padding = kernel // 2 11 | self.layers = [] 12 | self.strides = strides if strides is not None else [1] * n_layers 13 | for l in range(n_layers): 14 | self.layers.append(nn.Sequential( 15 | nn.Conv1d(in_dim, out_dim, kernel_size=kernel, padding=padding, stride=self.strides[l]), 16 | nn.ReLU(), 17 | nn.BatchNorm1d(out_dim) 18 | )) 19 | in_dim = out_dim 20 | self.layers = nn.ModuleList(self.layers) 21 | self.out_proj = nn.Linear(out_dim, out_dim) 22 | 23 | def forward(self, x): 24 | """ 25 | 26 | :param x: [B, T, 80] 27 | :return: [L, B, T, H], [B, T, H] 28 | """ 29 | padding_mask = x.abs().sum(-1).eq(0).data # [B, T] 30 | nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T] 31 | x = 
x.transpose(1, 2) 32 | hiddens = [] 33 | for i, l in enumerate(self.layers): 34 | nonpadding_mask_TB = nonpadding_mask_TB[:, :, ::self.strides[i]] 35 | x = l(x) * nonpadding_mask_TB 36 | hiddens.append(x) 37 | hiddens = torch.stack(hiddens, 0) # [L, B, H, T] 38 | hiddens = hiddens.transpose(2, 3) # [L, B, T, H] 39 | x = self.out_proj(x.transpose(1, 2)) # [B, T, H] 40 | x = x * nonpadding_mask_TB.transpose(1, 2) 41 | return hiddens, x 42 | 43 | 44 | class ConvBlock(nn.Module): 45 | def __init__(self, idim=80, n_chans=256, kernel_size=3, stride=1, norm='gn', dropout=0): 46 | super().__init__() 47 | self.conv = ConvNorm(idim, n_chans, kernel_size, stride=stride) 48 | self.norm = norm 49 | if self.norm == 'bn': 50 | self.norm = nn.BatchNorm1d(n_chans) 51 | elif self.norm == 'in': 52 | self.norm = nn.InstanceNorm1d(n_chans, affine=True) 53 | elif self.norm == 'gn': 54 | self.norm = nn.GroupNorm(n_chans // 16, n_chans) 55 | elif self.norm == 'ln': 56 | self.norm = LayerNorm(n_chans // 16, n_chans) 57 | elif self.norm == 'wn': 58 | self.conv = torch.nn.utils.weight_norm(self.conv.conv) 59 | self.dropout = nn.Dropout(dropout) 60 | self.relu = nn.ReLU() 61 | 62 | def forward(self, x): 63 | """ 64 | 65 | :param x: [B, C, T] 66 | :return: [B, C, T] 67 | """ 68 | x = self.conv(x) 69 | if not isinstance(self.norm, str): 70 | if self.norm == 'none': 71 | pass 72 | elif self.norm == 'ln': 73 | x = self.norm(x.transpose(1, 2)).transpose(1, 2) 74 | else: 75 | x = self.norm(x) 76 | x = self.relu(x) 77 | x = self.dropout(x) 78 | return x 79 | 80 | 81 | class ConvStacks(nn.Module): 82 | def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_size=5, norm='gn', 83 | dropout=0, strides=None, res=True): 84 | super().__init__() 85 | self.conv = torch.nn.ModuleList() 86 | self.kernel_size = kernel_size 87 | self.res = res 88 | self.in_proj = Linear(idim, n_chans) 89 | if strides is None: 90 | strides = [1] * n_layers 91 | else: 92 | assert len(strides) == n_layers 93 | for idx in range(n_layers): 94 | self.conv.append(ConvBlock( 95 | n_chans, n_chans, kernel_size, stride=strides[idx], norm=norm, dropout=dropout)) 96 | self.out_proj = Linear(n_chans, odim) 97 | 98 | def forward(self, x, return_hiddens=False): 99 | """ 100 | 101 | :param x: [B, T, H] 102 | :return: [B, T, H] 103 | """ 104 | x = self.in_proj(x) 105 | x = x.transpose(1, -1) # (B, idim, Tmax) 106 | hiddens = [] 107 | for f in self.conv: 108 | x_ = f(x) 109 | x = x + x_ if self.res else x_ # (B, C, Tmax) 110 | hiddens.append(x) 111 | x = x.transpose(1, -1) 112 | x = self.out_proj(x) # (B, Tmax, H) 113 | if return_hiddens: 114 | hiddens = torch.stack(hiddens, 1) # [B, L, C, T] 115 | return x, hiddens 116 | return x 117 | 118 | 119 | class PitchExtractor(nn.Module): 120 | def __init__(self, n_mel_bins=80, conv_layers=2): 121 | super().__init__() 122 | self.hidden_size = hparams['hidden_size'] 123 | self.predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size 124 | self.conv_layers = conv_layers 125 | 126 | self.mel_prenet = Prenet(n_mel_bins, self.hidden_size, strides=[1, 1, 1]) 127 | if self.conv_layers > 0: 128 | self.mel_encoder = ConvStacks( 129 | idim=self.hidden_size, n_chans=self.hidden_size, odim=self.hidden_size, n_layers=self.conv_layers) 130 | self.pitch_predictor = PitchPredictor( 131 | self.hidden_size, n_chans=self.predictor_hidden, 132 | n_layers=5, dropout_rate=0.1, odim=2, 133 | padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel']) 134 | 135 | def 
forward(self, mel_input=None): 136 | ret = {} 137 | mel_hidden = self.mel_prenet(mel_input)[1] 138 | if self.conv_layers > 0: 139 | mel_hidden = self.mel_encoder(mel_hidden) 140 | 141 | ret['pitch_pred'] = pitch_pred = self.pitch_predictor(mel_hidden) 142 | 143 | pitch_padding = mel_input.abs().sum(-1) == 0 144 | use_uv = hparams['pitch_type'] == 'frame' and hparams['use_uv'] 145 | 146 | ret['f0_denorm_pred'] = denorm_f0( 147 | pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None, 148 | hparams, pitch_padding=pitch_padding) 149 | return ret -------------------------------------------------------------------------------- /modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 
60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 
51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | Args: 65 | subbands (int): The number of subbands. 66 | taps (int): The number of filter taps. 67 | cutoff_ratio (float): Cut-off frequency ratio. 68 | beta (float): Beta coefficient for kaiser window. 
69 | 70 | """ 71 | super(PQMF, self).__init__() 72 | 73 | # define filter coefficient 74 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 75 | h_analysis = np.zeros((subbands, len(h_proto))) 76 | h_synthesis = np.zeros((subbands, len(h_proto))) 77 | for k in range(subbands): 78 | h_analysis[k] = 2 * h_proto * np.cos( 79 | (2 * k + 1) * (np.pi / (2 * subbands)) * 80 | (np.arange(taps + 1) - ((taps - 1) / 2)) + 81 | (-1) ** k * np.pi / 4) 82 | h_synthesis[k] = 2 * h_proto * np.cos( 83 | (2 * k + 1) * (np.pi / (2 * subbands)) * 84 | (np.arange(taps + 1) - ((taps - 1) / 2)) - 85 | (-1) ** k * np.pi / 4) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 90 | 91 | # register coefficients as beffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | 108 | Args: 109 | x (Tensor): Input tensor (B, 1, T). 110 | 111 | Returns: 112 | Tensor: Output tensor (B, subbands, T // subbands). 113 | 114 | """ 115 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 116 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 117 | 118 | def synthesis(self, x): 119 | """Synthesis with PQMF. 120 | 121 | Args: 122 | x (Tensor): Input tensor (B, subbands, T // subbands). 123 | 124 | Returns: 125 | Tensor: Output tensor (B, 1, T). 126 | 127 | """ 128 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 129 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/tf_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 MINH ANH (@dathudeptrai) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Tensorflow Layer modules complatible with pytorch.""" 7 | 8 | import tensorflow as tf 9 | 10 | 11 | class TFReflectionPad1d(tf.keras.layers.Layer): 12 | """Tensorflow ReflectionPad1d module.""" 13 | 14 | def __init__(self, padding_size): 15 | """Initialize TFReflectionPad1d module. 16 | 17 | Args: 18 | padding_size (int): Padding size. 19 | 20 | """ 21 | super(TFReflectionPad1d, self).__init__() 22 | self.padding_size = padding_size 23 | 24 | @tf.function 25 | def call(self, x): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | x (Tensor): Input tensor (B, T, 1, C). 30 | 31 | Returns: 32 | Tensor: Padded tensor (B, T + 2 * padding_size, 1, C). 33 | 34 | """ 35 | return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT") 36 | 37 | 38 | class TFConvTranspose1d(tf.keras.layers.Layer): 39 | """Tensorflow ConvTranspose1d module.""" 40 | 41 | def __init__(self, channels, kernel_size, stride, padding): 42 | """Initialize TFConvTranspose1d( module. 43 | 44 | Args: 45 | channels (int): Number of channels. 46 | kernel_size (int): kernel size. 47 | strides (int): Stride width. 48 | padding (str): Padding type ("same" or "valid"). 49 | 50 | """ 51 | super(TFConvTranspose1d, self).__init__() 52 | self.conv1d_transpose = tf.keras.layers.Conv2DTranspose( 53 | filters=channels, 54 | kernel_size=(kernel_size, 1), 55 | strides=(stride, 1), 56 | padding=padding, 57 | ) 58 | 59 | @tf.function 60 | def call(self, x): 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | x (Tensor): Input tensor (B, T, 1, C). 65 | 66 | Returns: 67 | Tensors: Output tensor (B, T', 1, C'). 68 | 69 | """ 70 | x = self.conv1d_transpose(x) 71 | return x 72 | 73 | 74 | class TFResidualStack(tf.keras.layers.Layer): 75 | """Tensorflow ResidualStack module.""" 76 | 77 | def __init__(self, 78 | kernel_size, 79 | channels, 80 | dilation, 81 | bias, 82 | nonlinear_activation, 83 | nonlinear_activation_params, 84 | padding, 85 | ): 86 | """Initialize TFResidualStack module. 87 | 88 | Args: 89 | kernel_size (int): Kernel size. 90 | channles (int): Number of channels. 91 | dilation (int): Dilation ine. 92 | bias (bool): Whether to add bias parameter in convolution layers. 93 | nonlinear_activation (str): Activation function module name. 94 | nonlinear_activation_params (dict): Hyperparameters for activation function. 95 | padding (str): Padding type ("same" or "valid"). 
96 | 97 | """ 98 | super(TFResidualStack, self).__init__() 99 | self.block = [ 100 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 101 | TFReflectionPad1d(dilation), 102 | tf.keras.layers.Conv2D( 103 | filters=channels, 104 | kernel_size=(kernel_size, 1), 105 | dilation_rate=(dilation, 1), 106 | use_bias=bias, 107 | padding="valid", 108 | ), 109 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 110 | tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 111 | ] 112 | self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 113 | 114 | @tf.function 115 | def call(self, x): 116 | """Calculate forward propagation. 117 | 118 | Args: 119 | x (Tensor): Input tensor (B, T, 1, C). 120 | 121 | Returns: 122 | Tensor: Output tensor (B, T, 1, C). 123 | 124 | """ 125 | _x = tf.identity(x) 126 | for i, layer in enumerate(self.block): 127 | _x = layer(_x) 128 | shortcut = self.shortcut(x) 129 | return shortcut + _x 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergengeLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergengeLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 
67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | self.window = getattr(torch, window)(win_length) 86 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 87 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 88 | 89 | def forward(self, x, y): 90 | """Calculate forward propagation. 91 | 92 | Args: 93 | x (Tensor): Predicted signal (B, T). 94 | y (Tensor): Groundtruth signal (B, T). 95 | 96 | Returns: 97 | Tensor: Spectral convergence loss value. 98 | Tensor: Log STFT magnitude loss value. 99 | 100 | """ 101 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 102 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 103 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 104 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 105 | 106 | return sc_loss, mag_loss 107 | 108 | 109 | class MultiResolutionSTFTLoss(torch.nn.Module): 110 | """Multi resolution STFT loss module.""" 111 | 112 | def __init__(self, 113 | fft_sizes=[1024, 2048, 512], 114 | hop_sizes=[120, 240, 50], 115 | win_lengths=[600, 1200, 240], 116 | window="hann_window"): 117 | """Initialize Multi resolution STFT loss module. 118 | 119 | Args: 120 | fft_sizes (list): List of FFT sizes. 121 | hop_sizes (list): List of hop sizes. 122 | win_lengths (list): List of window lengths. 123 | window (str): Window function type. 124 | 125 | """ 126 | super(MultiResolutionSTFTLoss, self).__init__() 127 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 128 | self.stft_losses = torch.nn.ModuleList() 129 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 130 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 131 | 132 | def forward(self, x, y): 133 | """Calculate forward propagation. 134 | 135 | Args: 136 | x (Tensor): Predicted signal (B, T). 137 | y (Tensor): Groundtruth signal (B, T). 138 | 139 | Returns: 140 | Tensor: Multi resolution spectral convergence loss value. 141 | Tensor: Multi resolution log STFT magnitude loss value. 
142 | 143 |         """ 144 |         sc_loss = 0.0 145 |         mag_loss = 0.0 146 |         for f in self.stft_losses: 147 |             sc_l, mag_l = f(x, y) 148 |             sc_loss += sc_l 149 |             mag_loss += mag_l 150 |         sc_loss /= len(self.stft_losses) 151 |         mag_loss /= len(self.stft_losses) 152 | 153 |         return sc_loss, mag_loss 154 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import *  # NOQA 2 | from .parallel_wavegan import *  # NOQA 3 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import *  # NOQA 2 | from .radam import *  # NOQA 3 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 4 | 5 | This code is derived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 |     """Rectified Adam optimizer.""" 16 | 17 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 |         """Initialize RAdam optimizer.""" 19 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 |         self.buffer = [[None, None, None] for ind in range(10)] 21 |         super(RAdam, self).__init__(params, defaults) 22 | 23 |     def __setstate__(self, state): 24 |         """Set state.""" 25 |         super(RAdam, self).__setstate__(state) 26 | 27 |     def step(self, closure=None): 28 |         """Run one step.""" 29 |         loss = None 30 |         if closure is not None: 31 |             loss = closure() 32 | 33 |         for group in self.param_groups: 34 | 35 |             for p in group['params']: 36 |                 if p.grad is None: 37 |                     continue 38 |                 grad = p.grad.data.float() 39 |                 if grad.is_sparse: 40 |                     raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 |                 p_data_fp32 = p.data.float() 43 | 44 |                 state = self.state[p] 45 | 46 |                 if len(state) == 0: 47 |                     state['step'] = 0 48 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 |                 else: 51 |                     state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 |                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 |                 beta1, beta2 = group['betas'] 56 | 57 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 58 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad) 59 | 60 |                 state['step'] += 1 61 |                 buffered = self.buffer[int(state['step'] % 10)] 62 |                 if state['step'] == buffered[0]: 63 |                     N_sma, step_size = buffered[1], buffered[2] 64 |                 else: 65 |                     buffered[0] = state['step'] 66 |                     beta2_t = beta2 ** state['step'] 67 |                     N_sma_max = 2 / (1 - beta2) - 1 68 |                     N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 |                     buffered[1] = N_sma 70 | 71 |                     # more conservative since it's an approximated value 72 |                     if N_sma >= 5: 73 |                         step_size = math.sqrt( 74 |                             (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])  # NOQA 75 |                     else: 76 |                         step_size = 1.0 / (1 - beta1 ** state['step']) 77 |                     buffered[2] = step_size 78 | 79 |                 if group['weight_decay'] != 0: 80 | 
p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | import librosa 8 | import torch 9 | 10 | from modules.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft 11 | 12 | 13 | class STFTLoss(torch.nn.Module): 14 | """STFT loss module.""" 15 | 16 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", 17 | use_mel_loss=False): 18 | """Initialize STFT loss module.""" 19 | super(STFTLoss, self).__init__() 20 | self.fft_size = fft_size 21 | self.shift_size = shift_size 22 | self.win_length = win_length 23 | self.window = getattr(torch, window)(win_length) 24 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 25 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 26 | self.use_mel_loss = use_mel_loss 27 | self.mel_basis = None 28 | 29 | def forward(self, x, y): 30 | """Calculate forward propagation. 31 | 32 | Args: 33 | x (Tensor): Predicted signal (B, T). 34 | y (Tensor): Groundtruth signal (B, T). 35 | 36 | Returns: 37 | Tensor: Spectral convergence loss value. 38 | Tensor: Log STFT magnitude loss value. 39 | 40 | """ 41 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 42 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 43 | if self.use_mel_loss: 44 | if self.mel_basis is None: 45 | self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T 46 | x_mag = x_mag @ self.mel_basis 47 | y_mag = y_mag @ self.mel_basis 48 | 49 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 50 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 51 | 52 | return sc_loss, mag_loss 53 | 54 | 55 | class MultiResolutionSTFTLoss(torch.nn.Module): 56 | """Multi resolution STFT loss module.""" 57 | 58 | def __init__(self, 59 | fft_sizes=[1024, 2048, 512], 60 | hop_sizes=[120, 240, 50], 61 | win_lengths=[600, 1200, 240], 62 | window="hann_window", 63 | use_mel_loss=False): 64 | """Initialize Multi resolution STFT loss module. 65 | 66 | Args: 67 | fft_sizes (list): List of FFT sizes. 68 | hop_sizes (list): List of hop sizes. 69 | win_lengths (list): List of window lengths. 70 | window (str): Window function type. 71 | 72 | """ 73 | super(MultiResolutionSTFTLoss, self).__init__() 74 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 75 | self.stft_losses = torch.nn.ModuleList() 76 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 77 | self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)] 78 | 79 | def forward(self, x, y): 80 | """Calculate forward propagation. 81 | 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | 86 | Returns: 87 | Tensor: Multi resolution spectral convergence loss value. 88 | Tensor: Multi resolution log STFT magnitude loss value. 
89 | 90 |         """ 91 |         sc_loss = 0.0 92 |         mag_loss = 0.0 93 |         for f in self.stft_losses: 94 |             sc_l, mag_l = f(x, y) 95 |             sc_loss += sc_l 96 |             mag_loss += mag_l 97 |         sc_loss /= len(self.stft_losses) 98 |         mag_loss /= len(self.stft_losses) 99 | 100 |         return sc_loss, mag_loss 101 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import *  # NOQA 2 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | #  MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Utility functions.""" 7 | 8 | import fnmatch 9 | import logging 10 | import os 11 | import sys 12 | 13 | import h5py 14 | import numpy as np 15 | 16 | 17 | def find_files(root_dir, query="*.wav", include_root_dir=True): 18 |     """Find files recursively. 19 | 20 |     Args: 21 |         root_dir (str): Root directory to search. 22 |         query (str): Query to find. 23 |         include_root_dir (bool): If False, the root_dir name is not included in the returned paths. 24 | 25 |     Returns: 26 |         list: List of found filenames. 27 | 28 |     """ 29 |     files = [] 30 |     for root, dirnames, filenames in os.walk(root_dir, followlinks=True): 31 |         for filename in fnmatch.filter(filenames, query): 32 |             files.append(os.path.join(root, filename)) 33 |     if not include_root_dir: 34 |         files = [file_.replace(root_dir + "/", "") for file_ in files] 35 | 36 |     return files 37 | 38 | 39 | def read_hdf5(hdf5_name, hdf5_path): 40 |     """Read hdf5 dataset. 41 | 42 |     Args: 43 |         hdf5_name (str): Filename of hdf5 file. 44 |         hdf5_path (str): Dataset name in hdf5 file. 45 | 46 |     Returns: 47 |         any: Dataset values. 48 | 49 |     """ 50 |     if not os.path.exists(hdf5_name): 51 |         logging.error(f"There is no such hdf5 file ({hdf5_name}).") 52 |         sys.exit(1) 53 | 54 |     hdf5_file = h5py.File(hdf5_name, "r") 55 | 56 |     if hdf5_path not in hdf5_file: 57 |         logging.error(f"There is no such dataset in the hdf5 file ({hdf5_path}).") 58 |         sys.exit(1) 59 | 60 |     hdf5_data = hdf5_file[hdf5_path][()] 61 |     hdf5_file.close() 62 | 63 |     return hdf5_data 64 | 65 | 66 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): 67 |     """Write dataset to hdf5. 68 | 69 |     Args: 70 |         hdf5_name (str): Hdf5 dataset filename. 71 |         hdf5_path (str): Dataset path in hdf5. 72 |         write_data (ndarray): Data to write. 73 |         is_overwrite (bool): Whether to overwrite dataset. 74 | 75 |     """ 76 |     # convert to numpy array 77 |     write_data = np.array(write_data) 78 | 79 |     # check folder existence 80 |     folder_name, _ = os.path.split(hdf5_name) 81 |     if not os.path.exists(folder_name) and len(folder_name) != 0: 82 |         os.makedirs(folder_name) 83 | 84 |     # check hdf5 existence 85 |     if os.path.exists(hdf5_name): 86 |         # if already exists, open with r+ mode 87 |         hdf5_file = h5py.File(hdf5_name, "r+") 88 |         # check dataset existence 89 |         if hdf5_path in hdf5_file: 90 |             if is_overwrite: 91 |                 logging.warning("Dataset in hdf5 file already exists. " 92 |                                 "Recreating dataset in hdf5.") 93 |                 hdf5_file.__delitem__(hdf5_path) 94 |             else: 95 |                 logging.error("Dataset in hdf5 file already exists. 
" 96 | "if you want to overwrite, please set is_overwrite = True.") 97 | hdf5_file.close() 98 | sys.exit(1) 99 | else: 100 | # if not exists, open with w mode 101 | hdf5_file = h5py.File(hdf5_name, "w") 102 | 103 | # write data to hdf5 104 | hdf5_file.create_dataset(hdf5_path, data=write_data) 105 | hdf5_file.flush() 106 | hdf5_file.close() 107 | 108 | 109 | class HDF5ScpLoader(object): 110 | """Loader class for a fests.scp file of hdf5 file. 111 | 112 | Examples: 113 | key1 /some/path/a.h5:feats 114 | key2 /some/path/b.h5:feats 115 | key3 /some/path/c.h5:feats 116 | key4 /some/path/d.h5:feats 117 | ... 118 | >>> loader = HDF5ScpLoader("hdf5.scp") 119 | >>> array = loader["key1"] 120 | 121 | key1 /some/path/a.h5 122 | key2 /some/path/b.h5 123 | key3 /some/path/c.h5 124 | key4 /some/path/d.h5 125 | ... 126 | >>> loader = HDF5ScpLoader("hdf5.scp", "feats") 127 | >>> array = loader["key1"] 128 | 129 | """ 130 | 131 | def __init__(self, feats_scp, default_hdf5_path="feats"): 132 | """Initialize HDF5 scp loader. 133 | 134 | Args: 135 | feats_scp (str): Kaldi-style feats.scp file with hdf5 format. 136 | default_hdf5_path (str): Path in hdf5 file. If the scp contain the info, not used. 137 | 138 | """ 139 | self.default_hdf5_path = default_hdf5_path 140 | with open(feats_scp) as f: 141 | lines = [line.replace("\n", "") for line in f.readlines()] 142 | self.data = {} 143 | for line in lines: 144 | key, value = line.split() 145 | self.data[key] = value 146 | 147 | def get_path(self, key): 148 | """Get hdf5 file path for a given key.""" 149 | return self.data[key] 150 | 151 | def __getitem__(self, key): 152 | """Get ndarray for a given key.""" 153 | p = self.data[key] 154 | if ":" in p: 155 | return read_hdf5(*p.split(":")) 156 | else: 157 | return read_hdf5(p, self.default_hdf5_path) 158 | 159 | def __len__(self): 160 | """Return the length of the scp file.""" 161 | return len(self.data) 162 | 163 | def __iter__(self): 164 | """Return the iterator of the scp file.""" 165 | return iter(self.data) 166 | 167 | def keys(self): 168 | """Return the keys of the scp file.""" 169 | return self.data.keys() 170 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | librosa==0.8.0 3 | tqdm 4 | pandas 5 | numba==0.53.1 6 | numpy==1.19.2 7 | scipy==1.5.4 8 | PyYAML==5.3.1 9 | tensorboardX 10 | pyloudnorm 11 | setuptools>=41.0.0 12 | g2p_en 13 | resemblyzer 14 | webrtcvad 15 | tensorboard==2.6.0 16 | scikit-learn==0.24.1 17 | scikit-image==0.16.2 18 | textgrid 19 | jiwer 20 | pycwt 21 | PyWavelets 22 | praat-parselmouth==0.3.3 23 | jieba 24 | einops 25 | chardet 26 | pretty-midi==0.2.9 27 | pytorch-lightning==0.7.1 28 | h5py==3.1.0 29 | pypinyin==0.39.0 30 | g2pM==0.1.2.5 31 | -------------------------------------------------------------------------------- /requirements_2080.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | alignment==1.0.10 3 | altgraph==0.17 4 | appdirs==1.4.4 5 | async-timeout==3.0.1 6 | audioread==2.1.9 7 | backcall==0.2.0 8 | blinker==1.4 9 | brotlipy==0.7.0 10 | cachetools==4.2.0 11 | certifi==2020.12.5 12 | cffi==1.14.4 13 | chardet==4.0.0 14 | click==7.1.2 15 | cycler==0.10.0 16 | Cython==0.29.21 17 | cytoolz==0.11.0 18 | decorator==4.4.2 19 | Distance==0.1.3 20 | einops==0.3.0 21 | et-xmlfile==1.0.1 22 | fsspec==0.8.4 23 | future==0.18.2 24 | g2p-en==2.1.0 25 | g2pM==0.1.2.5 26 | 
google-auth==1.24.0 27 | google-auth-oauthlib==0.4.2 28 | grpcio==1.34.0 29 | h5py==3.1.0 30 | horology==1.1.0 31 | httplib2==0.18.1 32 | idna==2.10 33 | imageio==2.9.0 34 | inflect==5.0.2 35 | ipdb==0.13.4 36 | ipython==7.19.0 37 | ipython-genutils==0.2.0 38 | jdcal==1.4.1 39 | jedi==0.17.2 40 | jieba==0.42.1 41 | jiwer==2.2.0 42 | joblib==1.0.0 43 | kiwisolver==1.3.1 44 | librosa==0.8.0 45 | llvmlite==0.31.0 46 | Markdown==3.3.3 47 | matplotlib==3.3.3 48 | miditoolkit==0.1.7 49 | mido==1.2.9 50 | music21==5.7.2 51 | networkx==2.5 52 | nltk==3.5 53 | numba==0.48.0 54 | numpy==1.19.4 55 | oauth2client==4.1.3 56 | oauthlib==3.1.0 57 | olefile==0.46 58 | packaging==20.7 59 | pandas==1.2.0 60 | parso==0.7.1 61 | patsy==0.5.1 62 | pexpect==4.8.0 63 | pickleshare==0.7.5 64 | Pillow==8.0.1 65 | pooch==1.3.0 66 | praat-parselmouth==0.3.3 67 | prompt-toolkit==3.0.8 68 | protobuf==3.13.0 69 | ptyprocess==0.6.0 70 | pyasn1==0.4.8 71 | pyasn1-modules==0.2.8 72 | pycparser==2.20 73 | pycwt==0.3.0a22 74 | Pygments==2.7.3 75 | PyInstaller==3.6 76 | PyJWT==1.7.1 77 | pyloudnorm==0.1.0 78 | pyparsing==2.4.7 79 | pypinyin==0.39.0 80 | PySocks==1.7.1 81 | python-dateutil==2.8.1 82 | python-Levenshtein==0.12.0 83 | pytorch-lightning==0.7.1 84 | pytz==2020.5 85 | PyWavelets==1.1.1 86 | pyworld==0.2.12 87 | PyYAML==5.3.1 88 | regex==2020.11.13 89 | requests==2.25.1 90 | requests-oauthlib==1.3.0 91 | resampy==0.2.2 92 | Resemblyzer==0.1.1.dev0 93 | rsa==4.6 94 | scikit-image==0.16.2 95 | scikit-learn==0.22.2.post1 96 | scipy==1.5.4 97 | six==1.15.0 98 | SoundFile==0.10.3.post1 99 | stopit==1.1.1 100 | tensorboard==2.4.0 101 | tensorboard-plugin-wit==1.7.0 102 | tensorboardX==2.1 103 | TextGrid==1.5 104 | threadpoolctl==2.1.0 105 | toolz==0.11.1 106 | torch==1.6.0 107 | torchaudio==0.6.0 108 | torchvision==0.7.0 109 | tqdm==4.54.1 110 | traitlets==5.0.5 111 | typing==3.7.4.3 112 | urllib3==1.26.2 113 | uuid==1.30 114 | wcwidth==0.2.5 115 | webencodings==0.5.1 116 | webrtcvad==2.0.10 117 | Werkzeug==1.0.1 118 | pretty-midi==0.2.9 119 | -------------------------------------------------------------------------------- /requirements_3090.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.15.0 2 | appdirs==1.4.4 3 | audioread==2.1.9 4 | beautifulsoup4==4.10.0 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.7 8 | cycler==0.11.0 9 | Cython==0.29.24 10 | decorator==4.4.2 11 | dlib==19.22.1 12 | einops==0.3.2 13 | future==0.18.2 14 | g2p-en==2.1.0 15 | google==3.0.0 16 | grpcio==1.42.0 17 | h5py==2.8.0 18 | horology==1.2.0 19 | idna==3.3 20 | imageio==2.10.1 21 | imageio-ffmpeg==0.4.5 22 | importlib-metadata==4.8.1 23 | joblib==1.1.0 24 | kiwisolver==1.3.2 25 | librosa==0.8.0 26 | llvmlite==0.31.0 27 | Markdown==3.3.4 28 | matplotlib==3.4.3 29 | miditoolkit==0.1.7 30 | moviepy==1.0.3 31 | numba==0.48.0 32 | numpy==1.20.0 33 | opencv-python==4.5.4.58 34 | packaging==21.2 35 | pandas==1.3.4 36 | Pillow==8.4.0 37 | pooch==1.5.2 38 | praat-parselmouth==0.3.3 39 | proglog==0.1.9 40 | protobuf==3.19.1 41 | pycparser==2.20 42 | pycwt==0.3.0a22 43 | pydub==0.25.1 44 | pyloudnorm==0.1.0 45 | pyparsing==2.4.7 46 | pypinyin==0.43.0 47 | python-dateutil==2.8.2 48 | pytorch-lightning==0.7.1 49 | pytorch-ssim==0.1 50 | pytz==2021.3 51 | pyworld==0.3.0 52 | PyYAML==6.0 53 | requests==2.26.0 54 | resampy==0.2.2 55 | Resemblyzer==0.1.1.dev0 56 | scikit-image==0.16.2 57 | scikit-learn==0.22 58 | scipy==1.3.0 59 | six==1.16.0 60 | sklearn==0.0 61 | SoundFile==0.10.3.post1 
62 | soupsieve==2.3 63 | sympy==1.9 64 | tensorboard==1.15.0 65 | tensorboardX==2.4 66 | test-tube==0.7.5 67 | TextGrid==1.5 68 | torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl 69 | torchvision==0.9.1 70 | tqdm==4.62.3 71 | typing-extensions==3.10.0.2 72 | urllib3==1.26.7 73 | uuid==1.30 74 | webrtcvad==2.0.10 75 | Werkzeug==2.0.2 76 | zipp==3.6.0 77 | -------------------------------------------------------------------------------- /resources/apply_form.md: -------------------------------------------------------------------------------- 1 | # How to apply for PopCS 2 | Thank you for your interest in our work. Please send an email to jinglinliu@zju.edu.cn containing: 3 | 4 | " 5 | 6 | name: *** 7 | 8 | affiliation: *** (school or institution) 9 | 10 | research fields: *** 11 | 12 | We want to apply for PopCS and agree to the dataset license: CC BY-NC-SA 4.0 (non-commercial use only!). 13 | 14 | We accept full responsibility for our use of the dataset and shall defend and indemnify the authors of DiffSinger against any and all claims arising from our use of the dataset, including but not limited to our use of any copies of copyrighted audio files that we may create from the dataset. 15 | 16 | We hereby represent that we are fully authorized to enter into this agreement on behalf of our employer. 17 | 18 | We will cite your paper if this code or data is used. We will not distribute the download link to others without informing the authors of DiffSinger. 19 | 20 | " 21 | 22 | We will then send you the download link. 23 | 24 | **Please note that by using PopCS you accept the terms above.** 25 | 26 | **Please use your official email address (like xxx@zju.edu.cn)!
Thank you!** -------------------------------------------------------------------------------- /resources/diffspeech-fs2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2-1.png -------------------------------------------------------------------------------- /resources/diffspeech-fs2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2-2.png -------------------------------------------------------------------------------- /resources/diffspeech-fs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2.png -------------------------------------------------------------------------------- /resources/model_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/model_a.png -------------------------------------------------------------------------------- /resources/model_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/model_b.png -------------------------------------------------------------------------------- /resources/tfb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/tfb.png -------------------------------------------------------------------------------- /tasks/run.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils.hparams import set_hparams, hparams 3 | 4 | 5 | def run_task(): 6 | assert hparams['task_cls'] != '' 7 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 8 | cls_name = hparams["task_cls"].split(".")[-1] 9 | task_cls = getattr(importlib.import_module(pkg), cls_name) 10 | task_cls.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | set_hparams() 15 | run_task() 16 | -------------------------------------------------------------------------------- /tasks/tts/pe.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | import torch 5 | import numpy as np 6 | import os 7 | 8 | from tasks.base_task import BaseDataset 9 | from tasks.tts.fs2 import FastSpeech2Task 10 | from modules.fastspeech.pe import PitchExtractor 11 | import utils 12 | from utils.indexed_datasets import IndexedDataset 13 | from utils.hparams import hparams 14 | from utils.plot import f0_to_figure 15 | from utils.pitch_utils import norm_interp_f0, denorm_f0 16 | 17 | 18 | class PeDataset(BaseDataset): 19 | def __init__(self, prefix, shuffle=False): 20 | super().__init__(shuffle) 21 | self.data_dir = hparams['binary_data_dir'] 22 | self.prefix = prefix 23 | self.hparams = hparams 24 | self.sizes = np.load(f'{self.data_dir}/{self.prefix}_lengths.npy') 25 | self.indexed_ds = None 26 | 27 | # pitch stats 28 | f0_stats_fn = f'{self.data_dir}/train_f0s_mean_std.npy' 
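        # NOTE: train_f0s_mean_std.npy is assumed to hold the training-set [f0_mean, f0_std]; when it exists, the values are copied into hparams below so predicted f0 can later be denormalized.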
29 | if os.path.exists(f0_stats_fn): 30 | hparams['f0_mean'], hparams['f0_std'] = self.f0_mean, self.f0_std = np.load(f0_stats_fn) 31 | hparams['f0_mean'] = float(hparams['f0_mean']) 32 | hparams['f0_std'] = float(hparams['f0_std']) 33 | else: 34 | hparams['f0_mean'], hparams['f0_std'] = self.f0_mean, self.f0_std = None, None 35 | 36 | if prefix == 'test': 37 | if hparams['num_test_samples'] > 0: 38 | self.avail_idxs = list(range(hparams['num_test_samples'])) + hparams['test_ids'] 39 | self.sizes = [self.sizes[i] for i in self.avail_idxs] 40 | 41 | def _get_item(self, index): 42 | if hasattr(self, 'avail_idxs') and self.avail_idxs is not None: 43 | index = self.avail_idxs[index] 44 | if self.indexed_ds is None: 45 | self.indexed_ds = IndexedDataset(f'{self.data_dir}/{self.prefix}') 46 | return self.indexed_ds[index] 47 | 48 | def __getitem__(self, index): 49 | hparams = self.hparams 50 | item = self._get_item(index) 51 | max_frames = hparams['max_frames'] 52 | spec = torch.Tensor(item['mel'])[:max_frames] 53 | # mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None 54 | f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams) 55 | pitch = torch.LongTensor(item.get("pitch"))[:max_frames] 56 | # print(item.keys(), item['mel'].shape, spec.shape) 57 | sample = { 58 | "id": index, 59 | "item_name": item['item_name'], 60 | "text": item['txt'], 61 | "mel": spec, 62 | "pitch": pitch, 63 | "f0": f0, 64 | "uv": uv, 65 | # "mel2ph": mel2ph, 66 | # "mel_nonpadding": spec.abs().sum(-1) > 0, 67 | } 68 | return sample 69 | 70 | def collater(self, samples): 71 | if len(samples) == 0: 72 | return {} 73 | id = torch.LongTensor([s['id'] for s in samples]) 74 | item_names = [s['item_name'] for s in samples] 75 | text = [s['text'] for s in samples] 76 | f0 = utils.collate_1d([s['f0'] for s in samples], 0.0) 77 | pitch = utils.collate_1d([s['pitch'] for s in samples]) 78 | uv = utils.collate_1d([s['uv'] for s in samples]) 79 | mels = utils.collate_2d([s['mel'] for s in samples], 0.0) 80 | mel_lengths = torch.LongTensor([s['mel'].shape[0] for s in samples]) 81 | # mel2ph = utils.collate_1d([s['mel2ph'] for s in samples], 0.0) \ 82 | # if samples[0]['mel2ph'] is not None else None 83 | # mel_nonpaddings = utils.collate_1d([s['mel_nonpadding'].float() for s in samples], 0.0) 84 | 85 | batch = { 86 | 'id': id, 87 | 'item_name': item_names, 88 | 'nsamples': len(samples), 89 | 'text': text, 90 | 'mels': mels, 91 | 'mel_lengths': mel_lengths, 92 | 'pitch': pitch, 93 | # 'mel2ph': mel2ph, 94 | # 'mel_nonpaddings': mel_nonpaddings, 95 | 'f0': f0, 96 | 'uv': uv, 97 | } 98 | return batch 99 | 100 | 101 | class PitchExtractionTask(FastSpeech2Task): 102 | def __init__(self): 103 | super().__init__() 104 | self.dataset_cls = PeDataset 105 | 106 | def build_tts_model(self): 107 | self.model = PitchExtractor(conv_layers=hparams['pitch_extractor_conv_layers']) 108 | 109 | # def build_scheduler(self, optimizer): 110 | # return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5) 111 | def _training_step(self, sample, batch_idx, _): 112 | loss_output = self.run_model(self.model, sample) 113 | total_loss = sum([v for v in loss_output.values() if isinstance(v, torch.Tensor) and v.requires_grad]) 114 | loss_output['batch_size'] = sample['mels'].size()[0] 115 | return total_loss, loss_output 116 | 117 | def validation_step(self, sample, batch_idx): 118 | outputs = {} 119 | outputs['losses'] = {} 120 | outputs['losses'], model_out = self.run_model(self.model, sample, 
return_output=True, infer=True) 121 | outputs['total_loss'] = sum(outputs['losses'].values()) 122 | outputs['nsamples'] = sample['nsamples'] 123 | outputs = utils.tensors_to_scalars(outputs) 124 | if batch_idx < hparams['num_valid_plots']: 125 | self.plot_pitch(batch_idx, model_out, sample) 126 | return outputs 127 | 128 | def run_model(self, model, sample, return_output=False, infer=False): 129 | f0 = sample['f0'] 130 | uv = sample['uv'] 131 | output = model(sample['mels']) 132 | losses = {} 133 | self.add_pitch_loss(output, sample, losses) 134 | if not return_output: 135 | return losses 136 | else: 137 | return losses, output 138 | 139 | def plot_pitch(self, batch_idx, model_out, sample): 140 | gt_f0 = denorm_f0(sample['f0'], sample['uv'], hparams) 141 | self.logger.experiment.add_figure( 142 | f'f0_{batch_idx}', 143 | f0_to_figure(gt_f0[0], None, model_out['f0_denorm_pred'][0]), 144 | self.global_step) 145 | 146 | def add_pitch_loss(self, output, sample, losses): 147 | # mel2ph = sample['mel2ph'] # [B, T_s] 148 | mel = sample['mels'] 149 | f0 = sample['f0'] 150 | uv = sample['uv'] 151 | # nonpadding = (mel2ph != 0).float() if hparams['pitch_type'] == 'frame' \ 152 | # else (sample['txt_tokens'] != 0).float() 153 | nonpadding = (mel.abs().sum(-1) > 0).float() # sample['mel_nonpaddings'] 154 | # print(nonpadding[0][-8:], nonpadding.shape) 155 | self.add_f0_loss(output['pitch_pred'], f0, uv, losses, nonpadding=nonpadding) -------------------------------------------------------------------------------- /tasks/tts/tts.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | 3 | import matplotlib 4 | 5 | from utils.pl_utils import data_loader 6 | from utils.training_utils import RSQRTSchedule 7 | from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder 8 | from modules.fastspeech.pe import PitchExtractor 9 | 10 | matplotlib.use('Agg') 11 | import os 12 | import numpy as np 13 | from tqdm import tqdm 14 | import torch.distributed as dist 15 | 16 | from tasks.base_task import BaseTask 17 | from utils.hparams import hparams 18 | from utils.text_encoder import TokenTextEncoder 19 | import json 20 | 21 | import torch 22 | import torch.optim 23 | import torch.utils.data 24 | import utils 25 | 26 | 27 | 28 | class TtsTask(BaseTask): 29 | def __init__(self, *args, **kwargs): 30 | self.vocoder = None 31 | self.phone_encoder = self.build_phone_encoder(hparams['binary_data_dir']) 32 | self.padding_idx = self.phone_encoder.pad() 33 | self.eos_idx = self.phone_encoder.eos() 34 | self.seg_idx = self.phone_encoder.seg() 35 | self.saving_result_pool = None 36 | self.saving_results_futures = None 37 | self.stats = {} 38 | super().__init__(*args, **kwargs) 39 | 40 | def build_scheduler(self, optimizer): 41 | return RSQRTSchedule(optimizer) 42 | 43 | def build_optimizer(self, model): 44 | self.optimizer = optimizer = torch.optim.AdamW( 45 | model.parameters(), 46 | lr=hparams['lr']) 47 | return optimizer 48 | 49 | def build_dataloader(self, dataset, shuffle, max_tokens=None, max_sentences=None, 50 | required_batch_size_multiple=-1, endless=False, batch_by_size=True): 51 | devices_cnt = torch.cuda.device_count() 52 | if devices_cnt == 0: 53 | devices_cnt = 1 54 | if required_batch_size_multiple == -1: 55 | required_batch_size_multiple = devices_cnt 56 | 57 | def shuffle_batches(batches): 58 | np.random.shuffle(batches) 59 | return batches 60 | 61 | if max_tokens is not None: 62 | max_tokens *= devices_cnt 63 | if max_sentences is not 
None: 64 | max_sentences *= devices_cnt 65 | indices = dataset.ordered_indices() 66 | if batch_by_size: 67 | batch_sampler = utils.batch_by_size( 68 | indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences, 69 | required_batch_size_multiple=required_batch_size_multiple, 70 | ) 71 | else: 72 | batch_sampler = [] 73 | for i in range(0, len(indices), max_sentences): 74 | batch_sampler.append(indices[i:i + max_sentences]) 75 | 76 | if shuffle: 77 | batches = shuffle_batches(list(batch_sampler)) 78 | if endless: 79 | batches = [b for _ in range(1000) for b in shuffle_batches(list(batch_sampler))] 80 | else: 81 | batches = batch_sampler 82 | if endless: 83 | batches = [b for _ in range(1000) for b in batches] 84 | num_workers = dataset.num_workers 85 | if self.trainer.use_ddp: 86 | num_replicas = dist.get_world_size() 87 | rank = dist.get_rank() 88 | batches = [x[rank::num_replicas] for x in batches if len(x) % num_replicas == 0] 89 | return torch.utils.data.DataLoader(dataset, 90 | collate_fn=dataset.collater, 91 | batch_sampler=batches, 92 | num_workers=num_workers, 93 | pin_memory=False) 94 | 95 | def build_phone_encoder(self, data_dir): 96 | phone_list_file = os.path.join(data_dir, 'phone_set.json') 97 | 98 | phone_list = json.load(open(phone_list_file)) 99 | return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',') 100 | 101 | def build_optimizer(self, model): 102 | self.optimizer = optimizer = torch.optim.AdamW( 103 | model.parameters(), 104 | lr=hparams['lr']) 105 | return optimizer 106 | 107 | def test_start(self): 108 | self.saving_result_pool = Pool(8) 109 | self.saving_results_futures = [] 110 | self.vocoder: BaseVocoder = get_vocoder_cls(hparams)() 111 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 112 | self.pe = PitchExtractor().cuda() 113 | utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True) 114 | self.pe.eval() 115 | def test_end(self, outputs): 116 | self.saving_result_pool.close() 117 | [f.get() for f in tqdm(self.saving_results_futures)] 118 | self.saving_result_pool.join() 119 | return {} 120 | 121 | ########## 122 | # utils 123 | ########## 124 | def weights_nonzero_speech(self, target): 125 | # target : B x T x mel 126 | # Assign weight 1.0 to all labels except for padding (id=0). 
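        # NOTE: concretely, a frame whose mel bins are all zero (abs-sum == 0) is treated as padding and gets weight 0, with the 0/1 weight repeated across all mel bins.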
127 | dim = target.size(-1) 128 | return target.abs().sum(-1, keepdim=True).ne(0).float().repeat(1, 1, dim) 129 | 130 | if __name__ == '__main__': 131 | TtsTask.start() 132 | -------------------------------------------------------------------------------- /usr/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/usr/.gitkeep -------------------------------------------------------------------------------- /usr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/usr/__init__.py -------------------------------------------------------------------------------- /usr/configs/base.yaml: -------------------------------------------------------------------------------- 1 | task_cls: usr.task.DiffFsTask 2 | pitch_type: frame 3 | timesteps: 100 4 | dilation_cycle_length: 1 5 | residual_layers: 20 6 | residual_channels: 256 7 | lr: 0.001 8 | decay_steps: 50000 9 | keep_bins: 80 10 | spec_min: [ ] 11 | spec_max: [ ] 12 | 13 | content_cond_steps: [ ] # [ 0, 10000 ] 14 | spk_cond_steps: [ ] # [ 0, 10000 ] 15 | # train and eval 16 | fs2_ckpt: '' 17 | max_updates: 400000 18 | # max_updates: 200000 19 | use_gt_dur: true 20 | use_gt_f0: true 21 | gen_tgt_spk_id: -1 22 | max_sentences: 48 23 | num_sanity_val_steps: 1 24 | num_valid_plots: 1 25 | -------------------------------------------------------------------------------- /usr/configs/lj_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | - ./base.yaml 4 | # spec_min and spec_max are calculated on the training set. 
5 | spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672, 6 | -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759, 7 | -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733, 8 | -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510, 9 | -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916, 10 | -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875, 11 | -5.0483, -5.0848, -5.1809, -5.0677, -5.0015, -5.0792, -5.0636, -5.2413, 12 | -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173, 13 | -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757, 14 | -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ] 15 | spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.7093, 16 | 0.6461, 0.6420, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591, 17 | 0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492, 18 | 0.6909, 0.6106, 0.5761, 0.5936, 0.5638, 0.4054, 0.4545, 0.3589, 19 | 0.3037, 0.3380, 0.1599, 0.2433, 0.2741, 0.2130, 0.1569, 0.1911, 20 | 0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933, 21 | -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405, 22 | -0.1244, -0.2116, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000, 23 | 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566, 24 | 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ] 25 | 26 | task_cls: usr.diffspeech_task.DiffSpeechTask 27 | vocoder: vocoders.hifigan.HifiGAN 28 | vocoder_ckpt: checkpoints/0414_hifi_lj_1 29 | num_valid_plots: 10 30 | use_gt_dur: false 31 | use_gt_f0: false 32 | pitch_type: cwt 33 | pitch_extractor: 'parselmouth' 34 | max_updates: 160000 35 | lr: 0.001 36 | timesteps: 100 37 | K_step: 71 38 | diff_loss_type: l1 39 | diff_decoder_type: 'wavenet' 40 | schedule_type: 'linear' 41 | max_beta: 0.06 42 | fs2_ckpt: checkpoints/fs2_lj_1/model_ckpt_steps_150000.ckpt 43 | save_gt: true -------------------------------------------------------------------------------- /usr/configs/lj_ds_pndm.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./lj_ds_beta6.yaml 3 | 4 | fs2_ckpt: '' 5 | gaussian_start: True 6 | max_beta: 0.02 7 | timesteps: 1000 8 | K_step: 1000 9 | pndm_speedup: 10 10 | 11 | pitch_type: frame 12 | use_pitch_embed: false # using diffusion to model pitch curve 13 | lambda_f0: 0. 14 | lambda_uv: 0. 15 | #rel_pos: true 16 | 17 | max_updates: 320000 18 | -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/aux_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 
9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binarization_args: 14 | with_wav: true 15 | with_spk_embed: false 16 | with_align: true 17 | raw_data_dir: 'data/raw/opencpop/segments' 18 | processed_data_dir: 'xxx' 19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 20 | 21 | 22 | binary_data_dir: 'data/binary/opencpop-midi-dp' 23 | use_midi: true # for midi exp 24 | use_gt_f0: false # for midi exp 25 | use_gt_dur: false # for further midi exp 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | #lambda_energy: 0.1 29 | lambda_ph_dur: 1.0 30 | lambda_sent_dur: 1.0 31 | lambda_word_dur: 1.0 32 | predictor_grad: 0.1 33 | pe_enable: false 34 | pe_ckpt: '' 35 | 36 | num_spk: 1 37 | test_prefixes: [ 38 | '2044', 39 | '2086', 40 | '2092', 41 | '2093', 42 | '2100', 43 | ] 44 | 45 | task_cls: usr.diffsinger_task.AuxDecoderMIDITask 46 | #vocoder: usr.singingvocoder.highgan.HighGAN 47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 48 | vocoder: vocoders.hifigan.HifiGAN 49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 50 | 51 | use_nsf: true 52 | 53 | # config for experiments 54 | max_frames: 5000 55 | max_tokens: 40000 56 | predictor_layers: 5 57 | rel_pos: true 58 | dur_predictor_layers: 5 # * 59 | 60 | use_spk_embed: false 61 | num_valid_plots: 10 62 | max_updates: 160000 63 | save_gt: true -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/ds60_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_f0: false # for midi exp 11 | use_gt_dur: false # for further midi exp 12 | lambda_f0: 1.0 13 | lambda_uv: 1.0 14 | #lambda_energy: 0.1 15 | lambda_ph_dur: 1.0 16 | lambda_sent_dur: 1.0 17 | lambda_word_dur: 1.0 18 | predictor_grad: 0.1 19 | pe_enable: false 20 | pe_ckpt: '' 21 | 22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # 23 | #num_valid_plots: 0 24 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 25 | 26 | K_step: 60 27 | max_tokens: 40000 28 | predictor_layers: 5 29 | dilation_cycle_length: 4 # * 30 | rel_pos: true 31 | dur_predictor_layers: 5 # * 32 | max_updates: 160000 33 | gaussian_start: false 34 | -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/opencpop_statis.yaml: -------------------------------------------------------------------------------- 1 | spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 2 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 3 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 4 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 5 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 6 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 7 | -6., -6., -6., -6., -6., -6., -6., -6.] 
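# NOTE (assumption): spec_min / spec_max are per-mel-bin statistics of the Opencpop training set (spec_min clipped to a -6 floor), used to normalize mel-spectrograms for the diffusion decoder.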
8 | spec_max: [-7.9453e-01, -8.1116e-01, -6.1631e-01, -3.0679e-01, -1.3863e-01, 9 | -5.0652e-02, -1.1563e-01, -1.0679e-01, -9.1068e-02, -6.2174e-02, 10 | -7.5302e-02, -7.2217e-02, -6.3815e-02, -7.3299e-02, 7.3610e-03, 11 | -7.2508e-02, -5.0234e-02, -1.6534e-01, -2.6928e-01, -2.0782e-01, 12 | -2.0823e-01, -1.1702e-01, -7.0128e-02, -6.5868e-02, -1.2675e-02, 13 | 1.5121e-03, -8.9902e-02, -2.1392e-01, -2.3789e-01, -2.8922e-01, 14 | -3.0405e-01, -2.3029e-01, -2.2088e-01, -2.1542e-01, -2.9367e-01, 15 | -3.0137e-01, -3.8281e-01, -4.3590e-01, -2.8681e-01, -4.6855e-01, 16 | -5.7485e-01, -4.7022e-01, -5.4266e-01, -4.4848e-01, -6.4120e-01, 17 | -6.8700e-01, -6.4860e-01, -7.6436e-01, -4.9971e-01, -7.1068e-01, 18 | -6.9724e-01, -6.1487e-01, -5.5843e-01, -6.9773e-01, -5.7502e-01, 19 | -7.0919e-01, -8.2431e-01, -8.4213e-01, -9.0431e-01, -8.2840e-01, 20 | -7.7945e-01, -8.2758e-01, -8.7699e-01, -1.0532e+00, -1.0766e+00, 21 | -1.1198e+00, -1.0185e+00, -9.8983e-01, -1.0001e+00, -1.0756e+00, 22 | -1.0024e+00, -1.0304e+00, -1.0579e+00, -1.0188e+00, -1.0500e+00, 23 | -1.0842e+00, -1.0923e+00, -1.1223e+00, -1.2381e+00, -1.6467e+00] 24 | 25 | mel_vmin: -6. #-6. 26 | mel_vmax: 1.5 27 | wav2spec_eps: 1e-6 28 | 29 | raw_data_dir: 'data/raw/opencpop/segments' 30 | processed_data_dir: 'xxx' 31 | binary_data_dir: 'data/binary/opencpop-midi-dp' 32 | datasets: [ 33 | 'opencpop', 34 | ] 35 | test_prefixes: [ 36 | '2044', 37 | '2086', 38 | '2092', 39 | '2093', 40 | '2100', 41 | ] 42 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/opencpop/ds1000.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | # for diffusion schedule 23 | timesteps: 1000 24 | K_step: 1000 25 | max_beta: 0.02 26 | max_tokens: 36000 27 | max_updates: 320000 28 | gaussian_start: True 29 | pndm_speedup: 40 30 | 31 | use_pitch_embed: false 32 | use_gt_f0: false # for midi exp 33 | 34 | lambda_f0: 0. 35 | lambda_uv: 0. 
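# NOTE: f0/uv losses are zeroed and use_pitch_embed is false because in this end-to-end setting the diffusion decoder is assumed to model the pitch contour directly (cf. the same setting in lj_ds_pndm.yaml).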
36 | dilation_cycle_length: 4 # * 37 | rel_pos: true 38 | predictor_layers: 5 39 | pe_enable: true 40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 41 | 42 | 43 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/popcs/popcs_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer 6 | binary_data_dir: 'data/binary/popcs-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /usr/configs/midi/pe.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | 4 | max_frames: 8000 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binary_data_dir: 'xxx' 14 | 15 | pitch_type: frame 16 | task_cls: tasks.tts.pe.PitchExtractionTask 17 | pitch_extractor_conv_layers: 2 18 | 19 | 20 | # config for experiments 21 | max_tokens: 20000 22 | use_spk_embed: false 23 | num_valid_plots: 10 24 | max_updates: 60000 -------------------------------------------------------------------------------- /usr/configs/popcs_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | - ./base.yaml 5 | 6 | audio_sample_rate: 24000 7 | hop_size: 128 # Hop size. 
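# NOTE: at 24 kHz a hop size of 128 samples corresponds to a frame shift of about 5.3 ms.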
8 | fft_size: 512 # FFT size. 9 | win_size: 512 # FFT size. 10 | fmin: 30 11 | fmax: 12000 12 | min_level_db: -120 13 | 14 | binarization_args: 15 | with_wav: true 16 | with_spk_embed: false 17 | with_align: true 18 | raw_data_dir: 'data/raw/popcs' 19 | processed_data_dir: 'data/processed/popcs' 20 | binary_data_dir: 'data/binary/popcs-pmf0' 21 | num_spk: 1 22 | datasets: [ 23 | 'popcs', 24 | ] 25 | test_prefixes: [ 26 | 'popcs-说散就散', 27 | 'popcs-隐形的翅膀', 28 | ] 29 | 30 | spec_min: [-6.8276, -7.0270, -6.8142, -7.1429, -7.6669, -7.6000, -7.1148, -6.9640, 31 | -6.8414, -6.6596, -6.6880, -6.7439, -6.7986, -7.4940, -7.7845, -7.6586, 32 | -6.9288, -6.7639, -6.9118, -6.8246, -6.7183, -7.1769, -6.9794, -7.4513, 33 | -7.3422, -7.5623, -6.9610, -6.8158, -6.9595, -6.8403, -6.5688, -6.6356, 34 | -7.0209, -6.5002, -6.7819, -6.5232, -6.6927, -6.5701, -6.5531, -6.7069, 35 | -6.6462, -6.4523, -6.5954, -6.4264, -6.4487, -6.7070, -6.4025, -6.3042, 36 | -6.4008, -6.3857, -6.3903, -6.3094, -6.2491, -6.3518, -6.3566, -6.4168, 37 | -6.2481, -6.3624, -6.2858, -6.2575, -6.3638, -6.4520, -6.1835, -6.2754, 38 | -6.1253, -6.1645, -6.0638, -6.1262, -6.0710, -6.1039, -6.4428, -6.1363, 39 | -6.1054, -6.1252, -6.1797, -6.0235, -6.0758, -5.9453, -6.0213, -6.0446] 40 | spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.1212, 41 | 0.2421, 0.1809, 0.2134, 0.3161, 0.3301, 0.3289, 0.2667, 0.2421, 42 | 0.2581, 0.2600, 0.1394, 0.1907, 0.1082, 0.1474, 0.1680, 0.2550, 43 | 0.1057, 0.0826, 0.0423, 0.1203, -0.0701, -0.0056, 0.0477, -0.0639, 44 | -0.0272, -0.0728, -0.1648, -0.0855, -0.2652, -0.1998, -0.1547, -0.2167, 45 | -0.4181, -0.5463, -0.4161, -0.4733, -0.6518, -0.5387, -0.4290, -0.4191, 46 | -0.4151, -0.3042, -0.3810, -0.4160, -0.4496, -0.2847, -0.4676, -0.4658, 47 | -0.4931, -0.4885, -0.5547, -0.5481, -0.6948, -0.7968, -0.8455, -0.8392, 48 | -0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035, 49 | -0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766] 50 | 51 | task_cls: usr.diffsinger_task.DiffSingerTask 52 | #vocoder: usr.singingvocoder.highgan.HighGAN 53 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 54 | vocoder: vocoders.hifigan.HifiGAN 55 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 56 | 57 | pitch_extractor: 'parselmouth' 58 | # config for experiments 59 | use_spk_embed: false 60 | num_valid_plots: 10 61 | max_updates: 160000 62 | lr: 0.001 63 | timesteps: 100 64 | K_step: 51 65 | diff_loss_type: l1 66 | diff_decoder_type: 'wavenet' 67 | schedule_type: 'linear' 68 | max_beta: 0.06 69 | fs2_ckpt: '' 70 | use_nsf: true -------------------------------------------------------------------------------- /usr/configs/popcs_ds_beta6_offline.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./popcs_ds_beta6.yaml 3 | 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer 5 | num_valid_plots: 0 6 | task_cls: usr.diffsinger_task.DiffSingerOfflineTask 7 | 8 | # tmp: 9 | #pe_enable: true 10 | #pe_ckpt: '' 11 | vocoder: vocoders.hifigan.HifiGAN 12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 -------------------------------------------------------------------------------- /usr/configs/popcs_fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | 4 | audio_sample_rate: 24000 5 | hop_size: 128 # Hop size. 6 | fft_size: 512 # FFT size. 
7 | win_size: 512 # FFT size. 8 | fmin: 30 9 | fmax: 12000 10 | min_level_db: -120 11 | 12 | binarization_args: 13 | with_wav: true 14 | with_spk_embed: false 15 | with_align: true 16 | raw_data_dir: 'data/raw/popcs' 17 | processed_data_dir: 'data/processed/popcs' 18 | binary_data_dir: 'data/binary/popcs-pmf0' 19 | num_spk: 1 20 | datasets: [ 21 | 'popcs', 22 | ] 23 | test_prefixes: [ 24 | 'popcs-说散就散', 25 | 'popcs-隐形的翅膀', 26 | ] 27 | 28 | task_cls: tasks.tts.fs2.FastSpeech2Task 29 | #vocoder: usr.singingvocoder.highgan.HighGAN 30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 31 | vocoder: vocoders.hifigan.HifiGAN 32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 33 | use_nsf: true 34 | 35 | # config for experiments 36 | max_tokens: 18000 37 | use_spk_embed: false 38 | num_valid_plots: 10 39 | max_updates: 160000 40 | save_gt: true 41 | 42 | # tmp: 43 | #pe_enable: true 44 | #pe_ckpt: '' -------------------------------------------------------------------------------- /usr/diff/candidate_decoder.py: -------------------------------------------------------------------------------- 1 | from modules.fastspeech.tts_modules import FastspeechDecoder 2 | # from modules.fastspeech.fast_tacotron import DecoderRNN 3 | # from modules.fastspeech.speedy_speech.speedy_speech import ConvBlocks 4 | # from modules.fastspeech.conformer.conformer import ConformerDecoder 5 | import torch 6 | from torch.nn import functional as F 7 | import torch.nn as nn 8 | import math 9 | from utils.hparams import hparams 10 | from .diffusion import Mish 11 | Linear = nn.Linear 12 | 13 | 14 | class SinusoidalPosEmb(nn.Module): 15 | def __init__(self, dim): 16 | super().__init__() 17 | self.dim = dim 18 | 19 | def forward(self, x): 20 | device = x.device 21 | half_dim = self.dim // 2 22 | emb = math.log(10000) / (half_dim - 1) 23 | emb = torch.exp(torch.arange(half_dim, device=device) * -emb) 24 | emb = x[:, None] * emb[None, :] 25 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 26 | return emb 27 | 28 | 29 | def Conv1d(*args, **kwargs): 30 | layer = nn.Conv1d(*args, **kwargs) 31 | nn.init.kaiming_normal_(layer.weight) 32 | return layer 33 | 34 | 35 | class FFT(FastspeechDecoder): 36 | def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=None): 37 | super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads) 38 | dim = hparams['residual_channels'] 39 | self.input_projection = Conv1d(hparams['audio_num_mel_bins'], dim, 1) 40 | self.diffusion_embedding = SinusoidalPosEmb(dim) 41 | self.mlp = nn.Sequential( 42 | nn.Linear(dim, dim * 4), 43 | Mish(), 44 | nn.Linear(dim * 4, dim) 45 | ) 46 | self.get_mel_out = Linear(hparams['hidden_size'], 80, bias=True) 47 | self.get_decode_inp = Linear(hparams['hidden_size'] + dim + dim, 48 | hparams['hidden_size']) # hs + dim + 80 -> hs 49 | 50 | def forward(self, spec, diffusion_step, cond, padding_mask=None, attn_mask=None, return_hiddens=False): 51 | """ 52 | :param spec: [B, 1, 80, T] 53 | :param diffusion_step: [B, 1] 54 | :param cond: [B, M, T] 55 | :return: 56 | """ 57 | x = spec[:, 0] 58 | x = self.input_projection(x).permute([0, 2, 1]) # [B, T, residual_channel] 59 | diffusion_step = self.diffusion_embedding(diffusion_step) 60 | diffusion_step = self.mlp(diffusion_step) # [B, dim] 61 | cond = cond.permute([0, 2, 1]) # [B, T, M] 62 | 63 | seq_len = cond.shape[1] # [T_mel] 64 | time_embed = diffusion_step[:, None, :] # [B, 1, dim] 65 | time_embed = time_embed.repeat([1, seq_len, 1]) # # [B, T, dim] 66 | 67 | 
decoder_inp = torch.cat([x, cond, time_embed], dim=-1) # [B, T, dim + H + dim] 68 | decoder_inp = self.get_decode_inp(decoder_inp) # [B, T, H] 69 | x = decoder_inp 70 | 71 | ''' 72 | Required x: [B, T, C] 73 | :return: [B, T, C] or [L, B, T, C] 74 | ''' 75 | padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask 76 | nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1] 77 | if self.use_pos_embed: 78 | positions = self.pos_embed_alpha * self.embed_positions(x[..., 0]) 79 | x = x + positions 80 | x = F.dropout(x, p=self.dropout, training=self.training) 81 | # B x T x C -> T x B x C 82 | x = x.transpose(0, 1) * nonpadding_mask_TB 83 | hiddens = [] 84 | for layer in self.layers: 85 | x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB 86 | hiddens.append(x) 87 | if self.use_last_norm: 88 | x = self.layer_norm(x) * nonpadding_mask_TB 89 | if return_hiddens: 90 | x = torch.stack(hiddens, 0) # [L, T, B, C] 91 | x = x.transpose(1, 2) # [L, B, T, C] 92 | else: 93 | x = x.transpose(0, 1) # [B, T, C] 94 | 95 | x = self.get_mel_out(x).permute([0, 2, 1]) # [B, 80, T] 96 | return x[:, None, :, :] -------------------------------------------------------------------------------- /usr/diff/net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from math import sqrt 8 | 9 | from .diffusion import Mish 10 | from utils.hparams import hparams 11 | 12 | Linear = nn.Linear 13 | ConvTranspose2d = nn.ConvTranspose2d 14 | 15 | 16 | class AttrDict(dict): 17 | def __init__(self, *args, **kwargs): 18 | super(AttrDict, self).__init__(*args, **kwargs) 19 | self.__dict__ = self 20 | 21 | def override(self, attrs): 22 | if isinstance(attrs, dict): 23 | self.__dict__.update(**attrs) 24 | elif isinstance(attrs, (list, tuple, set)): 25 | for attr in attrs: 26 | self.override(attr) 27 | elif attrs is not None: 28 | raise NotImplementedError 29 | return self 30 | 31 | 32 | class SinusoidalPosEmb(nn.Module): 33 | def __init__(self, dim): 34 | super().__init__() 35 | self.dim = dim 36 | 37 | def forward(self, x): 38 | device = x.device 39 | half_dim = self.dim // 2 40 | emb = math.log(10000) / (half_dim - 1) 41 | emb = torch.exp(torch.arange(half_dim, device=device) * -emb) 42 | emb = x[:, None] * emb[None, :] 43 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 44 | return emb 45 | 46 | 47 | def Conv1d(*args, **kwargs): 48 | layer = nn.Conv1d(*args, **kwargs) 49 | nn.init.kaiming_normal_(layer.weight) 50 | return layer 51 | 52 | 53 | @torch.jit.script 54 | def silu(x): 55 | return x * torch.sigmoid(x) 56 | 57 | 58 | class ResidualBlock(nn.Module): 59 | def __init__(self, encoder_hidden, residual_channels, dilation): 60 | super().__init__() 61 | self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation) 62 | self.diffusion_projection = Linear(residual_channels, residual_channels) 63 | self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1) 64 | self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1) 65 | 66 | def forward(self, x, conditioner, diffusion_step): 67 | diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) 68 | conditioner = self.conditioner_projection(conditioner) 69 | y = x + diffusion_step 70 | 71 | y = self.dilated_conv(y) + conditioner 72 | 73 | gate, filter = 
torch.chunk(y, 2, dim=1) 74 | y = torch.sigmoid(gate) * torch.tanh(filter) 75 | 76 | y = self.output_projection(y) 77 | residual, skip = torch.chunk(y, 2, dim=1) 78 | return (x + residual) / sqrt(2.0), skip 79 | 80 | 81 | class DiffNet(nn.Module): 82 | def __init__(self, in_dims=80): 83 | super().__init__() 84 | self.params = params = AttrDict( 85 | # Model params 86 | encoder_hidden=hparams['hidden_size'], 87 | residual_layers=hparams['residual_layers'], 88 | residual_channels=hparams['residual_channels'], 89 | dilation_cycle_length=hparams['dilation_cycle_length'], 90 | ) 91 | self.input_projection = Conv1d(in_dims, params.residual_channels, 1) 92 | self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels) 93 | dim = params.residual_channels 94 | self.mlp = nn.Sequential( 95 | nn.Linear(dim, dim * 4), 96 | Mish(), 97 | nn.Linear(dim * 4, dim) 98 | ) 99 | self.residual_layers = nn.ModuleList([ 100 | ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length)) 101 | for i in range(params.residual_layers) 102 | ]) 103 | self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1) 104 | self.output_projection = Conv1d(params.residual_channels, in_dims, 1) 105 | nn.init.zeros_(self.output_projection.weight) 106 | 107 | def forward(self, spec, diffusion_step, cond): 108 | """ 109 | 110 | :param spec: [B, 1, M, T] 111 | :param diffusion_step: [B, 1] 112 | :param cond: [B, M, T] 113 | :return: 114 | """ 115 | x = spec[:, 0] 116 | x = self.input_projection(x) # x [B, residual_channel, T] 117 | 118 | x = F.relu(x) 119 | diffusion_step = self.diffusion_embedding(diffusion_step) 120 | diffusion_step = self.mlp(diffusion_step) 121 | skip = [] 122 | for layer_id, layer in enumerate(self.residual_layers): 123 | x, skip_connection = layer(x, cond, diffusion_step) 124 | skip.append(skip_connection) 125 | 126 | x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers)) 127 | x = self.skip_projection(x) 128 | x = F.relu(x) 129 | x = self.output_projection(x) # [B, 80, T] 130 | return x[:, None, :, :] 131 | -------------------------------------------------------------------------------- /usr/diffspeech_task.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import utils 4 | from utils.hparams import hparams 5 | from .diff.net import DiffNet 6 | from .diff.shallow_diffusion_tts import GaussianDiffusion 7 | from .task import DiffFsTask 8 | from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder 9 | from utils.pitch_utils import denorm_f0 10 | from tasks.tts.fs2_utils import FastSpeechDataset 11 | 12 | DIFF_DECODERS = { 13 | 'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']), 14 | } 15 | 16 | 17 | class DiffSpeechTask(DiffFsTask): 18 | def __init__(self): 19 | super(DiffSpeechTask, self).__init__() 20 | self.dataset_cls = FastSpeechDataset 21 | self.vocoder: BaseVocoder = get_vocoder_cls(hparams)() 22 | 23 | def build_tts_model(self): 24 | mel_bins = hparams['audio_num_mel_bins'] 25 | self.model = GaussianDiffusion( 26 | phone_encoder=self.phone_encoder, 27 | out_dims=mel_bins, denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 28 | timesteps=hparams['timesteps'], 29 | K_step=hparams['K_step'], 30 | loss_type=hparams['diff_loss_type'], 31 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 32 | ) 33 | if hparams['fs2_ckpt'] != '': 34 | utils.load_ckpt(self.model.fs2, hparams['fs2_ckpt'], 'model', strict=True) 35 | # 
self.model.fs2.decoder = None 36 | for k, v in self.model.fs2.named_parameters(): 37 | if not 'predictor' in k: 38 | v.requires_grad = False 39 | 40 | def build_optimizer(self, model): 41 | self.optimizer = optimizer = torch.optim.AdamW( 42 | filter(lambda p: p.requires_grad, model.parameters()), 43 | lr=hparams['lr'], 44 | betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), 45 | weight_decay=hparams['weight_decay']) 46 | return optimizer 47 | 48 | def run_model(self, model, sample, return_output=False, infer=False): 49 | txt_tokens = sample['txt_tokens'] # [B, T_t] 50 | target = sample['mels'] # [B, T_s, 80] 51 | # mel2ph = sample['mel2ph'] if hparams['use_gt_dur'] else None # [B, T_s] 52 | mel2ph = sample['mel2ph'] 53 | f0 = sample['f0'] 54 | uv = sample['uv'] 55 | energy = sample['energy'] 56 | # fs2_mel = sample['fs2_mels'] 57 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 58 | if hparams['pitch_type'] == 'cwt': 59 | cwt_spec = sample[f'cwt_spec'] 60 | f0_mean = sample['f0_mean'] 61 | f0_std = sample['f0_std'] 62 | sample['f0_cwt'] = f0 = model.cwt2f0_norm(cwt_spec, f0_mean, f0_std, mel2ph) 63 | 64 | output = model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, 65 | ref_mels=target, f0=f0, uv=uv, energy=energy, infer=infer) 66 | 67 | losses = {} 68 | if 'diff_loss' in output: 69 | losses['mel'] = output['diff_loss'] 70 | self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) 71 | if hparams['use_pitch_embed']: 72 | self.add_pitch_loss(output, sample, losses) 73 | if hparams['use_energy_embed']: 74 | self.add_energy_loss(output['energy_pred'], energy, losses) 75 | if not return_output: 76 | return losses 77 | else: 78 | return losses, output 79 | 80 | def validation_step(self, sample, batch_idx): 81 | outputs = {} 82 | txt_tokens = sample['txt_tokens'] # [B, T_t] 83 | 84 | energy = sample['energy'] 85 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 86 | mel2ph = sample['mel2ph'] 87 | f0 = sample['f0'] 88 | uv = sample['uv'] 89 | 90 | outputs['losses'] = {} 91 | 92 | outputs['losses'], model_out = self.run_model(self.model, sample, return_output=True, infer=False) 93 | 94 | 95 | outputs['total_loss'] = sum(outputs['losses'].values()) 96 | outputs['nsamples'] = sample['nsamples'] 97 | outputs = utils.tensors_to_scalars(outputs) 98 | if batch_idx < hparams['num_valid_plots']: 99 | # model_out = self.model( 100 | # txt_tokens, spk_embed=spk_embed, mel2ph=None, f0=None, uv=None, energy=None, ref_mels=None, infer=True) 101 | # self.plot_mel(batch_idx, model_out['mel_out'], model_out['fs2_mel'], name=f'diffspeech_vs_fs2_{batch_idx}') 102 | model_out = self.model( 103 | txt_tokens, spk_embed=spk_embed, mel2ph=mel2ph, f0=f0, uv=uv, energy=energy, ref_mels=None, infer=True) 104 | gt_f0 = denorm_f0(sample['f0'], sample['uv'], hparams) 105 | self.plot_wav(batch_idx, sample['mels'], model_out['mel_out'], is_mel=True, gt_f0=gt_f0, f0=model_out.get('f0_denorm')) 106 | self.plot_mel(batch_idx, sample['mels'], model_out['mel_out']) 107 | return outputs 108 | 109 | ############ 110 | # validation plots 111 | ############ 112 | def plot_wav(self, batch_idx, gt_wav, wav_out, is_mel=False, gt_f0=None, f0=None, name=None): 113 | gt_wav = gt_wav[0].cpu().numpy() 114 | wav_out = wav_out[0].cpu().numpy() 115 | gt_f0 = gt_f0[0].cpu().numpy() 116 | f0 = f0[0].cpu().numpy() if f0 is not None else None 117 | if is_mel: 118 | gt_wav = self.vocoder.spec2wav(gt_wav, f0=gt_f0) 119 | wav_out = 
self.vocoder.spec2wav(wav_out, f0=f0) 120 | self.logger.experiment.add_audio(f'gt_{batch_idx}', gt_wav, sample_rate=hparams['audio_sample_rate'], global_step=self.global_step) 121 | self.logger.experiment.add_audio(f'wav_{batch_idx}', wav_out, sample_rate=hparams['audio_sample_rate'], global_step=self.global_step) 122 | 123 | -------------------------------------------------------------------------------- /usr/task.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import utils 4 | from .diff.diffusion import GaussianDiffusion 5 | from .diff.net import DiffNet 6 | from tasks.tts.fs2 import FastSpeech2Task 7 | from utils.hparams import hparams 8 | 9 | 10 | DIFF_DECODERS = { 11 | 'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']), 12 | } 13 | 14 | 15 | class DiffFsTask(FastSpeech2Task): 16 | def build_tts_model(self): 17 | mel_bins = hparams['audio_num_mel_bins'] 18 | self.model = GaussianDiffusion( 19 | phone_encoder=self.phone_encoder, 20 | out_dims=mel_bins, denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 21 | timesteps=hparams['timesteps'], 22 | loss_type=hparams['diff_loss_type'], 23 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 24 | ) 25 | 26 | def run_model(self, model, sample, return_output=False, infer=False): 27 | txt_tokens = sample['txt_tokens'] # [B, T_t] 28 | target = sample['mels'] # [B, T_s, 80] 29 | mel2ph = sample['mel2ph'] # [B, T_s] 30 | f0 = sample['f0'] 31 | uv = sample['uv'] 32 | energy = sample['energy'] 33 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 34 | if hparams['pitch_type'] == 'cwt': 35 | cwt_spec = sample[f'cwt_spec'] 36 | f0_mean = sample['f0_mean'] 37 | f0_std = sample['f0_std'] 38 | sample['f0_cwt'] = f0 = model.cwt2f0_norm(cwt_spec, f0_mean, f0_std, mel2ph) 39 | 40 | output = model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, 41 | ref_mels=target, f0=f0, uv=uv, energy=energy, infer=infer) 42 | 43 | losses = {} 44 | if 'diff_loss' in output: 45 | losses['mel'] = output['diff_loss'] 46 | self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) 47 | if hparams['use_pitch_embed']: 48 | self.add_pitch_loss(output, sample, losses) 49 | if hparams['use_energy_embed']: 50 | self.add_energy_loss(output['energy_pred'], energy, losses) 51 | if not return_output: 52 | return losses 53 | else: 54 | return losses, output 55 | 56 | def _training_step(self, sample, batch_idx, _): 57 | log_outputs = self.run_model(self.model, sample) 58 | total_loss = sum([v for v in log_outputs.values() if isinstance(v, torch.Tensor) and v.requires_grad]) 59 | log_outputs['batch_size'] = sample['txt_tokens'].size()[0] 60 | log_outputs['lr'] = self.scheduler.get_lr()[0] 61 | return total_loss, log_outputs 62 | 63 | def validation_step(self, sample, batch_idx): 64 | outputs = {} 65 | outputs['losses'] = {} 66 | outputs['losses'], model_out = self.run_model(self.model, sample, return_output=True, infer=False) 67 | outputs['total_loss'] = sum(outputs['losses'].values()) 68 | outputs['nsamples'] = sample['nsamples'] 69 | outputs = utils.tensors_to_scalars(outputs) 70 | if batch_idx < hparams['num_valid_plots']: 71 | _, model_out = self.run_model(self.model, sample, return_output=True, infer=True) 72 | self.plot_mel(batch_idx, sample['mels'], model_out['mel_out']) 73 | return outputs 74 | 75 | def build_scheduler(self, optimizer): 76 | return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5) 77 | 78 | def 
optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx): 79 | if optimizer is None: 80 | return 81 | optimizer.step() 82 | optimizer.zero_grad() 83 | if self.scheduler is not None: 84 | self.scheduler.step(self.global_step // hparams['accumulate_grad_batches']) 85 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import matplotlib 3 | 4 | matplotlib.use('Agg') 5 | import librosa 6 | import librosa.filters 7 | import numpy as np 8 | from scipy import signal 9 | from scipy.io import wavfile 10 | 11 | 12 | def save_wav(wav, path, sr, norm=False): 13 | if norm: 14 | wav = wav / np.abs(wav).max() 15 | wav *= 32767 16 | # proposed by @dsmiller 17 | wavfile.write(path, sr, wav.astype(np.int16)) 18 | 19 | 20 | def get_hop_size(hparams): 21 | hop_size = hparams['hop_size'] 22 | if hop_size is None: 23 | assert hparams['frame_shift_ms'] is not None 24 | hop_size = int(hparams['frame_shift_ms'] / 1000 * hparams['audio_sample_rate']) 25 | return hop_size 26 | 27 | 28 | ########################################################################################### 29 | def _stft(y, hparams): 30 | return librosa.stft(y=y, n_fft=hparams['fft_size'], hop_length=get_hop_size(hparams), 31 | win_length=hparams['win_size'], pad_mode='constant') 32 | 33 | 34 | def _istft(y, hparams): 35 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams['win_size']) 36 | 37 | 38 | def librosa_pad_lr(x, fsize, fshift, pad_sides=1): 39 | '''compute right padding (final frame) or both sides padding (first and final frames) 40 | ''' 41 | assert pad_sides in (1, 2) 42 | # return int(fsize // 2) 43 | pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] 44 | if pad_sides == 1: 45 | return 0, pad 46 | else: 47 | return pad // 2, pad // 2 + pad % 2 48 | 49 | 50 | # Conversions 51 | def amp_to_db(x): 52 | return 20 * np.log10(np.maximum(1e-5, x)) 53 | 54 | 55 | def normalize(S, hparams): 56 | return (S - hparams['min_level_db']) / -hparams['min_level_db'] 57 | -------------------------------------------------------------------------------- /utils/cwt.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from pycwt import wavelet 4 | from scipy.interpolate import interp1d 5 | 6 | 7 | def load_wav(wav_file, sr): 8 | wav, _ = librosa.load(wav_file, sr=sr, mono=True) 9 | return wav 10 | 11 | 12 | def convert_continuos_f0(f0): 13 | '''CONVERT F0 TO CONTINUOUS F0 14 | Args: 15 | f0 (ndarray): original f0 sequence with the shape (T) 16 | Return: 17 | (ndarray): continuous f0 with the shape (T) 18 | ''' 19 | # get uv information as binary 20 | f0 = np.copy(f0) 21 | uv = np.float32(f0 != 0) 22 | 23 | # get start and end of f0 24 | if (f0 == 0).all(): 25 | print("| all of the f0 values are 0.") 26 | return uv, f0 27 | start_f0 = f0[f0 != 0][0] 28 | end_f0 = f0[f0 != 0][-1] 29 | 30 | # padding start and end of f0 sequence 31 | start_idx = np.where(f0 == start_f0)[0][0] 32 | end_idx = np.where(f0 == end_f0)[0][-1] 33 | f0[:start_idx] = start_f0 34 | f0[end_idx:] = end_f0 35 | 36 | # get non-zero frame index 37 | nz_frames = np.where(f0 != 0)[0] 38 | 39 | # perform linear interpolation 40 | f = interp1d(nz_frames, f0[nz_frames]) 41 | cont_f0 = f(np.arange(0, f0.shape[0])) 42 | 43 | return uv, cont_f0 44 | 45 | 46 | def get_cont_lf0(f0, frame_period=5.0): 47 | uv, cont_f0_lpf = 
convert_continuos_f0(f0) 48 | # cont_f0_lpf = low_pass_filter(cont_f0_lpf, int(1.0 / (frame_period * 0.001)), cutoff=20) 49 | cont_lf0_lpf = np.log(cont_f0_lpf) 50 | return uv, cont_lf0_lpf 51 | 52 | 53 | def get_lf0_cwt(lf0): 54 | ''' 55 | input: 56 | signal of shape (N) 57 | output: 58 | Wavelet_lf0 of shape(10, N), scales of shape(10) 59 | ''' 60 | mother = wavelet.MexicanHat() 61 | dt = 0.005 62 | dj = 1 63 | s0 = dt * 2 64 | J = 9 65 | 66 | Wavelet_lf0, scales, _, _, _, _ = wavelet.cwt(np.squeeze(lf0), dt, dj, s0, J, mother) 67 | # Wavelet.shape => (J + 1, len(lf0)) 68 | Wavelet_lf0 = np.real(Wavelet_lf0).T 69 | return Wavelet_lf0, scales 70 | 71 | 72 | def norm_scale(Wavelet_lf0): 73 | Wavelet_lf0_norm = np.zeros((Wavelet_lf0.shape[0], Wavelet_lf0.shape[1])) 74 | mean = Wavelet_lf0.mean(0)[None, :] 75 | std = Wavelet_lf0.std(0)[None, :] 76 | Wavelet_lf0_norm = (Wavelet_lf0 - mean) / std 77 | return Wavelet_lf0_norm, mean, std 78 | 79 | 80 | def normalize_cwt_lf0(f0, mean, std): 81 | uv, cont_lf0_lpf = get_cont_lf0(f0) 82 | cont_lf0_norm = (cont_lf0_lpf - mean) / std 83 | Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_norm) 84 | Wavelet_lf0_norm, _, _ = norm_scale(Wavelet_lf0) 85 | 86 | return Wavelet_lf0_norm 87 | 88 | 89 | def get_lf0_cwt_norm(f0s, mean, std): 90 | uvs = list() 91 | cont_lf0_lpfs = list() 92 | cont_lf0_lpf_norms = list() 93 | Wavelet_lf0s = list() 94 | Wavelet_lf0s_norm = list() 95 | scaless = list() 96 | 97 | means = list() 98 | stds = list() 99 | for f0 in f0s: 100 | uv, cont_lf0_lpf = get_cont_lf0(f0) 101 | cont_lf0_lpf_norm = (cont_lf0_lpf - mean) / std 102 | 103 | Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm) # [560,10] 104 | Wavelet_lf0_norm, mean_scale, std_scale = norm_scale(Wavelet_lf0) # [560,10],[1,10],[1,10] 105 | 106 | Wavelet_lf0s_norm.append(Wavelet_lf0_norm) 107 | uvs.append(uv) 108 | cont_lf0_lpfs.append(cont_lf0_lpf) 109 | cont_lf0_lpf_norms.append(cont_lf0_lpf_norm) 110 | Wavelet_lf0s.append(Wavelet_lf0) 111 | scaless.append(scales) 112 | means.append(mean_scale) 113 | stds.append(std_scale) 114 | 115 | return Wavelet_lf0s_norm, scaless, means, stds 116 | 117 | 118 | def inverse_cwt_torch(Wavelet_lf0, scales): 119 | import torch 120 | b = ((torch.arange(0, len(scales)).float().to(Wavelet_lf0.device)[None, None, :] + 1 + 2.5) ** (-2.5)) 121 | lf0_rec = Wavelet_lf0 * b 122 | lf0_rec_sum = lf0_rec.sum(-1) 123 | lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdim=True)) / lf0_rec_sum.std(-1, keepdim=True) 124 | return lf0_rec_sum 125 | 126 | 127 | def inverse_cwt(Wavelet_lf0, scales): 128 | b = ((np.arange(0, len(scales))[None, None, :] + 1 + 2.5) ** (-2.5)) 129 | lf0_rec = Wavelet_lf0 * b 130 | lf0_rec_sum = lf0_rec.sum(-1) 131 | lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdims=True)) / lf0_rec_sum.std(-1, keepdims=True) 132 | return lf0_rec_sum 133 | 134 | 135 | def cwt2f0(cwt_spec, mean, std, cwt_scales): 136 | assert len(mean.shape) == 1 and len(std.shape) == 1 and len(cwt_spec.shape) == 3 137 | import torch 138 | if isinstance(cwt_spec, torch.Tensor): 139 | f0 = inverse_cwt_torch(cwt_spec, cwt_scales) 140 | f0 = f0 * std[:, None] + mean[:, None] 141 | f0 = f0.exp() # [B, T] 142 | else: 143 | f0 = inverse_cwt(cwt_spec, cwt_scales) 144 | f0 = f0 * std[:, None] + mean[:, None] 145 | f0 = np.exp(f0) # [B, T] 146 | return f0 147 | -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import os 3 | import yaml 4 | 5 | global_print_hparams = True 6 | hparams = {} 7 | 8 | 9 | class Args: 10 | def __init__(self, **kwargs): 11 | for k, v in kwargs.items(): 12 | self.__setattr__(k, v) 13 | 14 | 15 | def override_config(old_config: dict, new_config: dict): 16 | for k, v in new_config.items(): 17 | if isinstance(v, dict) and k in old_config: 18 | override_config(old_config[k], new_config[k]) 19 | else: 20 | old_config[k] = v 21 | 22 | 23 | def set_hparams(config='', exp_name='', hparams_str='', print_hparams=True, global_hparams=True): 24 | if config == '': 25 | parser = argparse.ArgumentParser(description='neural music') 26 | parser.add_argument('--config', type=str, default='', 27 | help='location of the data corpus') 28 | parser.add_argument('--exp_name', type=str, default='', help='exp_name') 29 | parser.add_argument('--hparams', type=str, default='', 30 | help='location of the data corpus') 31 | parser.add_argument('--infer', action='store_true', help='infer') 32 | parser.add_argument('--validate', action='store_true', help='validate') 33 | parser.add_argument('--reset', action='store_true', help='reset hparams') 34 | parser.add_argument('--debug', action='store_true', help='debug') 35 | args, unknown = parser.parse_known_args() 36 | else: 37 | args = Args(config=config, exp_name=exp_name, hparams=hparams_str, 38 | infer=False, validate=False, reset=False, debug=False) 39 | args_work_dir = '' 40 | if args.exp_name != '': 41 | args.work_dir = args.exp_name 42 | args_work_dir = f'checkpoints/{args.work_dir}' 43 | 44 | config_chains = [] 45 | loaded_config = set() 46 | 47 | def load_config(config_fn): # deep first 48 | with open(config_fn) as f: 49 | hparams_ = yaml.safe_load(f) 50 | loaded_config.add(config_fn) 51 | if 'base_config' in hparams_: 52 | ret_hparams = {} 53 | if not isinstance(hparams_['base_config'], list): 54 | hparams_['base_config'] = [hparams_['base_config']] 55 | for c in hparams_['base_config']: 56 | if c not in loaded_config: 57 | if c.startswith('.'): 58 | c = f'{os.path.dirname(config_fn)}/{c}' 59 | c = os.path.normpath(c) 60 | override_config(ret_hparams, load_config(c)) 61 | override_config(ret_hparams, hparams_) 62 | else: 63 | ret_hparams = hparams_ 64 | config_chains.append(config_fn) 65 | return ret_hparams 66 | 67 | global hparams 68 | assert args.config != '' or args_work_dir != '' 69 | saved_hparams = {} 70 | if args_work_dir != 'checkpoints/': 71 | ckpt_config_path = f'{args_work_dir}/config.yaml' 72 | if os.path.exists(ckpt_config_path): 73 | try: 74 | with open(ckpt_config_path) as f: 75 | saved_hparams.update(yaml.safe_load(f)) 76 | except: 77 | pass 78 | if args.config == '': 79 | args.config = ckpt_config_path 80 | 81 | hparams_ = {} 82 | 83 | hparams_.update(load_config(args.config)) 84 | 85 | if not args.reset: 86 | hparams_.update(saved_hparams) 87 | hparams_['work_dir'] = args_work_dir 88 | 89 | if args.hparams != "": 90 | for new_hparam in args.hparams.split(","): 91 | k, v = new_hparam.split("=") 92 | if v in ['True', 'False'] or type(hparams_[k]) == bool: 93 | hparams_[k] = eval(v) 94 | else: 95 | hparams_[k] = type(hparams_[k])(v) 96 | 97 | if args_work_dir != '' and (not os.path.exists(ckpt_config_path) or args.reset) and not args.infer: 98 | os.makedirs(hparams_['work_dir'], exist_ok=True) 99 | with open(ckpt_config_path, 'w') as f: 100 | yaml.safe_dump(hparams_, f) 101 | 102 | hparams_['infer'] = args.infer 103 | hparams_['debug'] = args.debug 104 | hparams_['validate'] = args.validate 105 | global global_print_hparams 106 | 
if global_hparams: 107 | hparams.clear() 108 | hparams.update(hparams_) 109 | 110 | if print_hparams and global_print_hparams and global_hparams: 111 | print('| Hparams chains: ', config_chains) 112 | print('| Hparams: ') 113 | for i, (k, v) in enumerate(sorted(hparams_.items())): 114 | print(f"\033[;33;m{k}\033[0m: {v}, ", end="\n" if i % 5 == 4 else "") 115 | print("") 116 | global_print_hparams = False 117 | # print(hparams_.keys()) 118 | if hparams.get('exp_name') is None: 119 | hparams['exp_name'] = args.exp_name 120 | if hparams_.get('exp_name') is None: 121 | hparams_['exp_name'] = args.exp_name 122 | return hparams_ 123 | -------------------------------------------------------------------------------- /utils/indexed_datasets.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | 7 | class IndexedDataset: 8 | def __init__(self, path, num_cache=1): 9 | super().__init__() 10 | self.path = path 11 | self.data_file = None 12 | self.data_offsets = np.load(f"{path}.idx", allow_pickle=True).item()['offsets'] 13 | self.data_file = open(f"{path}.data", 'rb', buffering=-1) 14 | self.cache = [] 15 | self.num_cache = num_cache 16 | 17 | def check_index(self, i): 18 | if i < 0 or i >= len(self.data_offsets) - 1: 19 | raise IndexError('index out of range') 20 | 21 | def __del__(self): 22 | if self.data_file: 23 | self.data_file.close() 24 | 25 | def __getitem__(self, i): 26 | self.check_index(i) 27 | if self.num_cache > 0: 28 | for c in self.cache: 29 | if c[0] == i: 30 | return c[1] 31 | self.data_file.seek(self.data_offsets[i]) 32 | b = self.data_file.read(self.data_offsets[i + 1] - self.data_offsets[i]) 33 | item = pickle.loads(b) 34 | if self.num_cache > 0: 35 | self.cache = [(i, deepcopy(item))] + self.cache[:-1] 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.data_offsets) - 1 40 | 41 | class IndexedDatasetBuilder: 42 | def __init__(self, path): 43 | self.path = path 44 | self.out_file = open(f"{path}.data", 'wb') 45 | self.byte_offsets = [0] 46 | 47 | def add_item(self, item): 48 | s = pickle.dumps(item) 49 | bytes = self.out_file.write(s) 50 | self.byte_offsets.append(self.byte_offsets[-1] + bytes) 51 | 52 | def finalize(self): 53 | self.out_file.close() 54 | np.save(open(f"{self.path}.idx", 'wb'), {'offsets': self.byte_offsets}) 55 | 56 | 57 | if __name__ == "__main__": 58 | import random 59 | from tqdm import tqdm 60 | ds_path = '/tmp/indexed_ds_example' 61 | size = 100 62 | items = [{"a": np.random.normal(size=[10000, 10]), 63 | "b": np.random.normal(size=[10000, 10])} for i in range(size)] 64 | builder = IndexedDatasetBuilder(ds_path) 65 | for i in tqdm(range(size)): 66 | builder.add_item(items[i]) 67 | builder.finalize() 68 | ds = IndexedDataset(ds_path) 69 | for i in tqdm(range(10000)): 70 | idx = random.randint(0, size - 1) 71 | assert (ds[idx]['a'] == items[idx]['a']).all() 72 | -------------------------------------------------------------------------------- /utils/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from multiprocessing import Queue, Process 4 | 5 | 6 | def chunked_worker(worker_id, map_func, args, results_queue=None, init_ctx_func=None): 7 | ctx = init_ctx_func(worker_id) if init_ctx_func is not None else None 8 | for job_idx, arg in args: 9 | try: 10 | if ctx is not None: 11 | res = map_func(*arg, ctx=ctx) 12 | else: 13 | res = map_func(*arg) 
14 | results_queue.put((job_idx, res)) 15 | except: 16 | traceback.print_exc() 17 | results_queue.put((job_idx, None)) 18 | 19 | def chunked_multiprocess_run(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, q_max_size=1000): 20 | args = zip(range(len(args)), args) 21 | args = list(args) 22 | n_jobs = len(args) 23 | if num_workers is None: 24 | num_workers = int(os.getenv('N_PROC', os.cpu_count())) 25 | results_queues = [] 26 | if ordered: 27 | for i in range(num_workers): 28 | results_queues.append(Queue(maxsize=q_max_size // num_workers)) 29 | else: 30 | results_queue = Queue(maxsize=q_max_size) 31 | for i in range(num_workers): 32 | results_queues.append(results_queue) 33 | workers = [] 34 | for i in range(num_workers): 35 | args_worker = args[i::num_workers] 36 | p = Process(target=chunked_worker, args=( 37 | i, map_func, args_worker, results_queues[i], init_ctx_func), daemon=True) 38 | workers.append(p) 39 | p.start() 40 | for n_finished in range(n_jobs): 41 | results_queue = results_queues[n_finished % num_workers] 42 | job_idx, res = results_queue.get() 43 | assert job_idx == n_finished or not ordered, (job_idx, n_finished) 44 | yield res 45 | for w in workers: 46 | w.join() 47 | w.close() 48 | -------------------------------------------------------------------------------- /utils/pitch_utils.py: -------------------------------------------------------------------------------- 1 | ######### 2 | # world 3 | ########## 4 | import librosa 5 | import numpy as np 6 | import torch 7 | 8 | gamma = 0 9 | mcepInput = 3 # 0 for dB, 3 for magnitude 10 | alpha = 0.45 11 | en_floor = 10 ** (-80 / 20) 12 | FFT_SIZE = 2048 13 | 14 | 15 | f0_bin = 256 16 | f0_max = 1100.0 17 | f0_min = 50.0 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 20 | 21 | 22 | def f0_to_coarse(f0): 23 | is_torch = isinstance(f0, torch.Tensor) 24 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 25 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 30 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) 31 | return f0_coarse 32 | 33 | 34 | def norm_f0(f0, uv, hparams): 35 | is_torch = isinstance(f0, torch.Tensor) 36 | if hparams['pitch_norm'] == 'standard': 37 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 38 | if hparams['pitch_norm'] == 'log': 39 | f0 = torch.log2(f0) if is_torch else np.log2(f0) 40 | if uv is not None and hparams['use_uv']: 41 | f0[uv > 0] = 0 42 | return f0 43 | 44 | 45 | def norm_interp_f0(f0, hparams): 46 | is_torch = isinstance(f0, torch.Tensor) 47 | if is_torch: 48 | device = f0.device 49 | f0 = f0.data.cpu().numpy() 50 | uv = f0 == 0 51 | f0 = norm_f0(f0, uv, hparams) 52 | if sum(uv) == len(f0): 53 | f0[uv] = 0 54 | elif sum(uv) > 0: 55 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 56 | uv = torch.FloatTensor(uv) 57 | f0 = torch.FloatTensor(f0) 58 | if is_torch: 59 | f0 = f0.to(device) 60 | return f0, uv 61 | 62 | 63 | def denorm_f0(f0, uv, hparams, pitch_padding=None, min=None, max=None): 64 | if hparams['pitch_norm'] == 'standard': 65 | f0 = f0 * hparams['f0_std'] + hparams['f0_mean'] 66 | if hparams['pitch_norm'] == 'log': 67 | f0 = 2 ** f0 68 | if min is not None: 69 | f0 = f0.clamp(min=min) 70 | if max is not 
None: 71 | f0 = f0.clamp(max=max) 72 | if uv is not None and hparams['use_uv']: 73 | f0[uv > 0] = 0 74 | if pitch_padding is not None: 75 | f0[pitch_padding] = 0 76 | return f0 77 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | 5 | LINE_COLORS = ['w', 'r', 'y', 'cyan', 'm', 'b', 'lime'] 6 | 7 | 8 | def spec_to_figure(spec, vmin=None, vmax=None): 9 | if isinstance(spec, torch.Tensor): 10 | spec = spec.cpu().numpy() 11 | fig = plt.figure(figsize=(12, 6)) 12 | plt.pcolor(spec.T, vmin=vmin, vmax=vmax) 13 | return fig 14 | 15 | 16 | def spec_f0_to_figure(spec, f0s, figsize=None): 17 | max_y = spec.shape[1] 18 | if isinstance(spec, torch.Tensor): 19 | spec = spec.detach().cpu().numpy() 20 | f0s = {k: f0.detach().cpu().numpy() for k, f0 in f0s.items()} 21 | f0s = {k: f0 / 10 for k, f0 in f0s.items()} 22 | fig = plt.figure(figsize=(12, 6) if figsize is None else figsize) 23 | plt.pcolor(spec.T) 24 | for i, (k, f0) in enumerate(f0s.items()): 25 | plt.plot(f0.clip(0, max_y), label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.8) 26 | plt.legend() 27 | return fig 28 | 29 | 30 | def dur_to_figure(dur_gt, dur_pred, txt): 31 | dur_gt = dur_gt.long().cpu().numpy() 32 | dur_pred = dur_pred.long().cpu().numpy() 33 | dur_gt = np.cumsum(dur_gt) 34 | dur_pred = np.cumsum(dur_pred) 35 | fig = plt.figure(figsize=(12, 6)) 36 | for i in range(len(dur_gt)): 37 | shift = (i % 8) + 1 38 | plt.text(dur_gt[i], shift, txt[i]) 39 | plt.text(dur_pred[i], 10 + shift, txt[i]) 40 | plt.vlines(dur_gt[i], 0, 10, colors='b') # blue is gt 41 | plt.vlines(dur_pred[i], 10, 20, colors='r') # red is pred 42 | return fig 43 | 44 | 45 | def f0_to_figure(f0_gt, f0_cwt=None, f0_pred=None): 46 | fig = plt.figure() 47 | f0_gt = f0_gt.cpu().numpy() 48 | plt.plot(f0_gt, color='r', label='gt') 49 | if f0_cwt is not None: 50 | f0_cwt = f0_cwt.cpu().numpy() 51 | plt.plot(f0_cwt, color='b', label='cwt') 52 | if f0_pred is not None: 53 | f0_pred = f0_pred.cpu().numpy() 54 | plt.plot(f0_pred, color='green', label='pred') 55 | plt.legend() 56 | return fig 57 | -------------------------------------------------------------------------------- /utils/training_utils.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import hparams 2 | 3 | 4 | class RSQRTSchedule(object): 5 | def __init__(self, optimizer): 6 | super().__init__() 7 | self.optimizer = optimizer 8 | self.constant_lr = hparams['lr'] 9 | self.warmup_updates = hparams['warmup_updates'] 10 | self.hidden_size = hparams['hidden_size'] 11 | self.lr = hparams['lr'] 12 | for param_group in optimizer.param_groups: 13 | param_group['lr'] = self.lr 14 | self.step(0) 15 | 16 | def step(self, num_updates): 17 | constant_lr = self.constant_lr 18 | warmup = min(num_updates / self.warmup_updates, 1.0) 19 | rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 20 | rsqrt_hidden = self.hidden_size ** -0.5 21 | self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) 22 | for param_group in self.optimizer.param_groups: 23 | param_group['lr'] = self.lr 24 | return self.lr 25 | 26 | def get_lr(self): 27 | return self.optimizer.param_groups[0]['lr'] 28 | -------------------------------------------------------------------------------- /utils/tts_utils.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from collections import defaultdict 4 | 5 | 6 | def make_positions(tensor, padding_idx): 7 | """Replace non-padding symbols with their position numbers. 8 | Position numbers begin at padding_idx+1. Padding symbols are ignored. 9 | """ 10 | # The series of casts and type-conversions here are carefully 11 | # balanced to both work with ONNX export and XLA. In particular XLA 12 | # prefers ints, cumsum defaults to output longs, and ONNX doesn't know 13 | # how to handle the dtype kwarg in cumsum. 14 | mask = tensor.ne(padding_idx).int() 15 | return ( 16 | torch.cumsum(mask, dim=1).type_as(mask) * mask 17 | ).long() + padding_idx 18 | 19 | 20 | def softmax(x, dim): 21 | return F.softmax(x, dim=dim, dtype=torch.float32) 22 | -------------------------------------------------------------------------------- /vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from vocoders import hifigan 2 | -------------------------------------------------------------------------------- /vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from modules.hifigan.hifigan import HifiGanGenerator 11 | from utils.hparams import hparams, set_hparams 12 | from vocoders.base_vocoder import register_vocoder 13 | from vocoders.pwg import PWG 14 | from vocoders.vocoder_utils import denoise 15 | 16 | 17 | def load_model(config_path, checkpoint_path): 18 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | state = ckpt_dict["state_dict"]["model_gen"] 23 | elif '.json' in config_path: 24 | config = json.load(open(config_path, 'r')) 25 | state = ckpt_dict["generator"] 26 | 27 | model = HifiGanGenerator(config) 28 | model.load_state_dict(state, strict=True) 29 | model.remove_weight_norm() 30 | model = model.eval().to(device) 31 | print(f"| Loaded model parameters from {checkpoint_path}.") 32 | print(f"| HifiGAN device: {device}.") 33 | return model, config, device 34 | 35 | 36 | total_time = 0 37 | 38 | 39 | @register_vocoder 40 | class HifiGAN(PWG): 41 | def 
__init__(self): 42 | base_dir = hparams['vocoder_ckpt'] 43 | config_path = f'{base_dir}/config.yaml' 44 | if os.path.exists(config_path): 45 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 46 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 47 | print('| load HifiGAN: ', ckpt) 48 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 49 | else: 50 | config_path = f'{base_dir}/config.json' 51 | ckpt = f'{base_dir}/generator_v1' 52 | if os.path.exists(config_path): 53 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 54 | 55 | def spec2wav(self, mel, **kwargs): 56 | device = self.device 57 | with torch.no_grad(): 58 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 59 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 60 | f0 = kwargs.get('f0') 61 | if f0 is not None and hparams.get('use_nsf'): 62 | f0 = torch.FloatTensor(f0[None, :]).to(device) 63 | y = self.model(c, f0).view(-1) 64 | else: 65 | y = self.model(c).view(-1) 66 | wav_out = y.cpu().numpy() 67 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 68 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 69 | return wav_out 70 | 71 | # @staticmethod 72 | # def wav2spec(wav_fn, **kwargs): 73 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 74 | # wav_torch = torch.FloatTensor(wav)[None, :] 75 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 76 | # return wav, mel.T 77 | -------------------------------------------------------------------------------- /vocoders/pwg.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import librosa 4 | import torch 5 | import yaml 6 | from sklearn.preprocessing import StandardScaler 7 | from torch import nn 8 | from modules.parallel_wavegan.models import ParallelWaveGANGenerator 9 | from modules.parallel_wavegan.utils import read_hdf5 10 | from utils.hparams import hparams 11 | from utils.pitch_utils import f0_to_coarse 12 | from vocoders.base_vocoder import BaseVocoder, register_vocoder 13 | import numpy as np 14 | 15 | 16 | def load_pwg_model(config_path, checkpoint_path, stats_path): 17 | # load config 18 | with open(config_path) as f: 19 | config = yaml.load(f, Loader=yaml.Loader) 20 | 21 | # setup 22 | if torch.cuda.is_available(): 23 | device = torch.device("cuda") 24 | else: 25 | device = torch.device("cpu") 26 | model = ParallelWaveGANGenerator(**config["generator_params"]) 27 | 28 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 29 | if 'state_dict' not in ckpt_dict: # official vocoder 30 | model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"]["generator"]) 31 | scaler = StandardScaler() 32 | if config["format"] == "hdf5": 33 | scaler.mean_ = read_hdf5(stats_path, "mean") 34 | scaler.scale_ = read_hdf5(stats_path, "scale") 35 | elif config["format"] == "npy": 36 | scaler.mean_ = np.load(stats_path)[0] 37 | scaler.scale_ = np.load(stats_path)[1] 38 | else: 39 | raise ValueError("support only hdf5 or npy format.") 40 | else: # custom PWG vocoder 41 | fake_task = nn.Module() 42 | fake_task.model_gen = model 43 | fake_task.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["state_dict"], strict=False) 44 | scaler = None 45 | 46 | model.remove_weight_norm() 47 | model = model.eval().to(device) 48 | print(f"| Loaded model parameters from {checkpoint_path}.") 49 | 
print(f"| PWG device: {device}.") 50 | return model, scaler, config, device 51 | 52 | 53 | @register_vocoder 54 | class PWG(BaseVocoder): 55 | def __init__(self): 56 | if hparams['vocoder_ckpt'] == '': # load LJSpeech PWG pretrained model 57 | base_dir = 'wavegan_pretrained' 58 | ckpts = glob.glob(f'{base_dir}/checkpoint-*steps.pkl') 59 | ckpt = sorted(ckpts, key= 60 | lambda x: int(re.findall(f'{base_dir}/checkpoint-(\d+)steps.pkl', x)[0]))[-1] 61 | config_path = f'{base_dir}/config.yaml' 62 | print('| load PWG: ', ckpt) 63 | self.model, self.scaler, self.config, self.device = load_pwg_model( 64 | config_path=config_path, 65 | checkpoint_path=ckpt, 66 | stats_path=f'{base_dir}/stats.h5', 67 | ) 68 | else: 69 | base_dir = hparams['vocoder_ckpt'] 70 | print(base_dir) 71 | config_path = f'{base_dir}/config.yaml' 72 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 73 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 74 | print('| load PWG: ', ckpt) 75 | self.scaler = None 76 | self.model, _, self.config, self.device = load_pwg_model( 77 | config_path=config_path, 78 | checkpoint_path=ckpt, 79 | stats_path=f'{base_dir}/stats.h5', 80 | ) 81 | 82 | def spec2wav(self, mel, **kwargs): 83 | # start generation 84 | config = self.config 85 | device = self.device 86 | pad_size = (config["generator_params"]["aux_context_window"], 87 | config["generator_params"]["aux_context_window"]) 88 | c = mel 89 | if self.scaler is not None: 90 | c = self.scaler.transform(c) 91 | 92 | with torch.no_grad(): 93 | z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device) 94 | c = np.pad(c, (pad_size, (0, 0)), "edge") 95 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 96 | p = kwargs.get('f0') 97 | if p is not None: 98 | p = f0_to_coarse(p) 99 | p = np.pad(p, (pad_size,), "edge") 100 | p = torch.LongTensor(p[None, :]).to(device) 101 | y = self.model(z, c, p).view(-1) 102 | wav_out = y.cpu().numpy() 103 | return wav_out 104 | 105 | @staticmethod 106 | def wav2spec(wav_fn, return_linear=False): 107 | from data_gen.tts.data_gen_utils import process_utterance 108 | res = process_utterance( 109 | wav_fn, fft_size=hparams['fft_size'], 110 | hop_size=hparams['hop_size'], 111 | win_length=hparams['win_size'], 112 | num_mels=hparams['audio_num_mel_bins'], 113 | fmin=hparams['fmin'], 114 | fmax=hparams['fmax'], 115 | sample_rate=hparams['audio_sample_rate'], 116 | loud_norm=hparams['loud_norm'], 117 | min_level_db=hparams['min_level_db'], 118 | return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10))) 119 | if return_linear: 120 | return res[0], res[1].T, res[2].T # [T, 80], [T, n_fft] 121 | else: 122 | return res[0], res[1].T 123 | 124 | @staticmethod 125 | def wav2mfcc(wav_fn): 126 | fft_size = hparams['fft_size'] 127 | hop_size = hparams['hop_size'] 128 | win_length = hparams['win_size'] 129 | sample_rate = hparams['audio_sample_rate'] 130 | wav, _ = librosa.core.load(wav_fn, sr=sample_rate) 131 | mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, 132 | n_fft=fft_size, hop_length=hop_size, 133 | win_length=win_length, pad_mode="constant", power=1.0) 134 | mfcc_delta = librosa.feature.delta(mfcc, order=1) 135 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 136 | mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T 137 | return mfcc 138 | -------------------------------------------------------------------------------- /vocoders/vocoder_utils.py: 
-------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | from utils.hparams import hparams 4 | import numpy as np 5 | 6 | 7 | def denoise(wav, v=0.1): 8 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 9 | win_length=hparams['win_size'], pad_mode='constant') 10 | spec_m = np.abs(spec) 11 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 12 | spec_a = np.angle(spec) 13 | 14 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 15 | win_length=hparams['win_size']) 16 | --------------------------------------------------------------------------------
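A minimal, hypothetical usage sketch for vocoders.vocoder_utils.denoise, appended here for illustration only: it assumes the script is run from the repository root with the project's dependencies installed (librosa, numpy, pyyaml, plus whatever vocoders/__init__.py pulls in), and the STFT settings and the denoise strength below are illustrative values, not the project defaults. Because utils.hparams exposes hparams as a shared module-level dict, mutating it in place is enough for denoise() to pick up the values.

import numpy as np

from utils.hparams import hparams
from vocoders.vocoder_utils import denoise

# denoise() reads these three keys from the shared hparams dict (illustrative values).
hparams.update({'fft_size': 1024, 'hop_size': 256, 'win_size': 1024})

sr = 22050  # illustrative sample rate
t = np.arange(sr, dtype=np.float32) / sr
wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)              # stand-in "vocoder output"
wav += 0.01 * np.random.randn(sr).astype(np.float32)   # add a small noise floor

# Subtract a constant from the magnitude spectrogram and resynthesize with the
# original phase; a larger v removes more of the noise floor but can dull the signal.
wav_denoised = denoise(wav, v=0.01)
print(wav.shape, wav_denoised.shape)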