├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── checkpoints └── .gitkeep ├── configs ├── config_base.yaml ├── singing │ ├── base.yaml │ └── fs2.yaml └── tts │ ├── base.yaml │ ├── base_zh.yaml │ ├── fs2.yaml │ ├── hifigan.yaml │ ├── lj │ ├── base_mel2wav.yaml │ ├── base_text2mel.yaml │ ├── fs2.yaml │ ├── hifigan.yaml │ └── pwg.yaml │ └── pwg.yaml ├── data └── processed │ └── ljspeech │ ├── dict.txt │ ├── metadata_phone.csv │ ├── mfa_dict.txt │ └── phone_set.json ├── data_gen ├── singing │ └── binarize.py └── tts │ ├── base_binarizer.py │ ├── bin │ └── binarize.py │ ├── binarizer_zh.py │ ├── data_gen_utils.py │ └── txt_processors │ ├── base_text_processor.py │ ├── en.py │ ├── zh.py │ └── zh_g2pM.py ├── docs ├── README-SVS-opencpop-cascade.md ├── README-SVS-opencpop-e2e.md ├── README-SVS-opencpop-pndm.md ├── README-SVS-popcs.md ├── README-SVS.md ├── README-TTS-pndm.md └── README-TTS.md ├── inference └── svs │ ├── base_svs_infer.py │ ├── ds_cascade.py │ ├── ds_e2e.py │ ├── gradio │ ├── gradio_settings.yaml │ └── infer.py │ └── opencpop │ ├── cpop_pinyin2ph.txt │ └── map.py ├── modules ├── __init__.py ├── commons │ ├── common_layers.py │ ├── espnet_positional_embedding.py │ └── ssim.py ├── diffsinger_midi │ └── fs2.py ├── fastspeech │ ├── fs2.py │ ├── pe.py │ └── tts_modules.py ├── hifigan │ ├── hifigan.py │ └── mel_utils.py └── parallel_wavegan │ ├── __init__.py │ ├── layers │ ├── __init__.py │ ├── causal_conv.py │ ├── pqmf.py │ ├── residual_block.py │ ├── residual_stack.py │ ├── tf_layers.py │ └── upsample.py │ ├── losses │ ├── __init__.py │ └── stft_loss.py │ ├── models │ ├── __init__.py │ ├── melgan.py │ ├── parallel_wavegan.py │ └── source.py │ ├── optimizers │ ├── __init__.py │ └── radam.py │ ├── stft_loss.py │ └── utils │ ├── __init__.py │ └── utils.py ├── requirements.txt ├── requirements_2080.txt ├── requirements_3090.txt ├── resources ├── apply_form.md ├── diffspeech-fs2-1.png ├── diffspeech-fs2-2.png ├── diffspeech-fs2.png ├── model_a.png ├── model_b.png └── tfb.png ├── tasks ├── base_task.py ├── run.py └── tts │ ├── fs2.py │ ├── fs2_utils.py │ ├── pe.py │ └── tts.py ├── usr ├── .gitkeep ├── __init__.py ├── configs │ ├── base.yaml │ ├── lj_ds_beta6.yaml │ ├── lj_ds_pndm.yaml │ ├── midi │ │ ├── cascade │ │ │ └── opencs │ │ │ │ ├── aux_rel.yaml │ │ │ │ ├── ds60_rel.yaml │ │ │ │ └── opencpop_statis.yaml │ │ ├── e2e │ │ │ ├── opencpop │ │ │ │ ├── ds1000.yaml │ │ │ │ └── ds100_adj_rel.yaml │ │ │ └── popcs │ │ │ │ └── ds100_adj_rel.yaml │ │ └── pe.yaml │ ├── popcs_ds_beta6.yaml │ ├── popcs_ds_beta6_offline.yaml │ └── popcs_fs2.yaml ├── diff │ ├── candidate_decoder.py │ ├── diffusion.py │ ├── net.py │ └── shallow_diffusion_tts.py ├── diffsinger_task.py ├── diffspeech_task.py └── task.py ├── utils ├── __init__.py ├── audio.py ├── cwt.py ├── hparams.py ├── indexed_datasets.py ├── multiprocess_utils.py ├── pitch_utils.py ├── pl_utils.py ├── plot.py ├── text_encoder.py ├── text_norm.py ├── training_utils.py └── tts_utils.py └── vocoders ├── __init__.py ├── base_vocoder.py ├── hifigan.py ├── pwg.py └── vocoder_utils.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: RayeRen # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a 
single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | __pycache__/ 4 | *.sh 5 | local_tools/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jinglin Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/checkpoints/.gitkeep -------------------------------------------------------------------------------- /configs/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 
4 | infer: false # infer 5 | seed: 1234 6 | debug: false 7 | save_codes: 8 | - configs 9 | - modules 10 | - tasks 11 | - utils 12 | - usr 13 | 14 | ############# 15 | # dataset 16 | ############# 17 | ds_workers: 1 18 | test_num: 100 19 | valid_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | load_ckpt: '' 27 | save_ckpt: true 28 | save_best: false 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | max_epochs: 1000 37 | max_updates: 160000 38 | max_tokens: 31250 39 | max_sentences: 100000 40 | max_eval_tokens: -1 41 | max_eval_sentences: -1 42 | test_input_dir: '' 43 | -------------------------------------------------------------------------------- /configs/singing/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/base.yaml 3 | - configs/tts/base_zh.yaml 4 | 5 | 6 | datasets: [] 7 | test_prefixes: [] 8 | test_num: 0 9 | valid_num: 0 10 | 11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign 12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer 13 | pre_align_args: 14 | use_tone: false # for ZH 15 | forced_align: mfa 16 | use_sox: true 17 | hop_size: 128 # Hop size. 18 | fft_size: 512 # FFT size. 19 | win_size: 512 # FFT size. 20 | max_frames: 8000 21 | fmin: 50 # Minimum freq in mel basis calculation. 22 | fmax: 11025 # Maximum frequency in mel basis calculation. 23 | pitch_type: frame 24 | 25 | hidden_size: 256 26 | mel_loss: "ssim:0.5|l1:0.5" 27 | lambda_f0: 0.0 28 | lambda_uv: 0.0 29 | lambda_energy: 0.0 30 | lambda_ph_dur: 0.0 31 | lambda_sent_dur: 0.0 32 | lambda_word_dur: 0.0 33 | predictor_grad: 0.0 34 | use_spk_embed: true 35 | use_spk_id: false 36 | 37 | max_tokens: 20000 38 | max_updates: 400000 39 | num_spk: 100 40 | save_f0: true 41 | use_gt_dur: true 42 | use_gt_f0: true 43 | -------------------------------------------------------------------------------- /configs/singing/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | -------------------------------------------------------------------------------- /configs/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: configs/config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | use_tone: true # for ZH 15 | forced_align: mfa 16 | use_sox: false 17 | txt_processor: en 18 | allow_no_txt: false 19 | denoise: false 20 | binarization_args: 21 | shuffle: false 22 | with_txt: true 23 | with_wav: false 24 | with_align: true 25 | with_spk_embed: true 26 | with_f0: true 27 | with_f0cwt: true 28 | 29 | loud_norm: false 30 | endless_ds: true 31 | reset_phone_dict: true 32 | 33 | test_num: 100 34 | valid_num: 100 35 | max_frames: 1550 36 | max_input_tokens: 1550 37 | audio_num_mel_bins: 80 38 | audio_sample_rate: 22050 39 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 40 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 41 
| fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 42 | fmax: 7600 # To be increased/reduced depending on data. 43 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 44 | min_level_db: -100 45 | num_spk: 1 46 | mel_vmin: -6 47 | mel_vmax: 1.5 48 | ds_workers: 4 49 | 50 | ######### 51 | # model 52 | ######### 53 | dropout: 0.1 54 | enc_layers: 4 55 | dec_layers: 4 56 | hidden_size: 384 57 | num_heads: 2 58 | prenet_dropout: 0.5 59 | prenet_hidden_size: 256 60 | stop_token_weight: 5.0 61 | enc_ffn_kernel_size: 9 62 | dec_ffn_kernel_size: 9 63 | ffn_act: gelu 64 | ffn_padding: 'SAME' 65 | 66 | 67 | ########### 68 | # optimization 69 | ########### 70 | lr: 2.0 71 | warmup_updates: 8000 72 | optimizer_adam_beta1: 0.9 73 | optimizer_adam_beta2: 0.98 74 | weight_decay: 0 75 | clip_grad_norm: 1 76 | 77 | 78 | ########### 79 | # train and eval 80 | ########### 81 | max_tokens: 30000 82 | max_sentences: 100000 83 | max_eval_sentences: 1 84 | max_eval_tokens: 60000 85 | train_set_name: 'train' 86 | valid_set_name: 'valid' 87 | test_set_name: 'test' 88 | vocoder: pwg 89 | vocoder_ckpt: '' 90 | profile_infer: false 91 | out_wav_norm: false 92 | save_gt: false 93 | save_f0: false 94 | gen_dir_name: '' 95 | use_denoise: false 96 | -------------------------------------------------------------------------------- /configs/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | pre_align_args: 2 | txt_processor: zh_g2pM 3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer -------------------------------------------------------------------------------- /configs/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | encoder_type: fft # fft|tacotron|tacotron2|conformer 8 | encoder_K: 8 # for tacotron encoder 9 | decoder_type: fft # fft|rnn|conv|conformer 10 | use_pos_embed: true 11 | 12 | # duration 13 | predictor_hidden: -1 14 | predictor_kernel: 5 15 | predictor_layers: 2 16 | dur_predictor_kernel: 3 17 | dur_predictor_layers: 2 18 | predictor_dropout: 0.5 19 | 20 | # pitch and energy 21 | use_pitch_embed: true 22 | pitch_type: ph # frame|ph|cwt 23 | use_uv: true 24 | cwt_hidden_size: 128 25 | cwt_layers: 2 26 | cwt_loss: l1 27 | cwt_add_f0_loss: false 28 | cwt_std_scale: 0.8 29 | 30 | pitch_ar: false 31 | #pitch_embed_type: 0q 32 | pitch_loss: 'l1' # l1|l2|ssim 33 | pitch_norm: log 34 | use_energy_embed: false 35 | 36 | # reference encoder and speaker embedding 37 | use_spk_id: false 38 | use_split_spk_id: false 39 | use_spk_embed: false 40 | use_var_enc: false 41 | lambda_commit: 0.25 42 | ref_norm_layer: bn 43 | pitch_enc_hidden_stride_kernel: 44 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 45 | - 0,2,5 46 | - 0,2,5 47 | dur_enc_hidden_stride_kernel: 48 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. 
conv_hidden_size=0: use hidden_size 49 | - 0,2,3 50 | - 0,1,3 51 | 52 | 53 | # mel 54 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 55 | 56 | # loss lambda 57 | lambda_f0: 1.0 58 | lambda_uv: 1.0 59 | lambda_energy: 0.1 60 | lambda_ph_dur: 1.0 61 | lambda_sent_dur: 1.0 62 | lambda_word_dur: 1.0 63 | predictor_grad: 0.1 64 | 65 | # train and eval 66 | pretrain_fs_ckpt: '' 67 | warmup_updates: 2000 68 | max_tokens: 32000 69 | max_sentences: 100000 70 | max_eval_sentences: 1 71 | max_updates: 120000 72 | num_valid_plots: 5 73 | num_test_samples: 0 74 | test_ids: [] 75 | use_gt_dur: false 76 | use_gt_f0: false 77 | 78 | # exp 79 | dur_loss: mse # huber|mol 80 | norm_type: gn -------------------------------------------------------------------------------- /configs/tts/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/pwg.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 128 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | lambda_mel: 45.0 13 | 14 | max_samples: 8192 15 | max_sentences: 16 16 | 17 | generator_params: 18 | lr: 0.0002 # Generator's learning rate. 19 | aux_context_window: 0 # Context window size for auxiliary feature. 20 | discriminator_optimizer_params: 21 | lr: 0.0002 # Discriminator's learning rate. -------------------------------------------------------------------------------- /configs/tts/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | -------------------------------------------------------------------------------- /configs/tts/lj/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech' 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign 5 | 6 | pitch_type: cwt 7 | mel_loss: l1 8 | num_test_samples: 20 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294, 10 | 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] 11 | use_energy_embed: false 12 | test_num: 523 13 | valid_num: 348 -------------------------------------------------------------------------------- /configs/tts/lj/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/tts/lj/base_text2mel.yaml -------------------------------------------------------------------------------- /configs/tts/lj/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/hifigan.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /configs/tts/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/pwg.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /configs/tts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 
configs/tts/base.yaml 2 | task_cls: tasks.vocoder.pwg.PwgTask 3 | 4 | binarization_args: 5 | with_wav: true 6 | with_spk_embed: false 7 | with_align: false 8 | test_input_dir: '' 9 | 10 | ########### 11 | # train and eval 12 | ########### 13 | max_samples: 25600 14 | max_sentences: 5 15 | max_eval_sentences: 1 16 | max_updates: 1000000 17 | val_check_interval: 2000 18 | 19 | 20 | ########################################################### 21 | # FEATURE EXTRACTION SETTING # 22 | ########################################################### 23 | sampling_rate: 22050 # Sampling rate. 24 | fft_size: 1024 # FFT size. 25 | hop_size: 256 # Hop size. 26 | win_length: null # Window length. 27 | # If set to null, it will be the same as fft_size. 28 | window: "hann" # Window function. 29 | num_mels: 80 # Number of mel basis. 30 | fmin: 80 # Minimum freq in mel basis calculation. 31 | fmax: 7600 # Maximum frequency in mel basis calculation. 32 | format: "hdf5" # Feature file format. "npy" or "hdf5" is supported. 33 | 34 | ########################################################### 35 | # GENERATOR NETWORK ARCHITECTURE SETTING # 36 | ########################################################### 37 | generator_params: 38 | in_channels: 1 # Number of input channels. 39 | out_channels: 1 # Number of output channels. 40 | kernel_size: 3 # Kernel size of dilated convolution. 41 | layers: 30 # Number of residual block layers. 42 | stacks: 3 # Number of stacks i.e., dilation cycles. 43 | residual_channels: 64 # Number of channels in residual conv. 44 | gate_channels: 128 # Number of channels in gated conv. 45 | skip_channels: 64 # Number of channels in skip conv. 46 | aux_channels: 80 # Number of channels for auxiliary feature conv. 47 | # Must be the same as num_mels. 48 | aux_context_window: 2 # Context window size for auxiliary feature. 49 | # If set to 2, previous 2 and future 2 frames will be considered. 50 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 51 | use_weight_norm: true # Whether to use weight norm. 52 | # If set to true, it will be applied to all of the conv layers. 53 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 54 | upsample_params: # Upsampling network parameters. 55 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size. 56 | use_pitch_embed: false 57 | 58 | ########################################################### 59 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 60 | ########################################################### 61 | discriminator_params: 62 | in_channels: 1 # Number of input channels. 63 | out_channels: 1 # Number of output channels. 64 | kernel_size: 3 # Number of output channels. 65 | layers: 10 # Number of conv layers. 66 | conv_channels: 64 # Number of chnn layers. 67 | bias: true # Whether to use bias parameter in conv. 68 | use_weight_norm: true # Whether to use weight norm. 69 | # If set to true, it will be applied to all of the conv layers. 70 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 71 | nonlinear_activation_params: # Nonlinear function parameters 72 | negative_slope: 0.2 # Alpha in LeakyReLU. 73 | 74 | ########################################################### 75 | # STFT LOSS SETTING # 76 | ########################################################### 77 | stft_loss_params: 78 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
79 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 80 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 81 | window: "hann_window" # Window function for STFT-based loss 82 | use_mel_loss: false 83 | 84 | ########################################################### 85 | # ADVERSARIAL LOSS SETTING # 86 | ########################################################### 87 | lambda_adv: 4.0 # Loss balancing coefficient. 88 | 89 | ########################################################### 90 | # OPTIMIZER & SCHEDULER SETTING # 91 | ########################################################### 92 | generator_optimizer_params: 93 | lr: 0.0001 # Generator's learning rate. 94 | eps: 1.0e-6 # Generator's epsilon. 95 | weight_decay: 0.0 # Generator's weight decay coefficient. 96 | generator_scheduler_params: 97 | step_size: 200000 # Generator's scheduler step size. 98 | gamma: 0.5 # Generator's scheduler gamma. 99 | # At each step size, lr will be multiplied by this parameter. 100 | generator_grad_norm: 10 # Generator's gradient norm. 101 | discriminator_optimizer_params: 102 | lr: 0.00005 # Discriminator's learning rate. 103 | eps: 1.0e-6 # Discriminator's epsilon. 104 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 105 | discriminator_scheduler_params: 106 | step_size: 200000 # Discriminator's scheduler step size. 107 | gamma: 0.5 # Discriminator's scheduler gamma. 108 | # At each step size, lr will be multiplied by this parameter. 109 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 110 | disc_start_steps: 40000 # Number of steps to start to train discriminator. 111 | -------------------------------------------------------------------------------- /data/processed/ljspeech/dict.txt: -------------------------------------------------------------------------------- 1 | ! ! 2 | , , 3 | . . 4 | ; ; 5 | 6 | 7 | ? ? 
8 | AA0 AA0 9 | AA1 AA1 10 | AA2 AA2 11 | AE0 AE0 12 | AE1 AE1 13 | AE2 AE2 14 | AH0 AH0 15 | AH1 AH1 16 | AH2 AH2 17 | AO0 AO0 18 | AO1 AO1 19 | AO2 AO2 20 | AW0 AW0 21 | AW1 AW1 22 | AW2 AW2 23 | AY0 AY0 24 | AY1 AY1 25 | AY2 AY2 26 | B B 27 | CH CH 28 | D D 29 | DH DH 30 | EH0 EH0 31 | EH1 EH1 32 | EH2 EH2 33 | ER0 ER0 34 | ER1 ER1 35 | ER2 ER2 36 | EY0 EY0 37 | EY1 EY1 38 | EY2 EY2 39 | F F 40 | G G 41 | HH HH 42 | IH0 IH0 43 | IH1 IH1 44 | IH2 IH2 45 | IY0 IY0 46 | IY1 IY1 47 | IY2 IY2 48 | JH JH 49 | K K 50 | L L 51 | M M 52 | N N 53 | NG NG 54 | OW0 OW0 55 | OW1 OW1 56 | OW2 OW2 57 | OY0 OY0 58 | OY1 OY1 59 | OY2 OY2 60 | P P 61 | R R 62 | S S 63 | SH SH 64 | T T 65 | TH TH 66 | UH0 UH0 67 | UH1 UH1 68 | UH2 UH2 69 | UW0 UW0 70 | UW1 UW1 71 | UW2 UW2 72 | V V 73 | W W 74 | Y Y 75 | Z Z 76 | ZH ZH 77 | | | 78 | -------------------------------------------------------------------------------- /data/processed/ljspeech/phone_set.json: -------------------------------------------------------------------------------- 1 | ["!", ",", ".", ";", "", "", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"] -------------------------------------------------------------------------------- /data_gen/tts/bin/binarize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | import importlib 6 | from utils.hparams import set_hparams, hparams 7 | 8 | 9 | def binarize(): 10 | binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer') 11 | pkg = ".".join(binarizer_cls.split(".")[:-1]) 12 | cls_name = binarizer_cls.split(".")[-1] 13 | binarizer_cls = getattr(importlib.import_module(pkg), cls_name) 14 | print("| Binarizer: ", binarizer_cls) 15 | binarizer_cls().process() 16 | 17 | 18 | if __name__ == '__main__': 19 | set_hparams() 20 | binarize() 21 | -------------------------------------------------------------------------------- /data_gen/tts/binarizer_zh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU 6 | from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError 7 | from data_gen.tts.data_gen_utils import get_mel2ph 8 | from utils.hparams import set_hparams, hparams 9 | import numpy as np 10 | 11 | 12 | class ZhBinarizer(BaseBinarizer): 13 | @staticmethod 14 | def get_align(tg_fn, ph, mel, phone_encoded, res): 15 | if tg_fn is not None and os.path.exists(tg_fn): 16 | _, dur = get_mel2ph(tg_fn, ph, mel, hparams) 17 | else: 18 | raise BinarizationError(f"Align not found") 19 | ph_list = ph.split(" ") 20 | assert len(dur) == len(ph_list) 21 | mel2ph = [] 22 | # 分隔符的时长分配给韵母 23 | dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0) 24 | for i in range(len(dur)): 25 | p = ph_list[i] 26 | if p[0] != '<' and not p[0].isalpha(): 27 | uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0 28 | j = 0 29 | while j < len(uv_) and not uv_[j]: 30 | j += 1 31 | dur[i - 1] += j 32 | dur[i] -= j 33 | if 
dur[i] < 100: 34 | dur[i - 1] += dur[i] 35 | dur[i] = 0 36 | # 声母和韵母等长 37 | for i in range(len(dur)): 38 | p = ph_list[i] 39 | if p in ALL_SHENMU: 40 | p_next = ph_list[i + 1] 41 | if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU): 42 | print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, " 43 | f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.") 44 | continue 45 | total = dur[i + 1] + dur[i] 46 | dur[i] = total // 2 47 | dur[i + 1] = total - dur[i] 48 | for i in range(len(dur)): 49 | mel2ph += [i + 1] * dur[i] 50 | mel2ph = np.array(mel2ph) 51 | if mel2ph.max() - 1 >= len(phone_encoded): 52 | raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}") 53 | res['mel2ph'] = mel2ph 54 | res['dur'] = dur 55 | 56 | 57 | if __name__ == "__main__": 58 | set_hparams() 59 | ZhBinarizer().process() 60 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/base_text_processor.py: -------------------------------------------------------------------------------- 1 | class BaseTxtProcessor: 2 | @staticmethod 3 | def sp_phonemes(): 4 | return ['|'] 5 | 6 | @classmethod 7 | def process(cls, txt, pre_align_args): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/en.py: -------------------------------------------------------------------------------- 1 | import re 2 | from data_gen.tts.data_gen_utils import PUNCS 3 | from g2p_en import G2p 4 | import unicodedata 5 | from g2p_en.expand import normalize_numbers 6 | from nltk import pos_tag 7 | from nltk.tokenize import TweetTokenizer 8 | 9 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 10 | 11 | 12 | class EnG2p(G2p): 13 | word_tokenize = TweetTokenizer().tokenize 14 | 15 | def __call__(self, text): 16 | # preprocessing 17 | words = EnG2p.word_tokenize(text) 18 | tokens = pos_tag(words) # tuples of (word, tag) 19 | 20 | # steps 21 | prons = [] 22 | for word, pos in tokens: 23 | if re.search("[a-z]", word) is None: 24 | pron = [word] 25 | 26 | elif word in self.homograph2features: # Check homograph 27 | pron1, pron2, pos1 = self.homograph2features[word] 28 | if pos.startswith(pos1): 29 | pron = pron1 30 | else: 31 | pron = pron2 32 | elif word in self.cmu: # lookup CMU dict 33 | pron = self.cmu[word][0] 34 | else: # predict for oov 35 | pron = self.predict(word) 36 | 37 | prons.extend(pron) 38 | prons.extend([" "]) 39 | 40 | return prons[:-1] 41 | 42 | 43 | class TxtProcessor(BaseTxtProcessor): 44 | g2p = EnG2p() 45 | 46 | @staticmethod 47 | def preprocess_text(text): 48 | text = normalize_numbers(text) 49 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 50 | if unicodedata.category(char) != 'Mn') # Strip accents 51 | text = text.lower() 52 | text = re.sub("[\'\"()]+", "", text) 53 | text = re.sub("[-]+", " ", text) 54 | text = re.sub(f"[^ a-z{PUNCS}]", "", text) 55 | text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> ! 56 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 
57 | text = text.replace("i.e.", "that is") 58 | text = text.replace("i.e.", "that is") 59 | text = text.replace("etc.", "etc") 60 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 61 | text = re.sub(rf"\s+", r" ", text) 62 | return text 63 | 64 | @classmethod 65 | def process(cls, txt, pre_align_args): 66 | txt = cls.preprocess_text(txt).strip() 67 | phs = cls.g2p(txt) 68 | phs_ = [] 69 | n_word_sep = 0 70 | for p in phs: 71 | if p.strip() == '': 72 | phs_ += ['|'] 73 | n_word_sep += 1 74 | else: 75 | phs_ += p.split(" ") 76 | phs = phs_ 77 | assert n_word_sep + 1 == len(txt.split(" ")), (phs, f"\"{txt}\"") 78 | return phs, txt 79 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/zh.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pypinyin import pinyin, Style 3 | from data_gen.tts.data_gen_utils import PUNCS 4 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 5 | from utils.text_norm import NSWNormalizer 6 | 7 | 8 | class TxtProcessor(BaseTxtProcessor): 9 | table = {ord(f): ord(t) for f, t in zip( 10 | u':,。!?【】()%#@&1234567890', 11 | u':,.!?[]()%#@&1234567890')} 12 | 13 | @staticmethod 14 | def preprocess_text(text): 15 | text = text.translate(TxtProcessor.table) 16 | text = NSWNormalizer(text).normalize(remove_punc=False) 17 | text = re.sub("[\'\"()]+", "", text) 18 | text = re.sub("[-]+", " ", text) 19 | text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text) 20 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 21 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 22 | text = re.sub(rf"\s+", r"", text) 23 | return text 24 | 25 | @classmethod 26 | def process(cls, txt, pre_align_args): 27 | txt = cls.preprocess_text(txt) 28 | shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403 29 | yunmu_finals = pinyin(txt, style=Style.FINALS) 30 | yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3) 31 | yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \ 32 | if pre_align_args['use_tone'] else yunmu_finals 33 | 34 | assert len(shengmu) == len(yunmu) 35 | phs = ["|"] 36 | for a, b, c in zip(shengmu, yunmu, yunmu_finals): 37 | if a[0] == c[0]: 38 | phs += [a[0], "|"] 39 | else: 40 | phs += [a[0], b[0], "|"] 41 | return phs, txt 42 | -------------------------------------------------------------------------------- /data_gen/tts/txt_processors/zh_g2pM.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors import zh 6 | from g2pM import G2pM 7 | 8 | ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 9 | 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'] 10 | ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 11 | 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou', 12 | 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] 13 | 14 | 15 | class TxtProcessor(zh.TxtProcessor): 16 | model = G2pM() 17 | 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|', '#'] 21 | 22 | @classmethod 23 | def process(cls, txt, pre_align_args): 24 | txt = cls.preprocess_text(txt) 25 | ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True) 26 | seg_list = 
'#'.join(jieba.cut(txt)) 27 | assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list) 28 | 29 | # 加入词边界'#' 30 | ph_list_ = [] 31 | seg_idx = 0 32 | for p in ph_list: 33 | p = p.replace("u:", "v") 34 | if seg_list[seg_idx] == '#': 35 | ph_list_.append('#') 36 | seg_idx += 1 37 | else: 38 | ph_list_.append("|") 39 | seg_idx += 1 40 | if re.findall('[\u4e00-\u9fff]', p): 41 | if pre_align_args['use_tone']: 42 | p = pinyin(p, style=Style.TONE3, strict=True)[0][0] 43 | if p[-1] not in ['1', '2', '3', '4', '5']: 44 | p = p + '5' 45 | else: 46 | p = pinyin(p, style=Style.NORMAL, strict=True)[0][0] 47 | 48 | finished = False 49 | if len([c.isalpha() for c in p]) > 1: 50 | for shenmu in ALL_SHENMU: 51 | if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric(): 52 | ph_list_ += [shenmu, p.lstrip(shenmu)] 53 | finished = True 54 | break 55 | if not finished: 56 | ph_list_.append(p) 57 | 58 | ph_list = ph_list_ 59 | 60 | # 去除静音符号周围的词边界标记 [..., '#', ',', '#', ...] 61 | sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes() 62 | ph_list_ = [] 63 | for i in range(0, len(ph_list), 1): 64 | if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes): 65 | ph_list_.append(ph_list[i]) 66 | ph_list = ph_list_ 67 | return ph_list, txt 68 | 69 | 70 | if __name__ == '__main__': 71 | phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True}) 72 | print(phs) 73 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-cascade.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | 6 | ## DiffSinger (MIDI SVS | A version) 7 | ### 0. Data Acquirement 8 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 9 | 10 | The pipeline below is designed for Opencpop dataset: 11 | 12 | ### 1. Preparation 13 | 14 | #### Data Preparation 15 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 16 | 17 | b) Run the following scripts to pack the dataset for training/inference. 18 | 19 | ```sh 20 | export PYTHONPATH=. 21 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 22 | 23 | # `data/binary/opencpop-midi-dp` will be generated. 24 | ``` 25 | 26 | #### Vocoder Preparation 27 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 28 | Please unzip this file into `checkpoints` before training your acoustic model. 
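For reference, a minimal sketch of this step (it assumes the zip was downloaded to the repository root and unpacks into a `0109_hifigan_bigpopcs_hop128/` folder, matching the directory layout shown below; adjust paths to your setup):
```sh
# Illustrative only: place the pre-trained singing vocoder under `checkpoints`.
mkdir -p checkpoints
unzip 0109_hifigan_bigpopcs_hop128.zip -d checkpoints/
ls checkpoints/0109_hifigan_bigpopcs_hop128  # expected: the vocoder ckpt and its config.yaml
```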
29 | 30 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 31 | 32 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 33 | 34 | #### Exp Name Preparation 35 | ```bash 36 | export MY_FS_EXP_NAME=0302_opencpop_fs_midi 37 | export MY_DS_EXP_NAME=0303_opencpop_ds58_midi 38 | ``` 39 | 40 | ``` 41 | . 42 | |--data 43 | |--raw 44 | |--opencpop 45 | |--segments 46 | |--transcriptions.txt 47 | |--wavs 48 | |--checkpoints 49 | |--MY_FS_EXP_NAME (optional) 50 | |--MY_DS_EXP_NAME (optional) 51 | |--0109_hifigan_bigpopcs_hop128 52 | |--model_ckpt_steps_1512000.ckpt 53 | |--config.yaml 54 | ``` 55 | 56 | ### 2. Training Example 57 | First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch, run: 58 | ```sh 59 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset 60 | ``` 61 | 62 | Then, to train DiffSinger, run: 63 | 64 | ```sh 65 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset 66 | ``` 67 | 68 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path. 69 | 70 | ### 3. Inference from packed test set 71 | ```sh 72 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 73 | ``` 74 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 75 | 76 | We also provide: 77 | - the pre-trained model of DiffSinger; 78 | - the pre-trained model of FFT-Singer; 79 | 80 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip). 81 | 82 | Remember to put the pre-trained models in `checkpoints` directory. 83 | 84 | ### 4. 
Inference from raw inputs 85 | ```sh 86 | python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME 87 | ``` 88 | Raw inputs: 89 | ``` 90 | inp = { 91 | 'text': '小酒窝长睫毛AP是你最美的记号', 92 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 93 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 94 | 'input_type': 'word' 95 | } # user input: Chinese characters 96 | or, 97 | inp = { 98 | 'text': '小酒窝长睫毛AP是你最美的记号', 99 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 100 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 101 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 102 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 103 | 'input_type': 'phoneme' 104 | } # input like Opencpop dataset. 105 | ``` 106 | Here the inference results will be saved in `./infer_out` by default. 107 | 108 | ### 5. Some issues. 109 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 110 | 111 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[F0+ph_dur]) to predict F0 contour and phoneme duration. 112 | 113 | c) generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip). 114 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-e2e.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=SVSDemo)](https://huggingface.co/spaces/Silentlin/DiffSinger) 6 | 7 | Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust. 8 | **By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.** 9 | 10 | 简而言之,把F0曲线的动态性交给生成式模型去捕捉,而不再是以前那样用MSE约束对数域F0。 11 | 12 | ## DiffSinger (MIDI SVS | B version) 13 | ### 0. 
Data Acquirement 14 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 15 | 16 | The pipeline below is designed for Opencpop dataset: 17 | 18 | ### 1. Preparation 19 | 20 | #### Data Preparation 21 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 22 | 23 | b) Run the following scripts to pack the dataset for training/inference. 24 | 25 | ```sh 26 | export PYTHONPATH=. 27 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 28 | 29 | # `data/binary/opencpop-midi-dp` will be generated. 30 | ``` 31 | 32 | #### Vocoder Preparation 33 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 34 | 35 | Also, please unzip pre-trained vocoder and [this pendant for vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model. 36 | 37 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 38 | 39 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 40 | 41 | #### Exp Name Preparation 42 | ```bash 43 | export MY_DS_EXP_NAME=0228_opencpop_ds100_rel 44 | ``` 45 | 46 | ``` 47 | . 48 | |--data 49 | |--raw 50 | |--opencpop 51 | |--segments 52 | |--transcriptions.txt 53 | |--wavs 54 | |--checkpoints 55 | |--MY_DS_EXP_NAME (optional) 56 | |--0109_hifigan_bigpopcs_hop128 (vocoder) 57 | |--model_ckpt_steps_1512000.ckpt 58 | |--config.yaml 59 | ``` 60 | 61 | ### 2. Training Example 62 | ```sh 63 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset 64 | ``` 65 | 66 | ### 3. Inference from packed test set 67 | ```sh 68 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 69 | ``` 70 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 71 | 72 | We also provide: 73 | - the pre-trained model of DiffSinger; 74 | 75 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip). 76 | 77 | Remember to put the pre-trained models in `checkpoints` directory. 78 | 79 | ### 4. 
Inference from raw inputs 80 | ```sh 81 | python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME 82 | ``` 83 | Raw inputs: 84 | ``` 85 | inp = { 86 | 'text': '小酒窝长睫毛AP是你最美的记号', 87 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 88 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 89 | 'input_type': 'word' 90 | } # user input: Chinese characters 91 | or, 92 | inp = { 93 | 'text': '小酒窝长睫毛AP是你最美的记号', 94 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 95 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 96 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 97 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 98 | 'input_type': 'phoneme' 99 | } # input like Opencpop dataset. 100 | ``` 101 | Here the inference results will be saved in `./infer_out` by default. 102 | ### 5. Some issues. 103 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 104 | 105 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme duration. F0 curve is implicitly predicted together with mel-spectrogram. 106 | 107 | -------------------------------------------------------------------------------- /docs/README-SVS-opencpop-pndm.md: -------------------------------------------------------------------------------- 1 | # DiffSinger-PNDM 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | 6 | Highlights: 7 | 8 | Training diffusion model: 1000 steps 9 | 10 | Default pndm_speedup: 40 11 | 12 | Inference diffusion model: (1000 / pndm_speedup) steps = 25 steps 13 | 14 | You can freely control the inference steps, by adding these arguments in your experiment scripts : 15 | --hparams="pndm_speedup=40" or --hparams="pndm_speedup=20" or --hparams="pndm_speedup=10". 16 | 17 | Contributed by @luping-liu . 18 | 19 | ## DiffSinger (MIDI SVS | B version | +PNDM) 20 | ### 0. Data Acquirement 21 | For Opencpop dataset: Please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you the access to Opencpop. 22 | 23 | The pipeline below is designed for Opencpop dataset: 24 | 25 | ### 1. 
Preparation 26 | 27 | #### Data Preparation 28 | a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/` 29 | 30 | b) Run the following scripts to pack the dataset for training/inference. 31 | 32 | ```sh 33 | export PYTHONPATH=. 34 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml 35 | 36 | # `data/binary/opencpop-midi-dp` will be generated. 37 | ``` 38 | 39 | #### Vocoder Preparation 40 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 41 | 42 | Also, please unzip pre-trained vocoder and [this pendant for vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model. 43 | 44 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 45 | 46 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 47 | 48 | #### Exp Name Preparation 49 | ```bash 50 | export MY_DS_EXP_NAME=0831_opencpop_ds1000 51 | ``` 52 | 53 | ``` 54 | . 55 | |--data 56 | |--raw 57 | |--opencpop 58 | |--segments 59 | |--transcriptions.txt 60 | |--wavs 61 | |--checkpoints 62 | |--MY_DS_EXP_NAME (optional) 63 | |--0109_hifigan_bigpopcs_hop128 (vocoder) 64 | |--model_ckpt_steps_1512000.ckpt 65 | |--config.yaml 66 | ``` 67 | 68 | ### 2. Training Example 69 | ```sh 70 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME --reset 71 | ``` 72 | 73 | ### 3. Inference from packed test set 74 | ```sh 75 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME --reset --infer 76 | ``` 77 | Inference results will be saved in `./checkpoints/MY_DS_EXP_NAME/generated_` by default. 78 | 79 | We also provide: 80 | - the pre-trained model of DiffSinger; 81 | 82 | They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0831_opencpop_ds1000.zip). 83 | 84 | Remember to put the pre-trained models in `checkpoints` directory. 85 | 86 | ### 4. 
Inference from raw inputs 87 | ```sh 88 | python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds1000.yaml --exp_name $MY_DS_EXP_NAME 89 | ``` 90 | Raw inputs: 91 | ``` 92 | inp = { 93 | 'text': '小酒窝长睫毛AP是你最美的记号', 94 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 95 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 96 | 'input_type': 'word' 97 | } # user input: Chinese characters 98 | or, 99 | inp = { 100 | 'text': '小酒窝长睫毛AP是你最美的记号', 101 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 102 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 103 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 104 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 105 | 'input_type': 'phoneme' 106 | } # input like Opencpop dataset. 107 | ``` 108 | Here the inference results will be saved in `./infer_out` by default. 109 | ### 5. Some issues. 110 | a) the HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is the out-of-domain dataset (unseen speaker). This may cause the deterioration of audio quality, and we are considering fine-tuning this vocoder on the training set of Opencpop. 111 | 112 | b) in this version of codes, we used the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme duration. F0 curve is implicitly predicted together with mel-spectrogram. 113 | -------------------------------------------------------------------------------- /docs/README-SVS-popcs.md: -------------------------------------------------------------------------------- 1 | ## DiffSinger (SVS version) 2 | 3 | ### 0. Data Acquirement 4 | - [Download link](https://drive.google.com/file/d/1uFJmPEUWbzguGBdiuupYvYbBEjopN-Xq/view?usp=sharing). 5 | - Please note that, if you are using PopCS, it means that you have accepted the terms in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). 6 | 7 | ### 1. Preparation 8 | #### Data Preparation 9 | a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs` 10 | 11 | b) Run the following scripts to pack the dataset for training/inference. 12 | ```sh 13 | export PYTHONPATH=. 14 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml 15 | # `data/binary/popcs-pmf0` will be generated. 16 | ``` 17 | 18 | #### Vocoder Preparation 19 | We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism. 20 | Please unzip this file into `checkpoints` before training your acoustic model. 
21 | 22 | (Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory) 23 | 24 | This singing vocoder is trained on ~70 hours singing data, which can be viewed as a universal vocoder. 25 | 26 | ### 2. Training Example 27 | First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch, run: 28 | 29 | ```sh 30 | # First, train fft-singer; 31 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset 32 | # Then, infer fft-singer; 33 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer 34 | ``` 35 | 36 | Then, to train DiffSinger, run: 37 | ```sh 38 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset 39 | ``` 40 | 41 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path. 42 | 43 | ### 3. Inference Example 44 | ```sh 45 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer 46 | ``` 47 | 48 | We also provide: 49 | - the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip); 50 | - the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger; 51 | 52 | Remember to put the pre-trained models in `checkpoints` directory. 53 | 54 | *Note that:* 55 | 56 | - *the original PWG version vocoder in the paper we used has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.* 57 | - *we assume the ground-truth F0 to be given as the pitch information following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms(like [MIDI-B-version](README-SVS-opencpop-e2e.md)).* 58 | 59 | [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020. 60 | 61 | [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020. 62 | 63 | [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020. 
64 | -------------------------------------------------------------------------------- /docs/README-SVS.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=SVSDemo)](https://huggingface.co/spaces/Silentlin/DiffSinger) 6 | 7 | ## DiffSinger (SVS) 8 | 9 | ### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md) 10 | In PART1, we only focus on spectrum modeling (acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information following these papers [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART2. 11 | 12 | Thus, the pipeline of this part can be summarized as: 13 | 14 | ``` 15 | [lyrics] -> [linguistic representation] (Frontend) 16 | [linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model) 17 | [mel-spectrogram] + [GT F0] -> [waveform] (Vocoder) 18 | ``` 19 | 20 | 21 | [1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020. 22 | 23 | [2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020. 24 | 25 | [3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020. 26 | 27 | Click here for detailed instructions: [link](README-SVS-popcs.md). 28 | 29 | 30 | ### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md) 31 | Thanks [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI label, **Jan.20, 2022** (after we published our paper). 32 | 33 | Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend. 34 | 35 | #### 2.A 36 | Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as: 37 | 38 | ``` 39 | [lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend) 40 | [linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model) 41 | [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder) 42 | ``` 43 | 44 | Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md). 45 | 46 | #### 2.B 47 | In 2.1, we find that if we predict F0 explicitly in the melody frontend, there will be many bad cases of uv/v prediction. Then, we abandon the explicit prediction of the F0 curve in the melody frontend and make a joint prediction with spectrograms. 
48 | 49 | Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as: 50 | ``` 51 | [lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend) 52 | [linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model) 53 | [mel-spectrogram] -> [predicted F0] (Pitch extractor) 54 | [mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder) 55 | ``` 56 | 57 | Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md). 58 | 59 | ### FAQ 60 | Q1: Why do I need F0 in vocoders? 61 | 62 | A1: See the vocoder parts of HiFiSinger, DiffSinger or SingGAN; this is common practice now. 63 | 64 | Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset? 65 | 66 | A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon. 67 | 68 | Q3: Why "'HifiGAN' object has no attribute 'model'"? 69 | 70 | A3: Please put the pretrained vocoders in your `checkpoints` directory. 71 | 72 | Q4: How can I check whether GT information or predicted information is used during inference from the packed test set? 73 | 74 | A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343). 75 | 76 | ... -------------------------------------------------------------------------------- /docs/README-TTS-pndm.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=TTSDemo)](https://huggingface.co/spaces/NATSpeech/DiffSpeech) 6 | 7 | ## DiffSpeech (TTS) 8 | ### 1. Preparation 9 | 10 | #### Data Preparation 11 | a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/` 12 | 13 | b) Download and unzip the [ground-truth duration](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/` 14 | 15 | c) Run the following script to pack the dataset for training/inference. 16 | 17 | ```sh 18 | export PYTHONPATH=. 19 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml 20 | 21 | # `data/binary/ljspeech` will be generated. 22 | ``` 23 | 24 | #### Vocoder Preparation 25 | We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder. 26 | Please unzip this file into `checkpoints` before training your acoustic model. 27 | 28 | ### 2.
Training Example 29 | 30 | ```sh 31 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_pndm.yaml --exp_name ds_pndm_lj_1 --reset 32 | ``` 33 | 34 | ### 3. Inference Example 35 | 36 | ```sh 37 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_pndm.yaml --exp_name ds_pndm_lj_1 --reset --infer 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/README-TTS.md: -------------------------------------------------------------------------------- 1 | # DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism 2 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2105.02446) 3 | [![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger) 4 | [![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases) 5 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue?label=TTSDemo)](https://huggingface.co/spaces/NATSpeech/DiffSpeech) 6 | 7 | ## DiffSpeech (TTS) 8 | ### 1. Preparation 9 | 10 | #### Data Preparation 11 | a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/` 12 | 13 | b) Download and unzip the [ground-truth duration](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/` 14 | 15 | c) Run the following script to pack the dataset for training/inference. 16 | 17 | ```sh 18 | export PYTHONPATH=. 19 | CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml 20 | 21 | # `data/binary/ljspeech` will be generated. 22 | ``` 23 | 24 | #### Vocoder Preparation 25 | We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder. 26 | Please unzip this file into `checkpoints` before training your acoustic model. 27 | 28 | ### 2. Training Example 29 | 30 | First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running: 31 | ```sh 32 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset 33 | ``` 34 | Then, to train DiffSpeech, run: 35 | ```sh 36 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset 37 | ``` 38 | 39 | Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path. 40 | 41 | ### 3.
Inference Example 42 | 43 | ```sh 44 | CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer 45 | ``` 46 | 47 | We also provide: 48 | - the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip); 49 | - the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech; 50 | 51 | Remember to put the pre-trained models in the `checkpoints` directory. 52 | 53 | ## Mel Visualization 54 | Along the vertical axis, DiffSpeech occupies mel bins [0-80] and FastSpeech 2 occupies [80-160]. 55 | *(Table of mel-spectrogram comparison figures captioned "DiffSpeech vs. FastSpeech 2", with three DiffSpeech-vs-FastSpeech2 images.)*
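A comparison figure like the ones referenced above can be reproduced by stacking the two 80-bin mel-spectrograms along the frequency axis, matching the [0-80]/[80-160] layout just described. The following is an illustrative sketch only, not part of the repository; the `.npy` file names are placeholders, and it assumes you have saved both predicted mels as arrays of shape [T, 80].

```python
# plot_mel_comparison.py -- illustrative sketch, not part of the DiffSinger codebase
import numpy as np
import matplotlib.pyplot as plt

# Placeholder paths: predicted mel-spectrograms saved as [T, 80] arrays.
mel_diffspeech = np.load("diffspeech_mel.npy")
mel_fs2 = np.load("fastspeech2_mel.npy")

T = min(len(mel_diffspeech), len(mel_fs2))
# Mel bins 0-80: DiffSpeech; mel bins 80-160: FastSpeech 2 (the layout described above).
stacked = np.concatenate([mel_diffspeech[:T], mel_fs2[:T]], axis=1)

plt.figure(figsize=(12, 6))
plt.imshow(stacked.T, origin="lower", aspect="auto", interpolation="none")
plt.axhline(80, color="white", linewidth=0.8)  # boundary between the two models
plt.xlabel("frames")
plt.ylabel("mel bins (DiffSpeech: 0-80, FastSpeech 2: 80-160)")
plt.tight_layout()
plt.savefig("diffspeech_vs_fs2.png")
```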
-------------------------------------------------------------------------------- /inference/svs/ds_cascade.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from inference.tts.fs import FastSpeechInfer 3 | # from modules.tts.fs2_orig import FastSpeech2Orig 4 | from inference.svs.base_svs_infer import BaseSVSInfer 5 | from utils import load_ckpt 6 | from utils.hparams import hparams 7 | from usr.diff.shallow_diffusion_tts import GaussianDiffusion 8 | from usr.diffsinger_task import DIFF_DECODERS 9 | 10 | class DiffSingerCascadeInfer(BaseSVSInfer): 11 | def build_model(self): 12 | model = GaussianDiffusion( 13 | phone_encoder=self.ph_encoder, 14 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 15 | timesteps=hparams['timesteps'], 16 | K_step=hparams['K_step'], 17 | loss_type=hparams['diff_loss_type'], 18 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 19 | ) 20 | model.eval() 21 | load_ckpt(model, hparams['work_dir'], 'model') 22 | return model 23 | 24 | def forward_model(self, inp): 25 | sample = self.input_to_batch(inp) 26 | txt_tokens = sample['txt_tokens'] # [B, T_t] 27 | spk_id = sample.get('spk_ids') 28 | with torch.no_grad(): 29 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 30 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 31 | is_slur=sample['is_slur']) 32 | mel_out = output['mel_out'] # [B, T,80] 33 | f0_pred = output['f0_denorm'] 34 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 35 | wav_out = wav_out.cpu().numpy() 36 | return wav_out[0] 37 | 38 | 39 | if __name__ == '__main__': 40 | inp = { 41 | 'text': '小酒窝长睫毛AP是你最美的记号', 42 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 43 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 44 | 'input_type': 'word' 45 | } # user input: Chinese characters 46 | c = { 47 | 'text': '小酒窝长睫毛AP是你最美的记号', 48 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 49 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 50 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 51 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 52 | 'input_type': 'phoneme' 53 | } # input like Opencpop dataset. 
54 | DiffSingerCascadeInfer.example_run(inp) 55 | -------------------------------------------------------------------------------- /inference/svs/ds_e2e.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from inference.tts.fs import FastSpeechInfer 3 | # from modules.tts.fs2_orig import FastSpeech2Orig 4 | from inference.svs.base_svs_infer import BaseSVSInfer 5 | from utils import load_ckpt 6 | from utils.hparams import hparams 7 | from usr.diff.shallow_diffusion_tts import GaussianDiffusion 8 | from usr.diffsinger_task import DIFF_DECODERS 9 | from modules.fastspeech.pe import PitchExtractor 10 | import utils 11 | 12 | 13 | class DiffSingerE2EInfer(BaseSVSInfer): 14 | def build_model(self): 15 | model = GaussianDiffusion( 16 | phone_encoder=self.ph_encoder, 17 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 18 | timesteps=hparams['timesteps'], 19 | K_step=hparams['K_step'], 20 | loss_type=hparams['diff_loss_type'], 21 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 22 | ) 23 | model.eval() 24 | load_ckpt(model, hparams['work_dir'], 'model') 25 | 26 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 27 | self.pe = PitchExtractor().to(self.device) 28 | utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True) 29 | self.pe.eval() 30 | return model 31 | 32 | def forward_model(self, inp): 33 | sample = self.input_to_batch(inp) 34 | txt_tokens = sample['txt_tokens'] # [B, T_t] 35 | spk_id = sample.get('spk_ids') 36 | with torch.no_grad(): 37 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 38 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 39 | is_slur=sample['is_slur']) 40 | mel_out = output['mel_out'] # [B, T,80] 41 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 42 | f0_pred = self.pe(mel_out)['f0_denorm_pred'] # pe predict from Pred mel 43 | else: 44 | f0_pred = output['f0_denorm'] 45 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 46 | wav_out = wav_out.cpu().numpy() 47 | return wav_out[0] 48 | 49 | if __name__ == '__main__': 50 | inp = { 51 | 'text': '小酒窝长睫毛AP是你最美的记号', 52 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 53 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 54 | 'input_type': 'word' 55 | } # user input: Chinese characters 56 | c = { 57 | 'text': '小酒窝长睫毛AP是你最美的记号', 58 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 59 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 60 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 61 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 62 | 'input_type': 'phoneme' 63 | } # input like Opencpop dataset. 
64 | DiffSingerE2EInfer.example_run(inp) 65 | 66 | 67 | # python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/gradio/gradio_settings.yaml: -------------------------------------------------------------------------------- 1 | title: 'DiffSinger' 2 | description: | 3 | This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。) 4 | 5 | Please assign pitch and duration values to each Chinese character. The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。) 6 | 7 | You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。) 8 | 9 | Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。) 10 | 11 | article: | 12 | Link to Github REPO 13 | example_inputs: 14 | - |- 15 | 你 说 你 不 SP 懂 为 何 在 这 时 牵 手 APD#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590 16 | - |- 17 | 小酒窝长睫毛AP是你最美的记号C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db40.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340 18 | - |- 19 | 我真的SP爱你SP句句不轻易D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A40.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9 20 | - |- 21 | 好冷啊 AP 我在东北玩泥巴F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D40.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4 22 | 23 | #inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer 24 | #exp_name: 0303_opencpop_ds58_midi 25 | 26 | inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer 27 | exp_name: 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/gradio/infer.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import re 3 | 4 | import gradio as gr 5 | import yaml 6 | from gradio.inputs import Textbox 7 | 8 | from inference.svs.base_svs_infer import BaseSVSInfer 9 | from utils.hparams import set_hparams 10 | from utils.hparams import hparams as hp 11 | import numpy as np 12 | 13 | 14 | class GradioInfer: 15 | def __init__(self, exp_name, inference_cls, title, description, article, example_inputs): 16 | self.exp_name = exp_name 17 | self.title = title 18 | self.description = description 19 | self.article = article 20 | self.example_inputs = example_inputs 21 | pkg = ".".join(inference_cls.split(".")[:-1]) 22 | cls_name = inference_cls.split(".")[-1] 23 | self.inference_cls = getattr(importlib.import_module(pkg), cls_name) 24 | 25 | def greet(self, text, notes, notes_duration): 26 | PUNCS = '。?;:' 27 | sents = re.split(rf'([{PUNCS}])', text.replace('\n', ',')) 28 | sents_notes = re.split(rf'([{PUNCS}])', 
notes.replace('\n', ',')) 29 | sents_notes_dur = re.split(rf'([{PUNCS}])', notes_duration.replace('\n', ',')) 30 | 31 | if sents[-1] not in list(PUNCS): 32 | sents = sents + [''] 33 | sents_notes = sents_notes + [''] 34 | sents_notes_dur = sents_notes_dur + [''] 35 | 36 | audio_outs = [] 37 | s, n, n_dur = "", "", "" 38 | for i in range(0, len(sents), 2): 39 | if len(sents[i]) > 0: 40 | s += sents[i] + sents[i + 1] 41 | n += sents_notes[i] + sents_notes[i+1] 42 | n_dur += sents_notes_dur[i] + sents_notes_dur[i+1] 43 | if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0): 44 | audio_out = self.infer_ins.infer_once({ 45 | 'text': s, 46 | 'notes': n, 47 | 'notes_duration': n_dur, 48 | }) 49 | audio_out = audio_out * 32767 50 | audio_out = audio_out.astype(np.int16) 51 | audio_outs.append(audio_out) 52 | audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16)) 53 | s = "" 54 | n = "" 55 | audio_outs = np.concatenate(audio_outs) 56 | return hp['audio_sample_rate'], audio_outs 57 | 58 | def run(self): 59 | set_hparams(exp_name=self.exp_name, print_hparams=False) 60 | infer_cls = self.inference_cls 61 | self.infer_ins: BaseSVSInfer = infer_cls(hp) 62 | example_inputs = self.example_inputs 63 | for i in range(len(example_inputs)): 64 | text, notes, notes_dur = example_inputs[i].split('') 65 | example_inputs[i] = [text, notes, notes_dur] 66 | 67 | iface = gr.Interface(fn=self.greet, 68 | inputs=[ 69 | Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"), 70 | Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"), 71 | Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration")] 72 | , 73 | outputs="audio", 74 | allow_flagging="never", 75 | title=self.title, 76 | description=self.description, 77 | article=self.article, 78 | examples=example_inputs, 79 | enable_queue=True) 80 | iface.launch(share=True,)# cache_examples=True) 81 | 82 | 83 | if __name__ == '__main__': 84 | gradio_config = yaml.safe_load(open('inference/svs/gradio/gradio_settings.yaml')) 85 | g = GradioInfer(**gradio_config) 86 | g.run() 87 | 88 | 89 | # python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi 90 | # python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi 91 | # CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel -------------------------------------------------------------------------------- /inference/svs/opencpop/map.py: -------------------------------------------------------------------------------- 1 | def cpop_pinyin2ph_func(): 2 | # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table" 3 | pinyin2phs = {'AP': 'AP', 'SP': 'SP'} 4 | with open('inference/svs/opencpop/cpop_pinyin2ph.txt') as rf: 5 | for line in rf.readlines(): 6 | elements = [x.strip() for x in line.split('|') if x.strip() != ''] 7 | pinyin2phs[elements[0]] = elements[1] 8 | return pinyin2phs -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/modules/__init__.py 
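As a usage note for `cpop_pinyin2ph_func` in `inference/svs/opencpop/map.py` above: every non-empty line of `cpop_pinyin2ph.txt` holds a pinyin syllable and its phoneme string separated by `|`, and the special `AP`/`SP` tokens map to themselves. The sketch below is illustrative only; it assumes it is run from the repository root (the path inside `map.py` is relative), and the sample syllables are placeholders whose presence depends on the mapping file.

```python
# Illustrative usage of the Opencpop pinyin-to-phoneme table; run from the repository root.
from inference.svs.opencpop.map import cpop_pinyin2ph_func

pinyin2phs = cpop_pinyin2ph_func()

# Placeholder syllables; anything missing from the table falls back to '?'.
example = ['xiao', 'jiu', 'wo', 'AP']
phs = [pinyin2phs.get(p, '?') for p in example]
print(list(zip(example, phs)))
```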
-------------------------------------------------------------------------------- /modules/commons/espnet_positional_embedding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | class PositionalEncoding(torch.nn.Module): 6 | """Positional encoding. 7 | Args: 8 | d_model (int): Embedding dimension. 9 | dropout_rate (float): Dropout rate. 10 | max_len (int): Maximum input length. 11 | reverse (bool): Whether to reverse the input position. 12 | """ 13 | 14 | def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): 15 | """Construct an PositionalEncoding object.""" 16 | super(PositionalEncoding, self).__init__() 17 | self.d_model = d_model 18 | self.reverse = reverse 19 | self.xscale = math.sqrt(self.d_model) 20 | self.dropout = torch.nn.Dropout(p=dropout_rate) 21 | self.pe = None 22 | self.extend_pe(torch.tensor(0.0).expand(1, max_len)) 23 | 24 | def extend_pe(self, x): 25 | """Reset the positional encodings.""" 26 | if self.pe is not None: 27 | if self.pe.size(1) >= x.size(1): 28 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 29 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 30 | return 31 | pe = torch.zeros(x.size(1), self.d_model) 32 | if self.reverse: 33 | position = torch.arange( 34 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 35 | ).unsqueeze(1) 36 | else: 37 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 38 | div_term = torch.exp( 39 | torch.arange(0, self.d_model, 2, dtype=torch.float32) 40 | * -(math.log(10000.0) / self.d_model) 41 | ) 42 | pe[:, 0::2] = torch.sin(position * div_term) 43 | pe[:, 1::2] = torch.cos(position * div_term) 44 | pe = pe.unsqueeze(0) 45 | self.pe = pe.to(device=x.device, dtype=x.dtype) 46 | 47 | def forward(self, x: torch.Tensor): 48 | """Add positional encoding. 49 | Args: 50 | x (torch.Tensor): Input tensor (batch, time, `*`). 51 | Returns: 52 | torch.Tensor: Encoded tensor (batch, time, `*`). 53 | """ 54 | self.extend_pe(x) 55 | x = x * self.xscale + self.pe[:, : x.size(1)] 56 | return self.dropout(x) 57 | 58 | 59 | class ScaledPositionalEncoding(PositionalEncoding): 60 | """Scaled positional encoding module. 61 | See Sec. 3.2 https://arxiv.org/abs/1809.08895 62 | Args: 63 | d_model (int): Embedding dimension. 64 | dropout_rate (float): Dropout rate. 65 | max_len (int): Maximum input length. 66 | """ 67 | 68 | def __init__(self, d_model, dropout_rate, max_len=5000): 69 | """Initialize class.""" 70 | super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len) 71 | self.alpha = torch.nn.Parameter(torch.tensor(1.0)) 72 | 73 | def reset_parameters(self): 74 | """Reset parameters.""" 75 | self.alpha.data = torch.tensor(1.0) 76 | 77 | def forward(self, x): 78 | """Add positional encoding. 79 | Args: 80 | x (torch.Tensor): Input tensor (batch, time, `*`). 81 | Returns: 82 | torch.Tensor: Encoded tensor (batch, time, `*`). 83 | """ 84 | self.extend_pe(x) 85 | x = x + self.alpha * self.pe[:, : x.size(1)] 86 | return self.dropout(x) 87 | 88 | 89 | class RelPositionalEncoding(PositionalEncoding): 90 | """Relative positional encoding module. 91 | See : Appendix B in https://arxiv.org/abs/1901.02860 92 | Args: 93 | d_model (int): Embedding dimension. 94 | dropout_rate (float): Dropout rate. 95 | max_len (int): Maximum input length. 
96 | """ 97 | 98 | def __init__(self, d_model, dropout_rate, max_len=5000): 99 | """Initialize class.""" 100 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 101 | 102 | def forward(self, x): 103 | """Compute positional encoding. 104 | Args: 105 | x (torch.Tensor): Input tensor (batch, time, `*`). 106 | Returns: 107 | torch.Tensor: Encoded tensor (batch, time, `*`). 108 | torch.Tensor: Positional embedding tensor (1, time, `*`). 109 | """ 110 | self.extend_pe(x) 111 | x = x * self.xscale 112 | pos_emb = self.pe[:, : x.size(1)] 113 | return self.dropout(x) + self.dropout(pos_emb) -------------------------------------------------------------------------------- /modules/diffsinger_midi/fs2.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | from modules.commons.common_layers import Embedding 3 | from modules.fastspeech.tts_modules import FastspeechDecoder, DurationPredictor, LengthRegulator, PitchPredictor, \ 4 | EnergyPredictor, FastspeechEncoder 5 | from utils.cwt import cwt2f0 6 | from utils.hparams import hparams 7 | from utils.pitch_utils import f0_to_coarse, denorm_f0, norm_f0 8 | from modules.fastspeech.fs2 import FastSpeech2 9 | 10 | 11 | class FastspeechMIDIEncoder(FastspeechEncoder): 12 | def forward_embedding(self, txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding): 13 | # embed tokens and positions 14 | x = self.embed_scale * self.embed_tokens(txt_tokens) 15 | x = x + midi_embedding + midi_dur_embedding + slur_embedding 16 | if hparams['use_pos_embed']: 17 | if hparams.get('rel_pos') is not None and hparams['rel_pos']: 18 | x = self.embed_positions(x) 19 | else: 20 | positions = self.embed_positions(txt_tokens) 21 | x = x + positions 22 | x = F.dropout(x, p=self.dropout, training=self.training) 23 | return x 24 | 25 | def forward(self, txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding): 26 | """ 27 | 28 | :param txt_tokens: [B, T] 29 | :return: { 30 | 'encoder_out': [T x B x C] 31 | } 32 | """ 33 | encoder_padding_mask = txt_tokens.eq(self.padding_idx).data 34 | x = self.forward_embedding(txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding) # [B, T, H] 35 | x = super(FastspeechEncoder, self).forward(x, encoder_padding_mask) 36 | return x 37 | 38 | 39 | FS_ENCODERS = { 40 | 'fft': lambda hp, embed_tokens, d: FastspeechMIDIEncoder( 41 | embed_tokens, hp['hidden_size'], hp['enc_layers'], hp['enc_ffn_kernel_size'], 42 | num_heads=hp['num_heads']), 43 | } 44 | 45 | 46 | class FastSpeech2MIDI(FastSpeech2): 47 | def __init__(self, dictionary, out_dims=None): 48 | super().__init__(dictionary, out_dims) 49 | del self.encoder 50 | self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams, self.encoder_embed_tokens, self.dictionary) 51 | self.midi_embed = Embedding(300, self.hidden_size, self.padding_idx) 52 | self.midi_dur_layer = Linear(1, self.hidden_size) 53 | self.is_slur_embed = Embedding(2, self.hidden_size) 54 | 55 | def forward(self, txt_tokens, mel2ph=None, spk_embed=None, 56 | ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=False, 57 | spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs): 58 | ret = {} 59 | 60 | midi_embedding = self.midi_embed(kwargs['pitch_midi']) 61 | midi_dur_embedding, slur_embedding = 0, 0 62 | if kwargs.get('midi_dur') is not None: 63 | midi_dur_embedding = self.midi_dur_layer(kwargs['midi_dur'][:, :, None]) # [B, T, 1] -> [B, T, H] 64 | if kwargs.get('is_slur') is not None: 65 | slur_embedding 
= self.is_slur_embed(kwargs['is_slur']) 66 | encoder_out = self.encoder(txt_tokens, midi_embedding, midi_dur_embedding, slur_embedding) # [B, T, C] 67 | src_nonpadding = (txt_tokens > 0).float()[:, :, None] 68 | 69 | # add ref style embed 70 | # Not implemented 71 | # variance encoder 72 | var_embed = 0 73 | 74 | # encoder_out_dur denotes encoder outputs for duration predictor 75 | # in speech adaptation, duration predictor use old speaker embedding 76 | if hparams['use_spk_embed']: 77 | spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :] 78 | elif hparams['use_spk_id']: 79 | spk_embed_id = spk_embed 80 | if spk_embed_dur_id is None: 81 | spk_embed_dur_id = spk_embed_id 82 | if spk_embed_f0_id is None: 83 | spk_embed_f0_id = spk_embed_id 84 | spk_embed = self.spk_embed_proj(spk_embed_id)[:, None, :] 85 | spk_embed_dur = spk_embed_f0 = spk_embed 86 | if hparams['use_split_spk_id']: 87 | spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :] 88 | spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :] 89 | else: 90 | spk_embed_dur = spk_embed_f0 = spk_embed = 0 91 | 92 | # add dur 93 | dur_inp = (encoder_out + var_embed + spk_embed_dur) * src_nonpadding 94 | 95 | mel2ph = self.add_dur(dur_inp, mel2ph, txt_tokens, ret) 96 | 97 | decoder_inp = F.pad(encoder_out, [0, 0, 1, 0]) 98 | 99 | mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]]) 100 | decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H] 101 | 102 | tgt_nonpadding = (mel2ph > 0).float()[:, :, None] 103 | 104 | # add pitch and energy embed 105 | pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding 106 | if hparams['use_pitch_embed']: 107 | pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding 108 | decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph) 109 | if hparams['use_energy_embed']: 110 | decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret) 111 | 112 | ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding 113 | 114 | if skip_decoder: 115 | return ret 116 | ret['mel_out'] = self.run_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs) 117 | 118 | return ret 119 | -------------------------------------------------------------------------------- /modules/fastspeech/pe.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | from utils.hparams import hparams 3 | from modules.fastspeech.tts_modules import PitchPredictor 4 | from utils.pitch_utils import denorm_f0 5 | 6 | 7 | class Prenet(nn.Module): 8 | def __init__(self, in_dim=80, out_dim=256, kernel=5, n_layers=3, strides=None): 9 | super(Prenet, self).__init__() 10 | padding = kernel // 2 11 | self.layers = [] 12 | self.strides = strides if strides is not None else [1] * n_layers 13 | for l in range(n_layers): 14 | self.layers.append(nn.Sequential( 15 | nn.Conv1d(in_dim, out_dim, kernel_size=kernel, padding=padding, stride=self.strides[l]), 16 | nn.ReLU(), 17 | nn.BatchNorm1d(out_dim) 18 | )) 19 | in_dim = out_dim 20 | self.layers = nn.ModuleList(self.layers) 21 | self.out_proj = nn.Linear(out_dim, out_dim) 22 | 23 | def forward(self, x): 24 | """ 25 | 26 | :param x: [B, T, 80] 27 | :return: [L, B, T, H], [B, T, H] 28 | """ 29 | padding_mask = x.abs().sum(-1).eq(0).data # [B, T] 30 | nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T] 31 | x = 
x.transpose(1, 2) 32 | hiddens = [] 33 | for i, l in enumerate(self.layers): 34 | nonpadding_mask_TB = nonpadding_mask_TB[:, :, ::self.strides[i]] 35 | x = l(x) * nonpadding_mask_TB 36 | hiddens.append(x) 37 | hiddens = torch.stack(hiddens, 0) # [L, B, H, T] 38 | hiddens = hiddens.transpose(2, 3) # [L, B, T, H] 39 | x = self.out_proj(x.transpose(1, 2)) # [B, T, H] 40 | x = x * nonpadding_mask_TB.transpose(1, 2) 41 | return hiddens, x 42 | 43 | 44 | class ConvBlock(nn.Module): 45 | def __init__(self, idim=80, n_chans=256, kernel_size=3, stride=1, norm='gn', dropout=0): 46 | super().__init__() 47 | self.conv = ConvNorm(idim, n_chans, kernel_size, stride=stride) 48 | self.norm = norm 49 | if self.norm == 'bn': 50 | self.norm = nn.BatchNorm1d(n_chans) 51 | elif self.norm == 'in': 52 | self.norm = nn.InstanceNorm1d(n_chans, affine=True) 53 | elif self.norm == 'gn': 54 | self.norm = nn.GroupNorm(n_chans // 16, n_chans) 55 | elif self.norm == 'ln': 56 | self.norm = LayerNorm(n_chans // 16, n_chans) 57 | elif self.norm == 'wn': 58 | self.conv = torch.nn.utils.weight_norm(self.conv.conv) 59 | self.dropout = nn.Dropout(dropout) 60 | self.relu = nn.ReLU() 61 | 62 | def forward(self, x): 63 | """ 64 | 65 | :param x: [B, C, T] 66 | :return: [B, C, T] 67 | """ 68 | x = self.conv(x) 69 | if not isinstance(self.norm, str): 70 | if self.norm == 'none': 71 | pass 72 | elif self.norm == 'ln': 73 | x = self.norm(x.transpose(1, 2)).transpose(1, 2) 74 | else: 75 | x = self.norm(x) 76 | x = self.relu(x) 77 | x = self.dropout(x) 78 | return x 79 | 80 | 81 | class ConvStacks(nn.Module): 82 | def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_size=5, norm='gn', 83 | dropout=0, strides=None, res=True): 84 | super().__init__() 85 | self.conv = torch.nn.ModuleList() 86 | self.kernel_size = kernel_size 87 | self.res = res 88 | self.in_proj = Linear(idim, n_chans) 89 | if strides is None: 90 | strides = [1] * n_layers 91 | else: 92 | assert len(strides) == n_layers 93 | for idx in range(n_layers): 94 | self.conv.append(ConvBlock( 95 | n_chans, n_chans, kernel_size, stride=strides[idx], norm=norm, dropout=dropout)) 96 | self.out_proj = Linear(n_chans, odim) 97 | 98 | def forward(self, x, return_hiddens=False): 99 | """ 100 | 101 | :param x: [B, T, H] 102 | :return: [B, T, H] 103 | """ 104 | x = self.in_proj(x) 105 | x = x.transpose(1, -1) # (B, idim, Tmax) 106 | hiddens = [] 107 | for f in self.conv: 108 | x_ = f(x) 109 | x = x + x_ if self.res else x_ # (B, C, Tmax) 110 | hiddens.append(x) 111 | x = x.transpose(1, -1) 112 | x = self.out_proj(x) # (B, Tmax, H) 113 | if return_hiddens: 114 | hiddens = torch.stack(hiddens, 1) # [B, L, C, T] 115 | return x, hiddens 116 | return x 117 | 118 | 119 | class PitchExtractor(nn.Module): 120 | def __init__(self, n_mel_bins=80, conv_layers=2): 121 | super().__init__() 122 | self.hidden_size = hparams['hidden_size'] 123 | self.predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size 124 | self.conv_layers = conv_layers 125 | 126 | self.mel_prenet = Prenet(n_mel_bins, self.hidden_size, strides=[1, 1, 1]) 127 | if self.conv_layers > 0: 128 | self.mel_encoder = ConvStacks( 129 | idim=self.hidden_size, n_chans=self.hidden_size, odim=self.hidden_size, n_layers=self.conv_layers) 130 | self.pitch_predictor = PitchPredictor( 131 | self.hidden_size, n_chans=self.predictor_hidden, 132 | n_layers=5, dropout_rate=0.1, odim=2, 133 | padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel']) 134 | 135 | def 
forward(self, mel_input=None): 136 | ret = {} 137 | mel_hidden = self.mel_prenet(mel_input)[1] 138 | if self.conv_layers > 0: 139 | mel_hidden = self.mel_encoder(mel_hidden) 140 | 141 | ret['pitch_pred'] = pitch_pred = self.pitch_predictor(mel_hidden) 142 | 143 | pitch_padding = mel_input.abs().sum(-1) == 0 144 | use_uv = hparams['pitch_type'] == 'frame' and hparams['use_uv'] 145 | 146 | ret['f0_denorm_pred'] = denorm_f0( 147 | pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None, 148 | hparams, pitch_padding=pitch_padding) 149 | return ret -------------------------------------------------------------------------------- /modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 
60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 
51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | Args: 65 | subbands (int): The number of subbands. 66 | taps (int): The number of filter taps. 67 | cutoff_ratio (float): Cut-off frequency ratio. 68 | beta (float): Beta coefficient for kaiser window. 
69 | 70 | """ 71 | super(PQMF, self).__init__() 72 | 73 | # define filter coefficient 74 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 75 | h_analysis = np.zeros((subbands, len(h_proto))) 76 | h_synthesis = np.zeros((subbands, len(h_proto))) 77 | for k in range(subbands): 78 | h_analysis[k] = 2 * h_proto * np.cos( 79 | (2 * k + 1) * (np.pi / (2 * subbands)) * 80 | (np.arange(taps + 1) - ((taps - 1) / 2)) + 81 | (-1) ** k * np.pi / 4) 82 | h_synthesis[k] = 2 * h_proto * np.cos( 83 | (2 * k + 1) * (np.pi / (2 * subbands)) * 84 | (np.arange(taps + 1) - ((taps - 1) / 2)) - 85 | (-1) ** k * np.pi / 4) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 90 | 91 | # register coefficients as beffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | 108 | Args: 109 | x (Tensor): Input tensor (B, 1, T). 110 | 111 | Returns: 112 | Tensor: Output tensor (B, subbands, T // subbands). 113 | 114 | """ 115 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 116 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 117 | 118 | def synthesis(self, x): 119 | """Synthesis with PQMF. 120 | 121 | Args: 122 | x (Tensor): Input tensor (B, subbands, T // subbands). 123 | 124 | Returns: 125 | Tensor: Output tensor (B, 1, T). 126 | 127 | """ 128 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 129 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/layers/tf_layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 MINH ANH (@dathudeptrai) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Tensorflow Layer modules complatible with pytorch.""" 7 | 8 | import tensorflow as tf 9 | 10 | 11 | class TFReflectionPad1d(tf.keras.layers.Layer): 12 | """Tensorflow ReflectionPad1d module.""" 13 | 14 | def __init__(self, padding_size): 15 | """Initialize TFReflectionPad1d module. 16 | 17 | Args: 18 | padding_size (int): Padding size. 19 | 20 | """ 21 | super(TFReflectionPad1d, self).__init__() 22 | self.padding_size = padding_size 23 | 24 | @tf.function 25 | def call(self, x): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | x (Tensor): Input tensor (B, T, 1, C). 30 | 31 | Returns: 32 | Tensor: Padded tensor (B, T + 2 * padding_size, 1, C). 33 | 34 | """ 35 | return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT") 36 | 37 | 38 | class TFConvTranspose1d(tf.keras.layers.Layer): 39 | """Tensorflow ConvTranspose1d module.""" 40 | 41 | def __init__(self, channels, kernel_size, stride, padding): 42 | """Initialize TFConvTranspose1d( module. 43 | 44 | Args: 45 | channels (int): Number of channels. 46 | kernel_size (int): kernel size. 47 | strides (int): Stride width. 48 | padding (str): Padding type ("same" or "valid"). 49 | 50 | """ 51 | super(TFConvTranspose1d, self).__init__() 52 | self.conv1d_transpose = tf.keras.layers.Conv2DTranspose( 53 | filters=channels, 54 | kernel_size=(kernel_size, 1), 55 | strides=(stride, 1), 56 | padding=padding, 57 | ) 58 | 59 | @tf.function 60 | def call(self, x): 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | x (Tensor): Input tensor (B, T, 1, C). 65 | 66 | Returns: 67 | Tensors: Output tensor (B, T', 1, C'). 68 | 69 | """ 70 | x = self.conv1d_transpose(x) 71 | return x 72 | 73 | 74 | class TFResidualStack(tf.keras.layers.Layer): 75 | """Tensorflow ResidualStack module.""" 76 | 77 | def __init__(self, 78 | kernel_size, 79 | channels, 80 | dilation, 81 | bias, 82 | nonlinear_activation, 83 | nonlinear_activation_params, 84 | padding, 85 | ): 86 | """Initialize TFResidualStack module. 87 | 88 | Args: 89 | kernel_size (int): Kernel size. 90 | channles (int): Number of channels. 91 | dilation (int): Dilation ine. 92 | bias (bool): Whether to add bias parameter in convolution layers. 93 | nonlinear_activation (str): Activation function module name. 94 | nonlinear_activation_params (dict): Hyperparameters for activation function. 95 | padding (str): Padding type ("same" or "valid"). 
96 | 97 | """ 98 | super(TFResidualStack, self).__init__() 99 | self.block = [ 100 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 101 | TFReflectionPad1d(dilation), 102 | tf.keras.layers.Conv2D( 103 | filters=channels, 104 | kernel_size=(kernel_size, 1), 105 | dilation_rate=(dilation, 1), 106 | use_bias=bias, 107 | padding="valid", 108 | ), 109 | getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), 110 | tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 111 | ] 112 | self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) 113 | 114 | @tf.function 115 | def call(self, x): 116 | """Calculate forward propagation. 117 | 118 | Args: 119 | x (Tensor): Input tensor (B, T, 1, C). 120 | 121 | Returns: 122 | Tensor: Output tensor (B, T, 1, C). 123 | 124 | """ 125 | _x = tf.identity(x) 126 | for i, layer in enumerate(self.block): 127 | _x = layer(_x) 128 | shortcut = self.shortcut(x) 129 | return shortcut + _x 130 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergengeLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergengeLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 
67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | self.window = getattr(torch, window)(win_length) 86 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 87 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 88 | 89 | def forward(self, x, y): 90 | """Calculate forward propagation. 91 | 92 | Args: 93 | x (Tensor): Predicted signal (B, T). 94 | y (Tensor): Groundtruth signal (B, T). 95 | 96 | Returns: 97 | Tensor: Spectral convergence loss value. 98 | Tensor: Log STFT magnitude loss value. 99 | 100 | """ 101 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 102 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 103 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 104 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 105 | 106 | return sc_loss, mag_loss 107 | 108 | 109 | class MultiResolutionSTFTLoss(torch.nn.Module): 110 | """Multi resolution STFT loss module.""" 111 | 112 | def __init__(self, 113 | fft_sizes=[1024, 2048, 512], 114 | hop_sizes=[120, 240, 50], 115 | win_lengths=[600, 1200, 240], 116 | window="hann_window"): 117 | """Initialize Multi resolution STFT loss module. 118 | 119 | Args: 120 | fft_sizes (list): List of FFT sizes. 121 | hop_sizes (list): List of hop sizes. 122 | win_lengths (list): List of window lengths. 123 | window (str): Window function type. 124 | 125 | """ 126 | super(MultiResolutionSTFTLoss, self).__init__() 127 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 128 | self.stft_losses = torch.nn.ModuleList() 129 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 130 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 131 | 132 | def forward(self, x, y): 133 | """Calculate forward propagation. 134 | 135 | Args: 136 | x (Tensor): Predicted signal (B, T). 137 | y (Tensor): Groundtruth signal (B, T). 138 | 139 | Returns: 140 | Tensor: Multi resolution spectral convergence loss value. 141 | Tensor: Multi resolution log STFT magnitude loss value. 
142 | 143 |         """ 144 |         sc_loss = 0.0 145 |         mag_loss = 0.0 146 |         for f in self.stft_losses: 147 |             sc_l, mag_l = f(x, y) 148 |             sc_loss += sc_l 149 |             mag_loss += mag_l 150 |         sc_loss /= len(self.stft_losses) 151 |         mag_loss /= len(self.stft_losses) 152 | 153 |         return sc_loss, mag_loss 154 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import *  # NOQA 2 | from .parallel_wavegan import *  # NOQA 3 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import *  # NOQA 2 | from .radam import *  # NOQA 3 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 4 | 5 | This code is derived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 |     """Rectified Adam optimizer.""" 16 | 17 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 |         """Initialize RAdam optimizer.""" 19 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 |         self.buffer = [[None, None, None] for ind in range(10)] 21 |         super(RAdam, self).__init__(params, defaults) 22 | 23 |     def __setstate__(self, state): 24 |         """Set state.""" 25 |         super(RAdam, self).__setstate__(state) 26 | 27 |     def step(self, closure=None): 28 |         """Run one step.""" 29 |         loss = None 30 |         if closure is not None: 31 |             loss = closure() 32 | 33 |         for group in self.param_groups: 34 | 35 |             for p in group['params']: 36 |                 if p.grad is None: 37 |                     continue 38 |                 grad = p.grad.data.float() 39 |                 if grad.is_sparse: 40 |                     raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 |                 p_data_fp32 = p.data.float() 43 | 44 |                 state = self.state[p] 45 | 46 |                 if len(state) == 0: 47 |                     state['step'] = 0 48 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 |                 else: 51 |                     state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 |                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 |                 beta1, beta2 = group['betas'] 56 | 57 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 58 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad) 59 | 60 |                 state['step'] += 1 61 |                 buffered = self.buffer[int(state['step'] % 10)] 62 |                 if state['step'] == buffered[0]: 63 |                     N_sma, step_size = buffered[1], buffered[2] 64 |                 else: 65 |                     buffered[0] = state['step'] 66 |                     beta2_t = beta2 ** state['step'] 67 |                     N_sma_max = 2 / (1 - beta2) - 1 68 |                     N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 |                     buffered[1] = N_sma 70 | 71 |                     # more conservative since it's an approximated value 72 |                     if N_sma >= 5: 73 |                         step_size = math.sqrt( 74 |                             (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])  # NOQA 75 |                     else: 76 |                         step_size = 1.0 / (1 - beta1 ** state['step']) 77 |                     buffered[2] = step_size 78 | 79 |                 if group['weight_decay'] != 0: 80 | 
p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | import librosa 8 | import torch 9 | 10 | from modules.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft 11 | 12 | 13 | class STFTLoss(torch.nn.Module): 14 | """STFT loss module.""" 15 | 16 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", 17 | use_mel_loss=False): 18 | """Initialize STFT loss module.""" 19 | super(STFTLoss, self).__init__() 20 | self.fft_size = fft_size 21 | self.shift_size = shift_size 22 | self.win_length = win_length 23 | self.window = getattr(torch, window)(win_length) 24 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 25 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 26 | self.use_mel_loss = use_mel_loss 27 | self.mel_basis = None 28 | 29 | def forward(self, x, y): 30 | """Calculate forward propagation. 31 | 32 | Args: 33 | x (Tensor): Predicted signal (B, T). 34 | y (Tensor): Groundtruth signal (B, T). 35 | 36 | Returns: 37 | Tensor: Spectral convergence loss value. 38 | Tensor: Log STFT magnitude loss value. 39 | 40 | """ 41 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 42 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 43 | if self.use_mel_loss: 44 | if self.mel_basis is None: 45 | self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T 46 | x_mag = x_mag @ self.mel_basis 47 | y_mag = y_mag @ self.mel_basis 48 | 49 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 50 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 51 | 52 | return sc_loss, mag_loss 53 | 54 | 55 | class MultiResolutionSTFTLoss(torch.nn.Module): 56 | """Multi resolution STFT loss module.""" 57 | 58 | def __init__(self, 59 | fft_sizes=[1024, 2048, 512], 60 | hop_sizes=[120, 240, 50], 61 | win_lengths=[600, 1200, 240], 62 | window="hann_window", 63 | use_mel_loss=False): 64 | """Initialize Multi resolution STFT loss module. 65 | 66 | Args: 67 | fft_sizes (list): List of FFT sizes. 68 | hop_sizes (list): List of hop sizes. 69 | win_lengths (list): List of window lengths. 70 | window (str): Window function type. 71 | 72 | """ 73 | super(MultiResolutionSTFTLoss, self).__init__() 74 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 75 | self.stft_losses = torch.nn.ModuleList() 76 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 77 | self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)] 78 | 79 | def forward(self, x, y): 80 | """Calculate forward propagation. 81 | 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | 86 | Returns: 87 | Tensor: Multi resolution spectral convergence loss value. 88 | Tensor: Multi resolution log STFT magnitude loss value. 
89 | 90 |         """ 91 |         sc_loss = 0.0 92 |         mag_loss = 0.0 93 |         for f in self.stft_losses: 94 |             sc_l, mag_l = f(x, y) 95 |             sc_loss += sc_l 96 |             mag_loss += mag_l 97 |         sc_loss /= len(self.stft_losses) 98 |         mag_loss /= len(self.stft_losses) 99 | 100 |         return sc_loss, mag_loss 101 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import *  # NOQA 2 | -------------------------------------------------------------------------------- /modules/parallel_wavegan/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | #  MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Utility functions.""" 7 | 8 | import fnmatch 9 | import logging 10 | import os 11 | import sys 12 | 13 | import h5py 14 | import numpy as np 15 | 16 | 17 | def find_files(root_dir, query="*.wav", include_root_dir=True): 18 |     """Find files recursively. 19 | 20 |     Args: 21 |         root_dir (str): Root directory to search. 22 |         query (str): Query to find. 23 |         include_root_dir (bool): If False, the root_dir name is not included in the returned paths. 24 | 25 |     Returns: 26 |         list: List of found filenames. 27 | 28 |     """ 29 |     files = [] 30 |     for root, dirnames, filenames in os.walk(root_dir, followlinks=True): 31 |         for filename in fnmatch.filter(filenames, query): 32 |             files.append(os.path.join(root, filename)) 33 |     if not include_root_dir: 34 |         files = [file_.replace(root_dir + "/", "") for file_ in files] 35 | 36 |     return files 37 | 38 | 39 | def read_hdf5(hdf5_name, hdf5_path): 40 |     """Read hdf5 dataset. 41 | 42 |     Args: 43 |         hdf5_name (str): Filename of hdf5 file. 44 |         hdf5_path (str): Dataset name in hdf5 file. 45 | 46 |     Returns: 47 |         any: Dataset values. 48 | 49 |     """ 50 |     if not os.path.exists(hdf5_name): 51 |         logging.error(f"There is no such hdf5 file ({hdf5_name}).") 52 |         sys.exit(1) 53 | 54 |     hdf5_file = h5py.File(hdf5_name, "r") 55 | 56 |     if hdf5_path not in hdf5_file: 57 |         logging.error(f"There is no such dataset in the hdf5 file ({hdf5_path}).") 58 |         sys.exit(1) 59 | 60 |     hdf5_data = hdf5_file[hdf5_path][()] 61 |     hdf5_file.close() 62 | 63 |     return hdf5_data 64 | 65 | 66 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): 67 |     """Write dataset to hdf5. 68 | 69 |     Args: 70 |         hdf5_name (str): Hdf5 dataset filename. 71 |         hdf5_path (str): Dataset path in hdf5. 72 |         write_data (ndarray): Data to write. 73 |         is_overwrite (bool): Whether to overwrite dataset. 74 | 75 |     """ 76 |     # convert to numpy array 77 |     write_data = np.array(write_data) 78 | 79 |     # check folder existence 80 |     folder_name, _ = os.path.split(hdf5_name) 81 |     if not os.path.exists(folder_name) and len(folder_name) != 0: 82 |         os.makedirs(folder_name) 83 | 84 |     # check hdf5 existence 85 |     if os.path.exists(hdf5_name): 86 |         # if already exists, open with r+ mode 87 |         hdf5_file = h5py.File(hdf5_name, "r+") 88 |         # check dataset existence 89 |         if hdf5_path in hdf5_file: 90 |             if is_overwrite: 91 |                 logging.warning("Dataset in hdf5 file already exists. " 92 |                                 "Recreating dataset in hdf5.") 93 |                 hdf5_file.__delitem__(hdf5_path) 94 |             else: 95 |                 logging.error("Dataset in hdf5 file already exists. 
" 96 | "if you want to overwrite, please set is_overwrite = True.") 97 | hdf5_file.close() 98 | sys.exit(1) 99 | else: 100 | # if not exists, open with w mode 101 | hdf5_file = h5py.File(hdf5_name, "w") 102 | 103 | # write data to hdf5 104 | hdf5_file.create_dataset(hdf5_path, data=write_data) 105 | hdf5_file.flush() 106 | hdf5_file.close() 107 | 108 | 109 | class HDF5ScpLoader(object): 110 | """Loader class for a fests.scp file of hdf5 file. 111 | 112 | Examples: 113 | key1 /some/path/a.h5:feats 114 | key2 /some/path/b.h5:feats 115 | key3 /some/path/c.h5:feats 116 | key4 /some/path/d.h5:feats 117 | ... 118 | >>> loader = HDF5ScpLoader("hdf5.scp") 119 | >>> array = loader["key1"] 120 | 121 | key1 /some/path/a.h5 122 | key2 /some/path/b.h5 123 | key3 /some/path/c.h5 124 | key4 /some/path/d.h5 125 | ... 126 | >>> loader = HDF5ScpLoader("hdf5.scp", "feats") 127 | >>> array = loader["key1"] 128 | 129 | """ 130 | 131 | def __init__(self, feats_scp, default_hdf5_path="feats"): 132 | """Initialize HDF5 scp loader. 133 | 134 | Args: 135 | feats_scp (str): Kaldi-style feats.scp file with hdf5 format. 136 | default_hdf5_path (str): Path in hdf5 file. If the scp contain the info, not used. 137 | 138 | """ 139 | self.default_hdf5_path = default_hdf5_path 140 | with open(feats_scp) as f: 141 | lines = [line.replace("\n", "") for line in f.readlines()] 142 | self.data = {} 143 | for line in lines: 144 | key, value = line.split() 145 | self.data[key] = value 146 | 147 | def get_path(self, key): 148 | """Get hdf5 file path for a given key.""" 149 | return self.data[key] 150 | 151 | def __getitem__(self, key): 152 | """Get ndarray for a given key.""" 153 | p = self.data[key] 154 | if ":" in p: 155 | return read_hdf5(*p.split(":")) 156 | else: 157 | return read_hdf5(p, self.default_hdf5_path) 158 | 159 | def __len__(self): 160 | """Return the length of the scp file.""" 161 | return len(self.data) 162 | 163 | def __iter__(self): 164 | """Return the iterator of the scp file.""" 165 | return iter(self.data) 166 | 167 | def keys(self): 168 | """Return the keys of the scp file.""" 169 | return self.data.keys() 170 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | librosa==0.8.0 3 | tqdm 4 | pandas 5 | numba==0.53.1 6 | numpy==1.19.2 7 | scipy==1.5.4 8 | PyYAML==5.3.1 9 | tensorboardX 10 | pyloudnorm 11 | setuptools>=41.0.0 12 | g2p_en 13 | resemblyzer 14 | webrtcvad 15 | tensorboard==2.6.0 16 | scikit-learn==0.24.1 17 | scikit-image==0.16.2 18 | textgrid 19 | jiwer 20 | pycwt 21 | PyWavelets 22 | praat-parselmouth==0.3.3 23 | jieba 24 | einops 25 | chardet 26 | pretty-midi==0.2.9 27 | pytorch-lightning==0.7.1 28 | h5py==3.1.0 29 | pypinyin==0.39.0 30 | g2pM==0.1.2.5 31 | -------------------------------------------------------------------------------- /requirements_2080.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | alignment==1.0.10 3 | altgraph==0.17 4 | appdirs==1.4.4 5 | async-timeout==3.0.1 6 | audioread==2.1.9 7 | backcall==0.2.0 8 | blinker==1.4 9 | brotlipy==0.7.0 10 | cachetools==4.2.0 11 | certifi==2020.12.5 12 | cffi==1.14.4 13 | chardet==4.0.0 14 | click==7.1.2 15 | cycler==0.10.0 16 | Cython==0.29.21 17 | cytoolz==0.11.0 18 | decorator==4.4.2 19 | Distance==0.1.3 20 | einops==0.3.0 21 | et-xmlfile==1.0.1 22 | fsspec==0.8.4 23 | future==0.18.2 24 | g2p-en==2.1.0 25 | g2pM==0.1.2.5 26 | 
google-auth==1.24.0 27 | google-auth-oauthlib==0.4.2 28 | grpcio==1.34.0 29 | h5py==3.1.0 30 | horology==1.1.0 31 | httplib2==0.18.1 32 | idna==2.10 33 | imageio==2.9.0 34 | inflect==5.0.2 35 | ipdb==0.13.4 36 | ipython==7.19.0 37 | ipython-genutils==0.2.0 38 | jdcal==1.4.1 39 | jedi==0.17.2 40 | jieba==0.42.1 41 | jiwer==2.2.0 42 | joblib==1.0.0 43 | kiwisolver==1.3.1 44 | librosa==0.8.0 45 | llvmlite==0.31.0 46 | Markdown==3.3.3 47 | matplotlib==3.3.3 48 | miditoolkit==0.1.7 49 | mido==1.2.9 50 | music21==5.7.2 51 | networkx==2.5 52 | nltk==3.5 53 | numba==0.48.0 54 | numpy==1.19.4 55 | oauth2client==4.1.3 56 | oauthlib==3.1.0 57 | olefile==0.46 58 | packaging==20.7 59 | pandas==1.2.0 60 | parso==0.7.1 61 | patsy==0.5.1 62 | pexpect==4.8.0 63 | pickleshare==0.7.5 64 | Pillow==8.0.1 65 | pooch==1.3.0 66 | praat-parselmouth==0.3.3 67 | prompt-toolkit==3.0.8 68 | protobuf==3.13.0 69 | ptyprocess==0.6.0 70 | pyasn1==0.4.8 71 | pyasn1-modules==0.2.8 72 | pycparser==2.20 73 | pycwt==0.3.0a22 74 | Pygments==2.7.3 75 | PyInstaller==3.6 76 | PyJWT==1.7.1 77 | pyloudnorm==0.1.0 78 | pyparsing==2.4.7 79 | pypinyin==0.39.0 80 | PySocks==1.7.1 81 | python-dateutil==2.8.1 82 | python-Levenshtein==0.12.0 83 | pytorch-lightning==0.7.1 84 | pytz==2020.5 85 | PyWavelets==1.1.1 86 | pyworld==0.2.12 87 | PyYAML==5.3.1 88 | regex==2020.11.13 89 | requests==2.25.1 90 | requests-oauthlib==1.3.0 91 | resampy==0.2.2 92 | Resemblyzer==0.1.1.dev0 93 | rsa==4.6 94 | scikit-image==0.16.2 95 | scikit-learn==0.22.2.post1 96 | scipy==1.5.4 97 | six==1.15.0 98 | SoundFile==0.10.3.post1 99 | stopit==1.1.1 100 | tensorboard==2.4.0 101 | tensorboard-plugin-wit==1.7.0 102 | tensorboardX==2.1 103 | TextGrid==1.5 104 | threadpoolctl==2.1.0 105 | toolz==0.11.1 106 | torch==1.6.0 107 | torchaudio==0.6.0 108 | torchvision==0.7.0 109 | tqdm==4.54.1 110 | traitlets==5.0.5 111 | typing==3.7.4.3 112 | urllib3==1.26.2 113 | uuid==1.30 114 | wcwidth==0.2.5 115 | webencodings==0.5.1 116 | webrtcvad==2.0.10 117 | Werkzeug==1.0.1 118 | pretty-midi==0.2.9 119 | -------------------------------------------------------------------------------- /requirements_3090.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.15.0 2 | appdirs==1.4.4 3 | audioread==2.1.9 4 | beautifulsoup4==4.10.0 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.7 8 | cycler==0.11.0 9 | Cython==0.29.24 10 | decorator==4.4.2 11 | dlib==19.22.1 12 | einops==0.3.2 13 | future==0.18.2 14 | g2p-en==2.1.0 15 | google==3.0.0 16 | grpcio==1.42.0 17 | h5py==2.8.0 18 | horology==1.2.0 19 | idna==3.3 20 | imageio==2.10.1 21 | imageio-ffmpeg==0.4.5 22 | importlib-metadata==4.8.1 23 | joblib==1.1.0 24 | kiwisolver==1.3.2 25 | librosa==0.8.0 26 | llvmlite==0.31.0 27 | Markdown==3.3.4 28 | matplotlib==3.4.3 29 | miditoolkit==0.1.7 30 | moviepy==1.0.3 31 | numba==0.48.0 32 | numpy==1.20.0 33 | opencv-python==4.5.4.58 34 | packaging==21.2 35 | pandas==1.3.4 36 | Pillow==8.4.0 37 | pooch==1.5.2 38 | praat-parselmouth==0.3.3 39 | proglog==0.1.9 40 | protobuf==3.19.1 41 | pycparser==2.20 42 | pycwt==0.3.0a22 43 | pydub==0.25.1 44 | pyloudnorm==0.1.0 45 | pyparsing==2.4.7 46 | pypinyin==0.43.0 47 | python-dateutil==2.8.2 48 | pytorch-lightning==0.7.1 49 | pytorch-ssim==0.1 50 | pytz==2021.3 51 | pyworld==0.3.0 52 | PyYAML==6.0 53 | requests==2.26.0 54 | resampy==0.2.2 55 | Resemblyzer==0.1.1.dev0 56 | scikit-image==0.16.2 57 | scikit-learn==0.22 58 | scipy==1.3.0 59 | six==1.16.0 60 | sklearn==0.0 61 | SoundFile==0.10.3.post1 
62 | soupsieve==2.3 63 | sympy==1.9 64 | tensorboard==1.15.0 65 | tensorboardX==2.4 66 | test-tube==0.7.5 67 | TextGrid==1.5 68 | torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl 69 | torchvision==0.9.1 70 | tqdm==4.62.3 71 | typing-extensions==3.10.0.2 72 | urllib3==1.26.7 73 | uuid==1.30 74 | webrtcvad==2.0.10 75 | Werkzeug==2.0.2 76 | zipp==3.6.0 77 | -------------------------------------------------------------------------------- /resources/apply_form.md: -------------------------------------------------------------------------------- 1 | # How to apply for PopCS 2 | Thank you for your interest in our work. Please send an email to jinglinliu@zju.edu.cn containing: 3 | 4 | " 5 | 6 | name: *** 7 | 8 | affiliation: *** (school or institution) 9 | 10 | research fields: *** 11 | 12 | We want to apply for PopCS and agree to the dataset license: CC BY-NC-SA 4.0 (non-commercial use only!). 13 | 14 | We accept full responsibility for our use of the dataset and shall defend and indemnify the authors of DiffSinger against any and all claims arising from our use of the dataset, including but not limited to our use of any copies of copyrighted audio files that we may create from the dataset. 15 | 16 | We hereby represent that we are fully authorized to enter into this agreement on behalf of our employer. 17 | 18 | We will cite your paper if this code or data is used. We will not distribute the download link to others without informing the authors of DiffSinger. 19 | 20 | " 21 | 22 | We will then send you the download link. 23 | 24 | **Please note that by using PopCS you accept the terms above.** 25 | 26 | **Please use your official email address (like xxx@zju.edu.cn)!
Thank you!** -------------------------------------------------------------------------------- /resources/diffspeech-fs2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2-1.png -------------------------------------------------------------------------------- /resources/diffspeech-fs2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2-2.png -------------------------------------------------------------------------------- /resources/diffspeech-fs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/diffspeech-fs2.png -------------------------------------------------------------------------------- /resources/model_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/model_a.png -------------------------------------------------------------------------------- /resources/model_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/model_b.png -------------------------------------------------------------------------------- /resources/tfb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/resources/tfb.png -------------------------------------------------------------------------------- /tasks/run.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils.hparams import set_hparams, hparams 3 | 4 | 5 | def run_task(): 6 | assert hparams['task_cls'] != '' 7 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 8 | cls_name = hparams["task_cls"].split(".")[-1] 9 | task_cls = getattr(importlib.import_module(pkg), cls_name) 10 | task_cls.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | set_hparams() 15 | run_task() 16 | -------------------------------------------------------------------------------- /tasks/tts/pe.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | import torch 5 | import numpy as np 6 | import os 7 | 8 | from tasks.base_task import BaseDataset 9 | from tasks.tts.fs2 import FastSpeech2Task 10 | from modules.fastspeech.pe import PitchExtractor 11 | import utils 12 | from utils.indexed_datasets import IndexedDataset 13 | from utils.hparams import hparams 14 | from utils.plot import f0_to_figure 15 | from utils.pitch_utils import norm_interp_f0, denorm_f0 16 | 17 | 18 | class PeDataset(BaseDataset): 19 | def __init__(self, prefix, shuffle=False): 20 | super().__init__(shuffle) 21 | self.data_dir = hparams['binary_data_dir'] 22 | self.prefix = prefix 23 | self.hparams = hparams 24 | self.sizes = np.load(f'{self.data_dir}/{self.prefix}_lengths.npy') 25 | self.indexed_ds = None 26 | 27 | # pitch stats 28 | f0_stats_fn = f'{self.data_dir}/train_f0s_mean_std.npy' 
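        # NOTE: train_f0s_mean_std.npy is assumed to hold the training-set [f0_mean, f0_std]; when it exists, the values are copied into hparams below so predicted f0 can later be denormalized.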
29 | if os.path.exists(f0_stats_fn): 30 | hparams['f0_mean'], hparams['f0_std'] = self.f0_mean, self.f0_std = np.load(f0_stats_fn) 31 | hparams['f0_mean'] = float(hparams['f0_mean']) 32 | hparams['f0_std'] = float(hparams['f0_std']) 33 | else: 34 | hparams['f0_mean'], hparams['f0_std'] = self.f0_mean, self.f0_std = None, None 35 | 36 | if prefix == 'test': 37 | if hparams['num_test_samples'] > 0: 38 | self.avail_idxs = list(range(hparams['num_test_samples'])) + hparams['test_ids'] 39 | self.sizes = [self.sizes[i] for i in self.avail_idxs] 40 | 41 | def _get_item(self, index): 42 | if hasattr(self, 'avail_idxs') and self.avail_idxs is not None: 43 | index = self.avail_idxs[index] 44 | if self.indexed_ds is None: 45 | self.indexed_ds = IndexedDataset(f'{self.data_dir}/{self.prefix}') 46 | return self.indexed_ds[index] 47 | 48 | def __getitem__(self, index): 49 | hparams = self.hparams 50 | item = self._get_item(index) 51 | max_frames = hparams['max_frames'] 52 | spec = torch.Tensor(item['mel'])[:max_frames] 53 | # mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None 54 | f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams) 55 | pitch = torch.LongTensor(item.get("pitch"))[:max_frames] 56 | # print(item.keys(), item['mel'].shape, spec.shape) 57 | sample = { 58 | "id": index, 59 | "item_name": item['item_name'], 60 | "text": item['txt'], 61 | "mel": spec, 62 | "pitch": pitch, 63 | "f0": f0, 64 | "uv": uv, 65 | # "mel2ph": mel2ph, 66 | # "mel_nonpadding": spec.abs().sum(-1) > 0, 67 | } 68 | return sample 69 | 70 | def collater(self, samples): 71 | if len(samples) == 0: 72 | return {} 73 | id = torch.LongTensor([s['id'] for s in samples]) 74 | item_names = [s['item_name'] for s in samples] 75 | text = [s['text'] for s in samples] 76 | f0 = utils.collate_1d([s['f0'] for s in samples], 0.0) 77 | pitch = utils.collate_1d([s['pitch'] for s in samples]) 78 | uv = utils.collate_1d([s['uv'] for s in samples]) 79 | mels = utils.collate_2d([s['mel'] for s in samples], 0.0) 80 | mel_lengths = torch.LongTensor([s['mel'].shape[0] for s in samples]) 81 | # mel2ph = utils.collate_1d([s['mel2ph'] for s in samples], 0.0) \ 82 | # if samples[0]['mel2ph'] is not None else None 83 | # mel_nonpaddings = utils.collate_1d([s['mel_nonpadding'].float() for s in samples], 0.0) 84 | 85 | batch = { 86 | 'id': id, 87 | 'item_name': item_names, 88 | 'nsamples': len(samples), 89 | 'text': text, 90 | 'mels': mels, 91 | 'mel_lengths': mel_lengths, 92 | 'pitch': pitch, 93 | # 'mel2ph': mel2ph, 94 | # 'mel_nonpaddings': mel_nonpaddings, 95 | 'f0': f0, 96 | 'uv': uv, 97 | } 98 | return batch 99 | 100 | 101 | class PitchExtractionTask(FastSpeech2Task): 102 | def __init__(self): 103 | super().__init__() 104 | self.dataset_cls = PeDataset 105 | 106 | def build_tts_model(self): 107 | self.model = PitchExtractor(conv_layers=hparams['pitch_extractor_conv_layers']) 108 | 109 | # def build_scheduler(self, optimizer): 110 | # return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5) 111 | def _training_step(self, sample, batch_idx, _): 112 | loss_output = self.run_model(self.model, sample) 113 | total_loss = sum([v for v in loss_output.values() if isinstance(v, torch.Tensor) and v.requires_grad]) 114 | loss_output['batch_size'] = sample['mels'].size()[0] 115 | return total_loss, loss_output 116 | 117 | def validation_step(self, sample, batch_idx): 118 | outputs = {} 119 | outputs['losses'] = {} 120 | outputs['losses'], model_out = self.run_model(self.model, sample, 
return_output=True, infer=True) 121 | outputs['total_loss'] = sum(outputs['losses'].values()) 122 | outputs['nsamples'] = sample['nsamples'] 123 | outputs = utils.tensors_to_scalars(outputs) 124 | if batch_idx < hparams['num_valid_plots']: 125 | self.plot_pitch(batch_idx, model_out, sample) 126 | return outputs 127 | 128 | def run_model(self, model, sample, return_output=False, infer=False): 129 | f0 = sample['f0'] 130 | uv = sample['uv'] 131 | output = model(sample['mels']) 132 | losses = {} 133 | self.add_pitch_loss(output, sample, losses) 134 | if not return_output: 135 | return losses 136 | else: 137 | return losses, output 138 | 139 | def plot_pitch(self, batch_idx, model_out, sample): 140 | gt_f0 = denorm_f0(sample['f0'], sample['uv'], hparams) 141 | self.logger.experiment.add_figure( 142 | f'f0_{batch_idx}', 143 | f0_to_figure(gt_f0[0], None, model_out['f0_denorm_pred'][0]), 144 | self.global_step) 145 | 146 | def add_pitch_loss(self, output, sample, losses): 147 | # mel2ph = sample['mel2ph'] # [B, T_s] 148 | mel = sample['mels'] 149 | f0 = sample['f0'] 150 | uv = sample['uv'] 151 | # nonpadding = (mel2ph != 0).float() if hparams['pitch_type'] == 'frame' \ 152 | # else (sample['txt_tokens'] != 0).float() 153 | nonpadding = (mel.abs().sum(-1) > 0).float() # sample['mel_nonpaddings'] 154 | # print(nonpadding[0][-8:], nonpadding.shape) 155 | self.add_f0_loss(output['pitch_pred'], f0, uv, losses, nonpadding=nonpadding) -------------------------------------------------------------------------------- /tasks/tts/tts.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | 3 | import matplotlib 4 | 5 | from utils.pl_utils import data_loader 6 | from utils.training_utils import RSQRTSchedule 7 | from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder 8 | from modules.fastspeech.pe import PitchExtractor 9 | 10 | matplotlib.use('Agg') 11 | import os 12 | import numpy as np 13 | from tqdm import tqdm 14 | import torch.distributed as dist 15 | 16 | from tasks.base_task import BaseTask 17 | from utils.hparams import hparams 18 | from utils.text_encoder import TokenTextEncoder 19 | import json 20 | 21 | import torch 22 | import torch.optim 23 | import torch.utils.data 24 | import utils 25 | 26 | 27 | 28 | class TtsTask(BaseTask): 29 | def __init__(self, *args, **kwargs): 30 | self.vocoder = None 31 | self.phone_encoder = self.build_phone_encoder(hparams['binary_data_dir']) 32 | self.padding_idx = self.phone_encoder.pad() 33 | self.eos_idx = self.phone_encoder.eos() 34 | self.seg_idx = self.phone_encoder.seg() 35 | self.saving_result_pool = None 36 | self.saving_results_futures = None 37 | self.stats = {} 38 | super().__init__(*args, **kwargs) 39 | 40 | def build_scheduler(self, optimizer): 41 | return RSQRTSchedule(optimizer) 42 | 43 | def build_optimizer(self, model): 44 | self.optimizer = optimizer = torch.optim.AdamW( 45 | model.parameters(), 46 | lr=hparams['lr']) 47 | return optimizer 48 | 49 | def build_dataloader(self, dataset, shuffle, max_tokens=None, max_sentences=None, 50 | required_batch_size_multiple=-1, endless=False, batch_by_size=True): 51 | devices_cnt = torch.cuda.device_count() 52 | if devices_cnt == 0: 53 | devices_cnt = 1 54 | if required_batch_size_multiple == -1: 55 | required_batch_size_multiple = devices_cnt 56 | 57 | def shuffle_batches(batches): 58 | np.random.shuffle(batches) 59 | return batches 60 | 61 | if max_tokens is not None: 62 | max_tokens *= devices_cnt 63 | if max_sentences is not 
None: 64 | max_sentences *= devices_cnt 65 | indices = dataset.ordered_indices() 66 | if batch_by_size: 67 | batch_sampler = utils.batch_by_size( 68 | indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences, 69 | required_batch_size_multiple=required_batch_size_multiple, 70 | ) 71 | else: 72 | batch_sampler = [] 73 | for i in range(0, len(indices), max_sentences): 74 | batch_sampler.append(indices[i:i + max_sentences]) 75 | 76 | if shuffle: 77 | batches = shuffle_batches(list(batch_sampler)) 78 | if endless: 79 | batches = [b for _ in range(1000) for b in shuffle_batches(list(batch_sampler))] 80 | else: 81 | batches = batch_sampler 82 | if endless: 83 | batches = [b for _ in range(1000) for b in batches] 84 | num_workers = dataset.num_workers 85 | if self.trainer.use_ddp: 86 | num_replicas = dist.get_world_size() 87 | rank = dist.get_rank() 88 | batches = [x[rank::num_replicas] for x in batches if len(x) % num_replicas == 0] 89 | return torch.utils.data.DataLoader(dataset, 90 | collate_fn=dataset.collater, 91 | batch_sampler=batches, 92 | num_workers=num_workers, 93 | pin_memory=False) 94 | 95 | def build_phone_encoder(self, data_dir): 96 | phone_list_file = os.path.join(data_dir, 'phone_set.json') 97 | 98 | phone_list = json.load(open(phone_list_file)) 99 | return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',') 100 | 101 | def build_optimizer(self, model): 102 | self.optimizer = optimizer = torch.optim.AdamW( 103 | model.parameters(), 104 | lr=hparams['lr']) 105 | return optimizer 106 | 107 | def test_start(self): 108 | self.saving_result_pool = Pool(8) 109 | self.saving_results_futures = [] 110 | self.vocoder: BaseVocoder = get_vocoder_cls(hparams)() 111 | if hparams.get('pe_enable') is not None and hparams['pe_enable']: 112 | self.pe = PitchExtractor().cuda() 113 | utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True) 114 | self.pe.eval() 115 | def test_end(self, outputs): 116 | self.saving_result_pool.close() 117 | [f.get() for f in tqdm(self.saving_results_futures)] 118 | self.saving_result_pool.join() 119 | return {} 120 | 121 | ########## 122 | # utils 123 | ########## 124 | def weights_nonzero_speech(self, target): 125 | # target : B x T x mel 126 | # Assign weight 1.0 to all labels except for padding (id=0). 
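        # NOTE: concretely, a frame whose mel bins are all zero (abs-sum == 0) is treated as padding and gets weight 0, with the 0/1 weight repeated across all mel bins.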
127 | dim = target.size(-1) 128 | return target.abs().sum(-1, keepdim=True).ne(0).float().repeat(1, 1, dim) 129 | 130 | if __name__ == '__main__': 131 | TtsTask.start() 132 | -------------------------------------------------------------------------------- /usr/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/usr/.gitkeep -------------------------------------------------------------------------------- /usr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MoonInTheRiver/DiffSinger/ce7789f1427ddcdec647b3ab2bf2d1b12134e51e/usr/__init__.py -------------------------------------------------------------------------------- /usr/configs/base.yaml: -------------------------------------------------------------------------------- 1 | task_cls: usr.task.DiffFsTask 2 | pitch_type: frame 3 | timesteps: 100 4 | dilation_cycle_length: 1 5 | residual_layers: 20 6 | residual_channels: 256 7 | lr: 0.001 8 | decay_steps: 50000 9 | keep_bins: 80 10 | spec_min: [ ] 11 | spec_max: [ ] 12 | 13 | content_cond_steps: [ ] # [ 0, 10000 ] 14 | spk_cond_steps: [ ] # [ 0, 10000 ] 15 | # train and eval 16 | fs2_ckpt: '' 17 | max_updates: 400000 18 | # max_updates: 200000 19 | use_gt_dur: true 20 | use_gt_f0: true 21 | gen_tgt_spk_id: -1 22 | max_sentences: 48 23 | num_sanity_val_steps: 1 24 | num_valid_plots: 1 25 | -------------------------------------------------------------------------------- /usr/configs/lj_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | - ./base.yaml 4 | # spec_min and spec_max are calculated on the training set. 
5 | spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672, 6 | -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759, 7 | -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733, 8 | -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510, 9 | -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916, 10 | -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875, 11 | -5.0483, -5.0848, -5.1809, -5.0677, -5.0015, -5.0792, -5.0636, -5.2413, 12 | -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173, 13 | -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757, 14 | -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ] 15 | spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.7093, 16 | 0.6461, 0.6420, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591, 17 | 0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492, 18 | 0.6909, 0.6106, 0.5761, 0.5936, 0.5638, 0.4054, 0.4545, 0.3589, 19 | 0.3037, 0.3380, 0.1599, 0.2433, 0.2741, 0.2130, 0.1569, 0.1911, 20 | 0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933, 21 | -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405, 22 | -0.1244, -0.2116, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000, 23 | 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566, 24 | 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ] 25 | 26 | task_cls: usr.diffspeech_task.DiffSpeechTask 27 | vocoder: vocoders.hifigan.HifiGAN 28 | vocoder_ckpt: checkpoints/0414_hifi_lj_1 29 | num_valid_plots: 10 30 | use_gt_dur: false 31 | use_gt_f0: false 32 | pitch_type: cwt 33 | pitch_extractor: 'parselmouth' 34 | max_updates: 160000 35 | lr: 0.001 36 | timesteps: 100 37 | K_step: 71 38 | diff_loss_type: l1 39 | diff_decoder_type: 'wavenet' 40 | schedule_type: 'linear' 41 | max_beta: 0.06 42 | fs2_ckpt: checkpoints/fs2_lj_1/model_ckpt_steps_150000.ckpt 43 | save_gt: true -------------------------------------------------------------------------------- /usr/configs/lj_ds_pndm.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./lj_ds_beta6.yaml 3 | 4 | fs2_ckpt: '' 5 | gaussian_start: True 6 | max_beta: 0.02 7 | timesteps: 1000 8 | K_step: 1000 9 | pndm_speedup: 10 10 | 11 | pitch_type: frame 12 | use_pitch_embed: false # using diffusion to model pitch curve 13 | lambda_f0: 0. 14 | lambda_uv: 0. 15 | #rel_pos: true 16 | 17 | max_updates: 320000 18 | -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/aux_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 
9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binarization_args: 14 | with_wav: true 15 | with_spk_embed: false 16 | with_align: true 17 | raw_data_dir: 'data/raw/opencpop/segments' 18 | processed_data_dir: 'xxx' 19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 20 | 21 | 22 | binary_data_dir: 'data/binary/opencpop-midi-dp' 23 | use_midi: true # for midi exp 24 | use_gt_f0: false # for midi exp 25 | use_gt_dur: false # for further midi exp 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | #lambda_energy: 0.1 29 | lambda_ph_dur: 1.0 30 | lambda_sent_dur: 1.0 31 | lambda_word_dur: 1.0 32 | predictor_grad: 0.1 33 | pe_enable: false 34 | pe_ckpt: '' 35 | 36 | num_spk: 1 37 | test_prefixes: [ 38 | '2044', 39 | '2086', 40 | '2092', 41 | '2093', 42 | '2100', 43 | ] 44 | 45 | task_cls: usr.diffsinger_task.AuxDecoderMIDITask 46 | #vocoder: usr.singingvocoder.highgan.HighGAN 47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 48 | vocoder: vocoders.hifigan.HifiGAN 49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 50 | 51 | use_nsf: true 52 | 53 | # config for experiments 54 | max_frames: 5000 55 | max_tokens: 40000 56 | predictor_layers: 5 57 | rel_pos: true 58 | dur_predictor_layers: 5 # * 59 | 60 | use_spk_embed: false 61 | num_valid_plots: 10 62 | max_updates: 160000 63 | save_gt: true -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/ds60_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_f0: false # for midi exp 11 | use_gt_dur: false # for further midi exp 12 | lambda_f0: 1.0 13 | lambda_uv: 1.0 14 | #lambda_energy: 0.1 15 | lambda_ph_dur: 1.0 16 | lambda_sent_dur: 1.0 17 | lambda_word_dur: 1.0 18 | predictor_grad: 0.1 19 | pe_enable: false 20 | pe_ckpt: '' 21 | 22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # 23 | #num_valid_plots: 0 24 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 25 | 26 | K_step: 60 27 | max_tokens: 40000 28 | predictor_layers: 5 29 | dilation_cycle_length: 4 # * 30 | rel_pos: true 31 | dur_predictor_layers: 5 # * 32 | max_updates: 160000 33 | gaussian_start: false 34 | -------------------------------------------------------------------------------- /usr/configs/midi/cascade/opencs/opencpop_statis.yaml: -------------------------------------------------------------------------------- 1 | spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 2 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 3 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 4 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 5 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 6 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 7 | -6., -6., -6., -6., -6., -6., -6., -6.] 
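# NOTE (assumption): spec_min / spec_max are per-mel-bin statistics of the Opencpop training set (spec_min clipped to a -6 floor), used to normalize mel-spectrograms for the diffusion decoder.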
8 | spec_max: [-7.9453e-01, -8.1116e-01, -6.1631e-01, -3.0679e-01, -1.3863e-01, 9 | -5.0652e-02, -1.1563e-01, -1.0679e-01, -9.1068e-02, -6.2174e-02, 10 | -7.5302e-02, -7.2217e-02, -6.3815e-02, -7.3299e-02, 7.3610e-03, 11 | -7.2508e-02, -5.0234e-02, -1.6534e-01, -2.6928e-01, -2.0782e-01, 12 | -2.0823e-01, -1.1702e-01, -7.0128e-02, -6.5868e-02, -1.2675e-02, 13 | 1.5121e-03, -8.9902e-02, -2.1392e-01, -2.3789e-01, -2.8922e-01, 14 | -3.0405e-01, -2.3029e-01, -2.2088e-01, -2.1542e-01, -2.9367e-01, 15 | -3.0137e-01, -3.8281e-01, -4.3590e-01, -2.8681e-01, -4.6855e-01, 16 | -5.7485e-01, -4.7022e-01, -5.4266e-01, -4.4848e-01, -6.4120e-01, 17 | -6.8700e-01, -6.4860e-01, -7.6436e-01, -4.9971e-01, -7.1068e-01, 18 | -6.9724e-01, -6.1487e-01, -5.5843e-01, -6.9773e-01, -5.7502e-01, 19 | -7.0919e-01, -8.2431e-01, -8.4213e-01, -9.0431e-01, -8.2840e-01, 20 | -7.7945e-01, -8.2758e-01, -8.7699e-01, -1.0532e+00, -1.0766e+00, 21 | -1.1198e+00, -1.0185e+00, -9.8983e-01, -1.0001e+00, -1.0756e+00, 22 | -1.0024e+00, -1.0304e+00, -1.0579e+00, -1.0188e+00, -1.0500e+00, 23 | -1.0842e+00, -1.0923e+00, -1.1223e+00, -1.2381e+00, -1.6467e+00] 24 | 25 | mel_vmin: -6. #-6. 26 | mel_vmax: 1.5 27 | wav2spec_eps: 1e-6 28 | 29 | raw_data_dir: 'data/raw/opencpop/segments' 30 | processed_data_dir: 'xxx' 31 | binary_data_dir: 'data/binary/opencpop-midi-dp' 32 | datasets: [ 33 | 'opencpop', 34 | ] 35 | test_prefixes: [ 36 | '2044', 37 | '2086', 38 | '2092', 39 | '2093', 40 | '2100', 41 | ] 42 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/opencpop/ds1000.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | # for diffusion schedule 23 | timesteps: 1000 24 | K_step: 1000 25 | max_beta: 0.02 26 | max_tokens: 36000 27 | max_updates: 320000 28 | gaussian_start: True 29 | pndm_speedup: 40 30 | 31 | use_pitch_embed: false 32 | use_gt_f0: false # for midi exp 33 | 34 | lambda_f0: 0. 35 | lambda_uv: 0. 
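# NOTE: f0/uv losses are zeroed and use_pitch_embed is false because in this end-to-end setting the diffusion decoder is assumed to model the pitch contour directly (cf. the same setting in lj_ds_pndm.yaml).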
36 | dilation_cycle_length: 4 # * 37 | rel_pos: true 38 | predictor_layers: 5 39 | pe_enable: true 40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 41 | 42 | 43 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/popcs/popcs_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer 6 | binary_data_dir: 'data/binary/popcs-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /usr/configs/midi/pe.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | 4 | max_frames: 8000 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binary_data_dir: 'xxx' 14 | 15 | pitch_type: frame 16 | task_cls: tasks.tts.pe.PitchExtractionTask 17 | pitch_extractor_conv_layers: 2 18 | 19 | 20 | # config for experiments 21 | max_tokens: 20000 22 | use_spk_embed: false 23 | num_valid_plots: 10 24 | max_updates: 60000 -------------------------------------------------------------------------------- /usr/configs/popcs_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | - ./base.yaml 5 | 6 | audio_sample_rate: 24000 7 | hop_size: 128 # Hop size. 
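# NOTE: at 24 kHz a hop size of 128 samples corresponds to a frame shift of about 5.3 ms.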
8 | fft_size: 512 # FFT size. 9 | win_size: 512 # FFT size. 10 | fmin: 30 11 | fmax: 12000 12 | min_level_db: -120 13 | 14 | binarization_args: 15 | with_wav: true 16 | with_spk_embed: false 17 | with_align: true 18 | raw_data_dir: 'data/raw/popcs' 19 | processed_data_dir: 'data/processed/popcs' 20 | binary_data_dir: 'data/binary/popcs-pmf0' 21 | num_spk: 1 22 | datasets: [ 23 | 'popcs', 24 | ] 25 | test_prefixes: [ 26 | 'popcs-说散就散', 27 | 'popcs-隐形的翅膀', 28 | ] 29 | 30 | spec_min: [-6.8276, -7.0270, -6.8142, -7.1429, -7.6669, -7.6000, -7.1148, -6.9640, 31 | -6.8414, -6.6596, -6.6880, -6.7439, -6.7986, -7.4940, -7.7845, -7.6586, 32 | -6.9288, -6.7639, -6.9118, -6.8246, -6.7183, -7.1769, -6.9794, -7.4513, 33 | -7.3422, -7.5623, -6.9610, -6.8158, -6.9595, -6.8403, -6.5688, -6.6356, 34 | -7.0209, -6.5002, -6.7819, -6.5232, -6.6927, -6.5701, -6.5531, -6.7069, 35 | -6.6462, -6.4523, -6.5954, -6.4264, -6.4487, -6.7070, -6.4025, -6.3042, 36 | -6.4008, -6.3857, -6.3903, -6.3094, -6.2491, -6.3518, -6.3566, -6.4168, 37 | -6.2481, -6.3624, -6.2858, -6.2575, -6.3638, -6.4520, -6.1835, -6.2754, 38 | -6.1253, -6.1645, -6.0638, -6.1262, -6.0710, -6.1039, -6.4428, -6.1363, 39 | -6.1054, -6.1252, -6.1797, -6.0235, -6.0758, -5.9453, -6.0213, -6.0446] 40 | spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.1212, 41 | 0.2421, 0.1809, 0.2134, 0.3161, 0.3301, 0.3289, 0.2667, 0.2421, 42 | 0.2581, 0.2600, 0.1394, 0.1907, 0.1082, 0.1474, 0.1680, 0.2550, 43 | 0.1057, 0.0826, 0.0423, 0.1203, -0.0701, -0.0056, 0.0477, -0.0639, 44 | -0.0272, -0.0728, -0.1648, -0.0855, -0.2652, -0.1998, -0.1547, -0.2167, 45 | -0.4181, -0.5463, -0.4161, -0.4733, -0.6518, -0.5387, -0.4290, -0.4191, 46 | -0.4151, -0.3042, -0.3810, -0.4160, -0.4496, -0.2847, -0.4676, -0.4658, 47 | -0.4931, -0.4885, -0.5547, -0.5481, -0.6948, -0.7968, -0.8455, -0.8392, 48 | -0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035, 49 | -0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766] 50 | 51 | task_cls: usr.diffsinger_task.DiffSingerTask 52 | #vocoder: usr.singingvocoder.highgan.HighGAN 53 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 54 | vocoder: vocoders.hifigan.HifiGAN 55 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 56 | 57 | pitch_extractor: 'parselmouth' 58 | # config for experiments 59 | use_spk_embed: false 60 | num_valid_plots: 10 61 | max_updates: 160000 62 | lr: 0.001 63 | timesteps: 100 64 | K_step: 51 65 | diff_loss_type: l1 66 | diff_decoder_type: 'wavenet' 67 | schedule_type: 'linear' 68 | max_beta: 0.06 69 | fs2_ckpt: '' 70 | use_nsf: true -------------------------------------------------------------------------------- /usr/configs/popcs_ds_beta6_offline.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./popcs_ds_beta6.yaml 3 | 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer 5 | num_valid_plots: 0 6 | task_cls: usr.diffsinger_task.DiffSingerOfflineTask 7 | 8 | # tmp: 9 | #pe_enable: true 10 | #pe_ckpt: '' 11 | vocoder: vocoders.hifigan.HifiGAN 12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 -------------------------------------------------------------------------------- /usr/configs/popcs_fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | 4 | audio_sample_rate: 24000 5 | hop_size: 128 # Hop size. 6 | fft_size: 512 # FFT size. 
7 | win_size: 512 # FFT size. 8 | fmin: 30 9 | fmax: 12000 10 | min_level_db: -120 11 | 12 | binarization_args: 13 | with_wav: true 14 | with_spk_embed: false 15 | with_align: true 16 | raw_data_dir: 'data/raw/popcs' 17 | processed_data_dir: 'data/processed/popcs' 18 | binary_data_dir: 'data/binary/popcs-pmf0' 19 | num_spk: 1 20 | datasets: [ 21 | 'popcs', 22 | ] 23 | test_prefixes: [ 24 | 'popcs-说散就散', 25 | 'popcs-隐形的翅膀', 26 | ] 27 | 28 | task_cls: tasks.tts.fs2.FastSpeech2Task 29 | #vocoder: usr.singingvocoder.highgan.HighGAN 30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 31 | vocoder: vocoders.hifigan.HifiGAN 32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 33 | use_nsf: true 34 | 35 | # config for experiments 36 | max_tokens: 18000 37 | use_spk_embed: false 38 | num_valid_plots: 10 39 | max_updates: 160000 40 | save_gt: true 41 | 42 | # tmp: 43 | #pe_enable: true 44 | #pe_ckpt: '' -------------------------------------------------------------------------------- /usr/diff/candidate_decoder.py: -------------------------------------------------------------------------------- 1 | from modules.fastspeech.tts_modules import FastspeechDecoder 2 | # from modules.fastspeech.fast_tacotron import DecoderRNN 3 | # from modules.fastspeech.speedy_speech.speedy_speech import ConvBlocks 4 | # from modules.fastspeech.conformer.conformer import ConformerDecoder 5 | import torch 6 | from torch.nn import functional as F 7 | import torch.nn as nn 8 | import math 9 | from utils.hparams import hparams 10 | from .diffusion import Mish 11 | Linear = nn.Linear 12 | 13 | 14 | class SinusoidalPosEmb(nn.Module): 15 | def __init__(self, dim): 16 | super().__init__() 17 | self.dim = dim 18 | 19 | def forward(self, x): 20 | device = x.device 21 | half_dim = self.dim // 2 22 | emb = math.log(10000) / (half_dim - 1) 23 | emb = torch.exp(torch.arange(half_dim, device=device) * -emb) 24 | emb = x[:, None] * emb[None, :] 25 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 26 | return emb 27 | 28 | 29 | def Conv1d(*args, **kwargs): 30 | layer = nn.Conv1d(*args, **kwargs) 31 | nn.init.kaiming_normal_(layer.weight) 32 | return layer 33 | 34 | 35 | class FFT(FastspeechDecoder): 36 | def __init__(self, hidden_size=None, num_layers=None, kernel_size=None, num_heads=None): 37 | super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads) 38 | dim = hparams['residual_channels'] 39 | self.input_projection = Conv1d(hparams['audio_num_mel_bins'], dim, 1) 40 | self.diffusion_embedding = SinusoidalPosEmb(dim) 41 | self.mlp = nn.Sequential( 42 | nn.Linear(dim, dim * 4), 43 | Mish(), 44 | nn.Linear(dim * 4, dim) 45 | ) 46 | self.get_mel_out = Linear(hparams['hidden_size'], 80, bias=True) 47 | self.get_decode_inp = Linear(hparams['hidden_size'] + dim + dim, 48 | hparams['hidden_size']) # hs + dim + 80 -> hs 49 | 50 | def forward(self, spec, diffusion_step, cond, padding_mask=None, attn_mask=None, return_hiddens=False): 51 | """ 52 | :param spec: [B, 1, 80, T] 53 | :param diffusion_step: [B, 1] 54 | :param cond: [B, M, T] 55 | :return: 56 | """ 57 | x = spec[:, 0] 58 | x = self.input_projection(x).permute([0, 2, 1]) # [B, T, residual_channel] 59 | diffusion_step = self.diffusion_embedding(diffusion_step) 60 | diffusion_step = self.mlp(diffusion_step) # [B, dim] 61 | cond = cond.permute([0, 2, 1]) # [B, T, M] 62 | 63 | seq_len = cond.shape[1] # [T_mel] 64 | time_embed = diffusion_step[:, None, :] # [B, 1, dim] 65 | time_embed = time_embed.repeat([1, seq_len, 1]) # # [B, T, dim] 66 | 67 | 
decoder_inp = torch.cat([x, cond, time_embed], dim=-1) # [B, T, dim + H + dim] 68 | decoder_inp = self.get_decode_inp(decoder_inp) # [B, T, H] 69 | x = decoder_inp 70 | 71 | ''' 72 | Required x: [B, T, C] 73 | :return: [B, T, C] or [L, B, T, C] 74 | ''' 75 | padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask 76 | nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1] 77 | if self.use_pos_embed: 78 | positions = self.pos_embed_alpha * self.embed_positions(x[..., 0]) 79 | x = x + positions 80 | x = F.dropout(x, p=self.dropout, training=self.training) 81 | # B x T x C -> T x B x C 82 | x = x.transpose(0, 1) * nonpadding_mask_TB 83 | hiddens = [] 84 | for layer in self.layers: 85 | x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB 86 | hiddens.append(x) 87 | if self.use_last_norm: 88 | x = self.layer_norm(x) * nonpadding_mask_TB 89 | if return_hiddens: 90 | x = torch.stack(hiddens, 0) # [L, T, B, C] 91 | x = x.transpose(1, 2) # [L, B, T, C] 92 | else: 93 | x = x.transpose(0, 1) # [B, T, C] 94 | 95 | x = self.get_mel_out(x).permute([0, 2, 1]) # [B, 80, T] 96 | return x[:, None, :, :] -------------------------------------------------------------------------------- /usr/diff/net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from math import sqrt 8 | 9 | from .diffusion import Mish 10 | from utils.hparams import hparams 11 | 12 | Linear = nn.Linear 13 | ConvTranspose2d = nn.ConvTranspose2d 14 | 15 | 16 | class AttrDict(dict): 17 | def __init__(self, *args, **kwargs): 18 | super(AttrDict, self).__init__(*args, **kwargs) 19 | self.__dict__ = self 20 | 21 | def override(self, attrs): 22 | if isinstance(attrs, dict): 23 | self.__dict__.update(**attrs) 24 | elif isinstance(attrs, (list, tuple, set)): 25 | for attr in attrs: 26 | self.override(attr) 27 | elif attrs is not None: 28 | raise NotImplementedError 29 | return self 30 | 31 | 32 | class SinusoidalPosEmb(nn.Module): 33 | def __init__(self, dim): 34 | super().__init__() 35 | self.dim = dim 36 | 37 | def forward(self, x): 38 | device = x.device 39 | half_dim = self.dim // 2 40 | emb = math.log(10000) / (half_dim - 1) 41 | emb = torch.exp(torch.arange(half_dim, device=device) * -emb) 42 | emb = x[:, None] * emb[None, :] 43 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 44 | return emb 45 | 46 | 47 | def Conv1d(*args, **kwargs): 48 | layer = nn.Conv1d(*args, **kwargs) 49 | nn.init.kaiming_normal_(layer.weight) 50 | return layer 51 | 52 | 53 | @torch.jit.script 54 | def silu(x): 55 | return x * torch.sigmoid(x) 56 | 57 | 58 | class ResidualBlock(nn.Module): 59 | def __init__(self, encoder_hidden, residual_channels, dilation): 60 | super().__init__() 61 | self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation) 62 | self.diffusion_projection = Linear(residual_channels, residual_channels) 63 | self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1) 64 | self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1) 65 | 66 | def forward(self, x, conditioner, diffusion_step): 67 | diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) 68 | conditioner = self.conditioner_projection(conditioner) 69 | y = x + diffusion_step 70 | 71 | y = self.dilated_conv(y) + conditioner 72 | 73 | gate, filter = 
torch.chunk(y, 2, dim=1) 74 | y = torch.sigmoid(gate) * torch.tanh(filter) 75 | 76 | y = self.output_projection(y) 77 | residual, skip = torch.chunk(y, 2, dim=1) 78 | return (x + residual) / sqrt(2.0), skip 79 | 80 | 81 | class DiffNet(nn.Module): 82 | def __init__(self, in_dims=80): 83 | super().__init__() 84 | self.params = params = AttrDict( 85 | # Model params 86 | encoder_hidden=hparams['hidden_size'], 87 | residual_layers=hparams['residual_layers'], 88 | residual_channels=hparams['residual_channels'], 89 | dilation_cycle_length=hparams['dilation_cycle_length'], 90 | ) 91 | self.input_projection = Conv1d(in_dims, params.residual_channels, 1) 92 | self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels) 93 | dim = params.residual_channels 94 | self.mlp = nn.Sequential( 95 | nn.Linear(dim, dim * 4), 96 | Mish(), 97 | nn.Linear(dim * 4, dim) 98 | ) 99 | self.residual_layers = nn.ModuleList([ 100 | ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length)) 101 | for i in range(params.residual_layers) 102 | ]) 103 | self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1) 104 | self.output_projection = Conv1d(params.residual_channels, in_dims, 1) 105 | nn.init.zeros_(self.output_projection.weight) 106 | 107 | def forward(self, spec, diffusion_step, cond): 108 | """ 109 | 110 | :param spec: [B, 1, M, T] 111 | :param diffusion_step: [B, 1] 112 | :param cond: [B, M, T] 113 | :return: 114 | """ 115 | x = spec[:, 0] 116 | x = self.input_projection(x) # x [B, residual_channel, T] 117 | 118 | x = F.relu(x) 119 | diffusion_step = self.diffusion_embedding(diffusion_step) 120 | diffusion_step = self.mlp(diffusion_step) 121 | skip = [] 122 | for layer_id, layer in enumerate(self.residual_layers): 123 | x, skip_connection = layer(x, cond, diffusion_step) 124 | skip.append(skip_connection) 125 | 126 | x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers)) 127 | x = self.skip_projection(x) 128 | x = F.relu(x) 129 | x = self.output_projection(x) # [B, 80, T] 130 | return x[:, None, :, :] 131 | -------------------------------------------------------------------------------- /usr/diffspeech_task.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import utils 4 | from utils.hparams import hparams 5 | from .diff.net import DiffNet 6 | from .diff.shallow_diffusion_tts import GaussianDiffusion 7 | from .task import DiffFsTask 8 | from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder 9 | from utils.pitch_utils import denorm_f0 10 | from tasks.tts.fs2_utils import FastSpeechDataset 11 | 12 | DIFF_DECODERS = { 13 | 'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']), 14 | } 15 | 16 | 17 | class DiffSpeechTask(DiffFsTask): 18 | def __init__(self): 19 | super(DiffSpeechTask, self).__init__() 20 | self.dataset_cls = FastSpeechDataset 21 | self.vocoder: BaseVocoder = get_vocoder_cls(hparams)() 22 | 23 | def build_tts_model(self): 24 | mel_bins = hparams['audio_num_mel_bins'] 25 | self.model = GaussianDiffusion( 26 | phone_encoder=self.phone_encoder, 27 | out_dims=mel_bins, denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 28 | timesteps=hparams['timesteps'], 29 | K_step=hparams['K_step'], 30 | loss_type=hparams['diff_loss_type'], 31 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 32 | ) 33 | if hparams['fs2_ckpt'] != '': 34 | utils.load_ckpt(self.model.fs2, hparams['fs2_ckpt'], 'model', strict=True) 35 | # 
self.model.fs2.decoder = None 36 | for k, v in self.model.fs2.named_parameters(): 37 | if not 'predictor' in k: 38 | v.requires_grad = False 39 | 40 | def build_optimizer(self, model): 41 | self.optimizer = optimizer = torch.optim.AdamW( 42 | filter(lambda p: p.requires_grad, model.parameters()), 43 | lr=hparams['lr'], 44 | betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), 45 | weight_decay=hparams['weight_decay']) 46 | return optimizer 47 | 48 | def run_model(self, model, sample, return_output=False, infer=False): 49 | txt_tokens = sample['txt_tokens'] # [B, T_t] 50 | target = sample['mels'] # [B, T_s, 80] 51 | # mel2ph = sample['mel2ph'] if hparams['use_gt_dur'] else None # [B, T_s] 52 | mel2ph = sample['mel2ph'] 53 | f0 = sample['f0'] 54 | uv = sample['uv'] 55 | energy = sample['energy'] 56 | # fs2_mel = sample['fs2_mels'] 57 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 58 | if hparams['pitch_type'] == 'cwt': 59 | cwt_spec = sample[f'cwt_spec'] 60 | f0_mean = sample['f0_mean'] 61 | f0_std = sample['f0_std'] 62 | sample['f0_cwt'] = f0 = model.cwt2f0_norm(cwt_spec, f0_mean, f0_std, mel2ph) 63 | 64 | output = model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, 65 | ref_mels=target, f0=f0, uv=uv, energy=energy, infer=infer) 66 | 67 | losses = {} 68 | if 'diff_loss' in output: 69 | losses['mel'] = output['diff_loss'] 70 | self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) 71 | if hparams['use_pitch_embed']: 72 | self.add_pitch_loss(output, sample, losses) 73 | if hparams['use_energy_embed']: 74 | self.add_energy_loss(output['energy_pred'], energy, losses) 75 | if not return_output: 76 | return losses 77 | else: 78 | return losses, output 79 | 80 | def validation_step(self, sample, batch_idx): 81 | outputs = {} 82 | txt_tokens = sample['txt_tokens'] # [B, T_t] 83 | 84 | energy = sample['energy'] 85 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 86 | mel2ph = sample['mel2ph'] 87 | f0 = sample['f0'] 88 | uv = sample['uv'] 89 | 90 | outputs['losses'] = {} 91 | 92 | outputs['losses'], model_out = self.run_model(self.model, sample, return_output=True, infer=False) 93 | 94 | 95 | outputs['total_loss'] = sum(outputs['losses'].values()) 96 | outputs['nsamples'] = sample['nsamples'] 97 | outputs = utils.tensors_to_scalars(outputs) 98 | if batch_idx < hparams['num_valid_plots']: 99 | # model_out = self.model( 100 | # txt_tokens, spk_embed=spk_embed, mel2ph=None, f0=None, uv=None, energy=None, ref_mels=None, infer=True) 101 | # self.plot_mel(batch_idx, model_out['mel_out'], model_out['fs2_mel'], name=f'diffspeech_vs_fs2_{batch_idx}') 102 | model_out = self.model( 103 | txt_tokens, spk_embed=spk_embed, mel2ph=mel2ph, f0=f0, uv=uv, energy=energy, ref_mels=None, infer=True) 104 | gt_f0 = denorm_f0(sample['f0'], sample['uv'], hparams) 105 | self.plot_wav(batch_idx, sample['mels'], model_out['mel_out'], is_mel=True, gt_f0=gt_f0, f0=model_out.get('f0_denorm')) 106 | self.plot_mel(batch_idx, sample['mels'], model_out['mel_out']) 107 | return outputs 108 | 109 | ############ 110 | # validation plots 111 | ############ 112 | def plot_wav(self, batch_idx, gt_wav, wav_out, is_mel=False, gt_f0=None, f0=None, name=None): 113 | gt_wav = gt_wav[0].cpu().numpy() 114 | wav_out = wav_out[0].cpu().numpy() 115 | gt_f0 = gt_f0[0].cpu().numpy() 116 | f0 = f0[0].cpu().numpy() if f0 is not None else None 117 | if is_mel: 118 | gt_wav = self.vocoder.spec2wav(gt_wav, f0=gt_f0) 119 | wav_out = 
self.vocoder.spec2wav(wav_out, f0=f0) 120 | self.logger.experiment.add_audio(f'gt_{batch_idx}', gt_wav, sample_rate=hparams['audio_sample_rate'], global_step=self.global_step) 121 | self.logger.experiment.add_audio(f'wav_{batch_idx}', wav_out, sample_rate=hparams['audio_sample_rate'], global_step=self.global_step) 122 | 123 | -------------------------------------------------------------------------------- /usr/task.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import utils 4 | from .diff.diffusion import GaussianDiffusion 5 | from .diff.net import DiffNet 6 | from tasks.tts.fs2 import FastSpeech2Task 7 | from utils.hparams import hparams 8 | 9 | 10 | DIFF_DECODERS = { 11 | 'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']), 12 | } 13 | 14 | 15 | class DiffFsTask(FastSpeech2Task): 16 | def build_tts_model(self): 17 | mel_bins = hparams['audio_num_mel_bins'] 18 | self.model = GaussianDiffusion( 19 | phone_encoder=self.phone_encoder, 20 | out_dims=mel_bins, denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 21 | timesteps=hparams['timesteps'], 22 | loss_type=hparams['diff_loss_type'], 23 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 24 | ) 25 | 26 | def run_model(self, model, sample, return_output=False, infer=False): 27 | txt_tokens = sample['txt_tokens'] # [B, T_t] 28 | target = sample['mels'] # [B, T_s, 80] 29 | mel2ph = sample['mel2ph'] # [B, T_s] 30 | f0 = sample['f0'] 31 | uv = sample['uv'] 32 | energy = sample['energy'] 33 | spk_embed = sample.get('spk_embed') if not hparams['use_spk_id'] else sample.get('spk_ids') 34 | if hparams['pitch_type'] == 'cwt': 35 | cwt_spec = sample[f'cwt_spec'] 36 | f0_mean = sample['f0_mean'] 37 | f0_std = sample['f0_std'] 38 | sample['f0_cwt'] = f0 = model.cwt2f0_norm(cwt_spec, f0_mean, f0_std, mel2ph) 39 | 40 | output = model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, 41 | ref_mels=target, f0=f0, uv=uv, energy=energy, infer=infer) 42 | 43 | losses = {} 44 | if 'diff_loss' in output: 45 | losses['mel'] = output['diff_loss'] 46 | self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) 47 | if hparams['use_pitch_embed']: 48 | self.add_pitch_loss(output, sample, losses) 49 | if hparams['use_energy_embed']: 50 | self.add_energy_loss(output['energy_pred'], energy, losses) 51 | if not return_output: 52 | return losses 53 | else: 54 | return losses, output 55 | 56 | def _training_step(self, sample, batch_idx, _): 57 | log_outputs = self.run_model(self.model, sample) 58 | total_loss = sum([v for v in log_outputs.values() if isinstance(v, torch.Tensor) and v.requires_grad]) 59 | log_outputs['batch_size'] = sample['txt_tokens'].size()[0] 60 | log_outputs['lr'] = self.scheduler.get_lr()[0] 61 | return total_loss, log_outputs 62 | 63 | def validation_step(self, sample, batch_idx): 64 | outputs = {} 65 | outputs['losses'] = {} 66 | outputs['losses'], model_out = self.run_model(self.model, sample, return_output=True, infer=False) 67 | outputs['total_loss'] = sum(outputs['losses'].values()) 68 | outputs['nsamples'] = sample['nsamples'] 69 | outputs = utils.tensors_to_scalars(outputs) 70 | if batch_idx < hparams['num_valid_plots']: 71 | _, model_out = self.run_model(self.model, sample, return_output=True, infer=True) 72 | self.plot_mel(batch_idx, sample['mels'], model_out['mel_out']) 73 | return outputs 74 | 75 | def build_scheduler(self, optimizer): 76 | return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5) 77 | 78 | def 
optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx): 79 | if optimizer is None: 80 | return 81 | optimizer.step() 82 | optimizer.zero_grad() 83 | if self.scheduler is not None: 84 | self.scheduler.step(self.global_step // hparams['accumulate_grad_batches']) 85 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import matplotlib 3 | 4 | matplotlib.use('Agg') 5 | import librosa 6 | import librosa.filters 7 | import numpy as np 8 | from scipy import signal 9 | from scipy.io import wavfile 10 | 11 | 12 | def save_wav(wav, path, sr, norm=False): 13 | if norm: 14 | wav = wav / np.abs(wav).max() 15 | wav *= 32767 16 | # proposed by @dsmiller 17 | wavfile.write(path, sr, wav.astype(np.int16)) 18 | 19 | 20 | def get_hop_size(hparams): 21 | hop_size = hparams['hop_size'] 22 | if hop_size is None: 23 | assert hparams['frame_shift_ms'] is not None 24 | hop_size = int(hparams['frame_shift_ms'] / 1000 * hparams['audio_sample_rate']) 25 | return hop_size 26 | 27 | 28 | ########################################################################################### 29 | def _stft(y, hparams): 30 | return librosa.stft(y=y, n_fft=hparams['fft_size'], hop_length=get_hop_size(hparams), 31 | win_length=hparams['win_size'], pad_mode='constant') 32 | 33 | 34 | def _istft(y, hparams): 35 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams['win_size']) 36 | 37 | 38 | def librosa_pad_lr(x, fsize, fshift, pad_sides=1): 39 | '''compute right padding (final frame) or both sides padding (first and final frames) 40 | ''' 41 | assert pad_sides in (1, 2) 42 | # return int(fsize // 2) 43 | pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] 44 | if pad_sides == 1: 45 | return 0, pad 46 | else: 47 | return pad // 2, pad // 2 + pad % 2 48 | 49 | 50 | # Conversions 51 | def amp_to_db(x): 52 | return 20 * np.log10(np.maximum(1e-5, x)) 53 | 54 | 55 | def normalize(S, hparams): 56 | return (S - hparams['min_level_db']) / -hparams['min_level_db'] 57 | -------------------------------------------------------------------------------- /utils/cwt.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from pycwt import wavelet 4 | from scipy.interpolate import interp1d 5 | 6 | 7 | def load_wav(wav_file, sr): 8 | wav, _ = librosa.load(wav_file, sr=sr, mono=True) 9 | return wav 10 | 11 | 12 | def convert_continuos_f0(f0): 13 | '''CONVERT F0 TO CONTINUOUS F0 14 | Args: 15 | f0 (ndarray): original f0 sequence with the shape (T) 16 | Return: 17 | (ndarray): continuous f0 with the shape (T) 18 | ''' 19 | # get uv information as binary 20 | f0 = np.copy(f0) 21 | uv = np.float32(f0 != 0) 22 | 23 | # get start and end of f0 24 | if (f0 == 0).all(): 25 | print("| all of the f0 values are 0.") 26 | return uv, f0 27 | start_f0 = f0[f0 != 0][0] 28 | end_f0 = f0[f0 != 0][-1] 29 | 30 | # padding start and end of f0 sequence 31 | start_idx = np.where(f0 == start_f0)[0][0] 32 | end_idx = np.where(f0 == end_f0)[0][-1] 33 | f0[:start_idx] = start_f0 34 | f0[end_idx:] = end_f0 35 | 36 | # get non-zero frame index 37 | nz_frames = np.where(f0 != 0)[0] 38 | 39 | # perform linear interpolation 40 | f = interp1d(nz_frames, f0[nz_frames]) 41 | cont_f0 = f(np.arange(0, f0.shape[0])) 42 | 43 | return uv, cont_f0 44 | 45 | 46 | def get_cont_lf0(f0, frame_period=5.0): 47 | uv, cont_f0_lpf = 
convert_continuos_f0(f0) 48 | # cont_f0_lpf = low_pass_filter(cont_f0_lpf, int(1.0 / (frame_period * 0.001)), cutoff=20) 49 | cont_lf0_lpf = np.log(cont_f0_lpf) 50 | return uv, cont_lf0_lpf 51 | 52 | 53 | def get_lf0_cwt(lf0): 54 | ''' 55 | input: 56 | signal of shape (N) 57 | output: 58 | Wavelet_lf0 of shape(10, N), scales of shape(10) 59 | ''' 60 | mother = wavelet.MexicanHat() 61 | dt = 0.005 62 | dj = 1 63 | s0 = dt * 2 64 | J = 9 65 | 66 | Wavelet_lf0, scales, _, _, _, _ = wavelet.cwt(np.squeeze(lf0), dt, dj, s0, J, mother) 67 | # Wavelet.shape => (J + 1, len(lf0)) 68 | Wavelet_lf0 = np.real(Wavelet_lf0).T 69 | return Wavelet_lf0, scales 70 | 71 | 72 | def norm_scale(Wavelet_lf0): 73 | Wavelet_lf0_norm = np.zeros((Wavelet_lf0.shape[0], Wavelet_lf0.shape[1])) 74 | mean = Wavelet_lf0.mean(0)[None, :] 75 | std = Wavelet_lf0.std(0)[None, :] 76 | Wavelet_lf0_norm = (Wavelet_lf0 - mean) / std 77 | return Wavelet_lf0_norm, mean, std 78 | 79 | 80 | def normalize_cwt_lf0(f0, mean, std): 81 | uv, cont_lf0_lpf = get_cont_lf0(f0) 82 | cont_lf0_norm = (cont_lf0_lpf - mean) / std 83 | Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_norm) 84 | Wavelet_lf0_norm, _, _ = norm_scale(Wavelet_lf0) 85 | 86 | return Wavelet_lf0_norm 87 | 88 | 89 | def get_lf0_cwt_norm(f0s, mean, std): 90 | uvs = list() 91 | cont_lf0_lpfs = list() 92 | cont_lf0_lpf_norms = list() 93 | Wavelet_lf0s = list() 94 | Wavelet_lf0s_norm = list() 95 | scaless = list() 96 | 97 | means = list() 98 | stds = list() 99 | for f0 in f0s: 100 | uv, cont_lf0_lpf = get_cont_lf0(f0) 101 | cont_lf0_lpf_norm = (cont_lf0_lpf - mean) / std 102 | 103 | Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm) # [560,10] 104 | Wavelet_lf0_norm, mean_scale, std_scale = norm_scale(Wavelet_lf0) # [560,10],[1,10],[1,10] 105 | 106 | Wavelet_lf0s_norm.append(Wavelet_lf0_norm) 107 | uvs.append(uv) 108 | cont_lf0_lpfs.append(cont_lf0_lpf) 109 | cont_lf0_lpf_norms.append(cont_lf0_lpf_norm) 110 | Wavelet_lf0s.append(Wavelet_lf0) 111 | scaless.append(scales) 112 | means.append(mean_scale) 113 | stds.append(std_scale) 114 | 115 | return Wavelet_lf0s_norm, scaless, means, stds 116 | 117 | 118 | def inverse_cwt_torch(Wavelet_lf0, scales): 119 | import torch 120 | b = ((torch.arange(0, len(scales)).float().to(Wavelet_lf0.device)[None, None, :] + 1 + 2.5) ** (-2.5)) 121 | lf0_rec = Wavelet_lf0 * b 122 | lf0_rec_sum = lf0_rec.sum(-1) 123 | lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdim=True)) / lf0_rec_sum.std(-1, keepdim=True) 124 | return lf0_rec_sum 125 | 126 | 127 | def inverse_cwt(Wavelet_lf0, scales): 128 | b = ((np.arange(0, len(scales))[None, None, :] + 1 + 2.5) ** (-2.5)) 129 | lf0_rec = Wavelet_lf0 * b 130 | lf0_rec_sum = lf0_rec.sum(-1) 131 | lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdims=True)) / lf0_rec_sum.std(-1, keepdims=True) 132 | return lf0_rec_sum 133 | 134 | 135 | def cwt2f0(cwt_spec, mean, std, cwt_scales): 136 | assert len(mean.shape) == 1 and len(std.shape) == 1 and len(cwt_spec.shape) == 3 137 | import torch 138 | if isinstance(cwt_spec, torch.Tensor): 139 | f0 = inverse_cwt_torch(cwt_spec, cwt_scales) 140 | f0 = f0 * std[:, None] + mean[:, None] 141 | f0 = f0.exp() # [B, T] 142 | else: 143 | f0 = inverse_cwt(cwt_spec, cwt_scales) 144 | f0 = f0 * std[:, None] + mean[:, None] 145 | f0 = np.exp(f0) # [B, T] 146 | return f0 147 | -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import os 3 | import yaml 4 | 5 | global_print_hparams = True 6 | hparams = {} 7 | 8 | 9 | class Args: 10 | def __init__(self, **kwargs): 11 | for k, v in kwargs.items(): 12 | self.__setattr__(k, v) 13 | 14 | 15 | def override_config(old_config: dict, new_config: dict): 16 | for k, v in new_config.items(): 17 | if isinstance(v, dict) and k in old_config: 18 | override_config(old_config[k], new_config[k]) 19 | else: 20 | old_config[k] = v 21 | 22 | 23 | def set_hparams(config='', exp_name='', hparams_str='', print_hparams=True, global_hparams=True): 24 | if config == '': 25 | parser = argparse.ArgumentParser(description='neural music') 26 | parser.add_argument('--config', type=str, default='', 27 | help='location of the data corpus') 28 | parser.add_argument('--exp_name', type=str, default='', help='exp_name') 29 | parser.add_argument('--hparams', type=str, default='', 30 | help='location of the data corpus') 31 | parser.add_argument('--infer', action='store_true', help='infer') 32 | parser.add_argument('--validate', action='store_true', help='validate') 33 | parser.add_argument('--reset', action='store_true', help='reset hparams') 34 | parser.add_argument('--debug', action='store_true', help='debug') 35 | args, unknown = parser.parse_known_args() 36 | else: 37 | args = Args(config=config, exp_name=exp_name, hparams=hparams_str, 38 | infer=False, validate=False, reset=False, debug=False) 39 | args_work_dir = '' 40 | if args.exp_name != '': 41 | args.work_dir = args.exp_name 42 | args_work_dir = f'checkpoints/{args.work_dir}' 43 | 44 | config_chains = [] 45 | loaded_config = set() 46 | 47 | def load_config(config_fn): # deep first 48 | with open(config_fn) as f: 49 | hparams_ = yaml.safe_load(f) 50 | loaded_config.add(config_fn) 51 | if 'base_config' in hparams_: 52 | ret_hparams = {} 53 | if not isinstance(hparams_['base_config'], list): 54 | hparams_['base_config'] = [hparams_['base_config']] 55 | for c in hparams_['base_config']: 56 | if c not in loaded_config: 57 | if c.startswith('.'): 58 | c = f'{os.path.dirname(config_fn)}/{c}' 59 | c = os.path.normpath(c) 60 | override_config(ret_hparams, load_config(c)) 61 | override_config(ret_hparams, hparams_) 62 | else: 63 | ret_hparams = hparams_ 64 | config_chains.append(config_fn) 65 | return ret_hparams 66 | 67 | global hparams 68 | assert args.config != '' or args_work_dir != '' 69 | saved_hparams = {} 70 | if args_work_dir != 'checkpoints/': 71 | ckpt_config_path = f'{args_work_dir}/config.yaml' 72 | if os.path.exists(ckpt_config_path): 73 | try: 74 | with open(ckpt_config_path) as f: 75 | saved_hparams.update(yaml.safe_load(f)) 76 | except: 77 | pass 78 | if args.config == '': 79 | args.config = ckpt_config_path 80 | 81 | hparams_ = {} 82 | 83 | hparams_.update(load_config(args.config)) 84 | 85 | if not args.reset: 86 | hparams_.update(saved_hparams) 87 | hparams_['work_dir'] = args_work_dir 88 | 89 | if args.hparams != "": 90 | for new_hparam in args.hparams.split(","): 91 | k, v = new_hparam.split("=") 92 | if v in ['True', 'False'] or type(hparams_[k]) == bool: 93 | hparams_[k] = eval(v) 94 | else: 95 | hparams_[k] = type(hparams_[k])(v) 96 | 97 | if args_work_dir != '' and (not os.path.exists(ckpt_config_path) or args.reset) and not args.infer: 98 | os.makedirs(hparams_['work_dir'], exist_ok=True) 99 | with open(ckpt_config_path, 'w') as f: 100 | yaml.safe_dump(hparams_, f) 101 | 102 | hparams_['infer'] = args.infer 103 | hparams_['debug'] = args.debug 104 | hparams_['validate'] = args.validate 105 | global global_print_hparams 106 | 
if global_hparams: 107 | hparams.clear() 108 | hparams.update(hparams_) 109 | 110 | if print_hparams and global_print_hparams and global_hparams: 111 | print('| Hparams chains: ', config_chains) 112 | print('| Hparams: ') 113 | for i, (k, v) in enumerate(sorted(hparams_.items())): 114 | print(f"\033[;33;m{k}\033[0m: {v}, ", end="\n" if i % 5 == 4 else "") 115 | print("") 116 | global_print_hparams = False 117 | # print(hparams_.keys()) 118 | if hparams.get('exp_name') is None: 119 | hparams['exp_name'] = args.exp_name 120 | if hparams_.get('exp_name') is None: 121 | hparams_['exp_name'] = args.exp_name 122 | return hparams_ 123 | -------------------------------------------------------------------------------- /utils/indexed_datasets.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | 7 | class IndexedDataset: 8 | def __init__(self, path, num_cache=1): 9 | super().__init__() 10 | self.path = path 11 | self.data_file = None 12 | self.data_offsets = np.load(f"{path}.idx", allow_pickle=True).item()['offsets'] 13 | self.data_file = open(f"{path}.data", 'rb', buffering=-1) 14 | self.cache = [] 15 | self.num_cache = num_cache 16 | 17 | def check_index(self, i): 18 | if i < 0 or i >= len(self.data_offsets) - 1: 19 | raise IndexError('index out of range') 20 | 21 | def __del__(self): 22 | if self.data_file: 23 | self.data_file.close() 24 | 25 | def __getitem__(self, i): 26 | self.check_index(i) 27 | if self.num_cache > 0: 28 | for c in self.cache: 29 | if c[0] == i: 30 | return c[1] 31 | self.data_file.seek(self.data_offsets[i]) 32 | b = self.data_file.read(self.data_offsets[i + 1] - self.data_offsets[i]) 33 | item = pickle.loads(b) 34 | if self.num_cache > 0: 35 | self.cache = [(i, deepcopy(item))] + self.cache[:-1] 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.data_offsets) - 1 40 | 41 | class IndexedDatasetBuilder: 42 | def __init__(self, path): 43 | self.path = path 44 | self.out_file = open(f"{path}.data", 'wb') 45 | self.byte_offsets = [0] 46 | 47 | def add_item(self, item): 48 | s = pickle.dumps(item) 49 | bytes = self.out_file.write(s) 50 | self.byte_offsets.append(self.byte_offsets[-1] + bytes) 51 | 52 | def finalize(self): 53 | self.out_file.close() 54 | np.save(open(f"{self.path}.idx", 'wb'), {'offsets': self.byte_offsets}) 55 | 56 | 57 | if __name__ == "__main__": 58 | import random 59 | from tqdm import tqdm 60 | ds_path = '/tmp/indexed_ds_example' 61 | size = 100 62 | items = [{"a": np.random.normal(size=[10000, 10]), 63 | "b": np.random.normal(size=[10000, 10])} for i in range(size)] 64 | builder = IndexedDatasetBuilder(ds_path) 65 | for i in tqdm(range(size)): 66 | builder.add_item(items[i]) 67 | builder.finalize() 68 | ds = IndexedDataset(ds_path) 69 | for i in tqdm(range(10000)): 70 | idx = random.randint(0, size - 1) 71 | assert (ds[idx]['a'] == items[idx]['a']).all() 72 | -------------------------------------------------------------------------------- /utils/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from multiprocessing import Queue, Process 4 | 5 | 6 | def chunked_worker(worker_id, map_func, args, results_queue=None, init_ctx_func=None): 7 | ctx = init_ctx_func(worker_id) if init_ctx_func is not None else None 8 | for job_idx, arg in args: 9 | try: 10 | if ctx is not None: 11 | res = map_func(*arg, ctx=ctx) 12 | else: 13 | res = map_func(*arg) 
14 | results_queue.put((job_idx, res)) 15 | except: 16 | traceback.print_exc() 17 | results_queue.put((job_idx, None)) 18 | 19 | def chunked_multiprocess_run(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, q_max_size=1000): 20 | args = zip(range(len(args)), args) 21 | args = list(args) 22 | n_jobs = len(args) 23 | if num_workers is None: 24 | num_workers = int(os.getenv('N_PROC', os.cpu_count())) 25 | results_queues = [] 26 | if ordered: 27 | for i in range(num_workers): 28 | results_queues.append(Queue(maxsize=q_max_size // num_workers)) 29 | else: 30 | results_queue = Queue(maxsize=q_max_size) 31 | for i in range(num_workers): 32 | results_queues.append(results_queue) 33 | workers = [] 34 | for i in range(num_workers): 35 | args_worker = args[i::num_workers] 36 | p = Process(target=chunked_worker, args=( 37 | i, map_func, args_worker, results_queues[i], init_ctx_func), daemon=True) 38 | workers.append(p) 39 | p.start() 40 | for n_finished in range(n_jobs): 41 | results_queue = results_queues[n_finished % num_workers] 42 | job_idx, res = results_queue.get() 43 | assert job_idx == n_finished or not ordered, (job_idx, n_finished) 44 | yield res 45 | for w in workers: 46 | w.join() 47 | w.close() 48 | -------------------------------------------------------------------------------- /utils/pitch_utils.py: -------------------------------------------------------------------------------- 1 | ######### 2 | # world 3 | ########## 4 | import librosa 5 | import numpy as np 6 | import torch 7 | 8 | gamma = 0 9 | mcepInput = 3 # 0 for dB, 3 for magnitude 10 | alpha = 0.45 11 | en_floor = 10 ** (-80 / 20) 12 | FFT_SIZE = 2048 13 | 14 | 15 | f0_bin = 256 16 | f0_max = 1100.0 17 | f0_min = 50.0 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 20 | 21 | 22 | def f0_to_coarse(f0): 23 | is_torch = isinstance(f0, torch.Tensor) 24 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 25 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 30 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) 31 | return f0_coarse 32 | 33 | 34 | def norm_f0(f0, uv, hparams): 35 | is_torch = isinstance(f0, torch.Tensor) 36 | if hparams['pitch_norm'] == 'standard': 37 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 38 | if hparams['pitch_norm'] == 'log': 39 | f0 = torch.log2(f0) if is_torch else np.log2(f0) 40 | if uv is not None and hparams['use_uv']: 41 | f0[uv > 0] = 0 42 | return f0 43 | 44 | 45 | def norm_interp_f0(f0, hparams): 46 | is_torch = isinstance(f0, torch.Tensor) 47 | if is_torch: 48 | device = f0.device 49 | f0 = f0.data.cpu().numpy() 50 | uv = f0 == 0 51 | f0 = norm_f0(f0, uv, hparams) 52 | if sum(uv) == len(f0): 53 | f0[uv] = 0 54 | elif sum(uv) > 0: 55 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 56 | uv = torch.FloatTensor(uv) 57 | f0 = torch.FloatTensor(f0) 58 | if is_torch: 59 | f0 = f0.to(device) 60 | return f0, uv 61 | 62 | 63 | def denorm_f0(f0, uv, hparams, pitch_padding=None, min=None, max=None): 64 | if hparams['pitch_norm'] == 'standard': 65 | f0 = f0 * hparams['f0_std'] + hparams['f0_mean'] 66 | if hparams['pitch_norm'] == 'log': 67 | f0 = 2 ** f0 68 | if min is not None: 69 | f0 = f0.clamp(min=min) 70 | if max is not 
None: 71 | f0 = f0.clamp(max=max) 72 | if uv is not None and hparams['use_uv']: 73 | f0[uv > 0] = 0 74 | if pitch_padding is not None: 75 | f0[pitch_padding] = 0 76 | return f0 77 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | 5 | LINE_COLORS = ['w', 'r', 'y', 'cyan', 'm', 'b', 'lime'] 6 | 7 | 8 | def spec_to_figure(spec, vmin=None, vmax=None): 9 | if isinstance(spec, torch.Tensor): 10 | spec = spec.cpu().numpy() 11 | fig = plt.figure(figsize=(12, 6)) 12 | plt.pcolor(spec.T, vmin=vmin, vmax=vmax) 13 | return fig 14 | 15 | 16 | def spec_f0_to_figure(spec, f0s, figsize=None): 17 | max_y = spec.shape[1] 18 | if isinstance(spec, torch.Tensor): 19 | spec = spec.detach().cpu().numpy() 20 | f0s = {k: f0.detach().cpu().numpy() for k, f0 in f0s.items()} 21 | f0s = {k: f0 / 10 for k, f0 in f0s.items()} 22 | fig = plt.figure(figsize=(12, 6) if figsize is None else figsize) 23 | plt.pcolor(spec.T) 24 | for i, (k, f0) in enumerate(f0s.items()): 25 | plt.plot(f0.clip(0, max_y), label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.8) 26 | plt.legend() 27 | return fig 28 | 29 | 30 | def dur_to_figure(dur_gt, dur_pred, txt): 31 | dur_gt = dur_gt.long().cpu().numpy() 32 | dur_pred = dur_pred.long().cpu().numpy() 33 | dur_gt = np.cumsum(dur_gt) 34 | dur_pred = np.cumsum(dur_pred) 35 | fig = plt.figure(figsize=(12, 6)) 36 | for i in range(len(dur_gt)): 37 | shift = (i % 8) + 1 38 | plt.text(dur_gt[i], shift, txt[i]) 39 | plt.text(dur_pred[i], 10 + shift, txt[i]) 40 | plt.vlines(dur_gt[i], 0, 10, colors='b') # blue is gt 41 | plt.vlines(dur_pred[i], 10, 20, colors='r') # red is pred 42 | return fig 43 | 44 | 45 | def f0_to_figure(f0_gt, f0_cwt=None, f0_pred=None): 46 | fig = plt.figure() 47 | f0_gt = f0_gt.cpu().numpy() 48 | plt.plot(f0_gt, color='r', label='gt') 49 | if f0_cwt is not None: 50 | f0_cwt = f0_cwt.cpu().numpy() 51 | plt.plot(f0_cwt, color='b', label='cwt') 52 | if f0_pred is not None: 53 | f0_pred = f0_pred.cpu().numpy() 54 | plt.plot(f0_pred, color='green', label='pred') 55 | plt.legend() 56 | return fig 57 | -------------------------------------------------------------------------------- /utils/training_utils.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import hparams 2 | 3 | 4 | class RSQRTSchedule(object): 5 | def __init__(self, optimizer): 6 | super().__init__() 7 | self.optimizer = optimizer 8 | self.constant_lr = hparams['lr'] 9 | self.warmup_updates = hparams['warmup_updates'] 10 | self.hidden_size = hparams['hidden_size'] 11 | self.lr = hparams['lr'] 12 | for param_group in optimizer.param_groups: 13 | param_group['lr'] = self.lr 14 | self.step(0) 15 | 16 | def step(self, num_updates): 17 | constant_lr = self.constant_lr 18 | warmup = min(num_updates / self.warmup_updates, 1.0) 19 | rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 20 | rsqrt_hidden = self.hidden_size ** -0.5 21 | self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) 22 | for param_group in self.optimizer.param_groups: 23 | param_group['lr'] = self.lr 24 | return self.lr 25 | 26 | def get_lr(self): 27 | return self.optimizer.param_groups[0]['lr'] 28 | -------------------------------------------------------------------------------- /utils/tts_utils.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from collections import defaultdict 4 | 5 | 6 | def make_positions(tensor, padding_idx): 7 | """Replace non-padding symbols with their position numbers. 8 | Position numbers begin at padding_idx+1. Padding symbols are ignored. 9 | """ 10 | # The series of casts and type-conversions here are carefully 11 | # balanced to both work with ONNX export and XLA. In particular XLA 12 | # prefers ints, cumsum defaults to output longs, and ONNX doesn't know 13 | # how to handle the dtype kwarg in cumsum. 14 | mask = tensor.ne(padding_idx).int() 15 | return ( 16 | torch.cumsum(mask, dim=1).type_as(mask) * mask 17 | ).long() + padding_idx 18 | 19 | 20 | def softmax(x, dim): 21 | return F.softmax(x, dim=dim, dtype=torch.float32) 22 | -------------------------------------------------------------------------------- /vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from vocoders import hifigan 2 | -------------------------------------------------------------------------------- /vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from modules.hifigan.hifigan import HifiGanGenerator 11 | from utils.hparams import hparams, set_hparams 12 | from vocoders.base_vocoder import register_vocoder 13 | from vocoders.pwg import PWG 14 | from vocoders.vocoder_utils import denoise 15 | 16 | 17 | def load_model(config_path, checkpoint_path): 18 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | state = ckpt_dict["state_dict"]["model_gen"] 23 | elif '.json' in config_path: 24 | config = json.load(open(config_path, 'r')) 25 | state = ckpt_dict["generator"] 26 | 27 | model = HifiGanGenerator(config) 28 | model.load_state_dict(state, strict=True) 29 | model.remove_weight_norm() 30 | model = model.eval().to(device) 31 | print(f"| Loaded model parameters from {checkpoint_path}.") 32 | print(f"| HifiGAN device: {device}.") 33 | return model, config, device 34 | 35 | 36 | total_time = 0 37 | 38 | 39 | @register_vocoder 40 | class HifiGAN(PWG): 41 | def 
__init__(self): 42 | base_dir = hparams['vocoder_ckpt'] 43 | config_path = f'{base_dir}/config.yaml' 44 | if os.path.exists(config_path): 45 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 46 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 47 | print('| load HifiGAN: ', ckpt) 48 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 49 | else: 50 | config_path = f'{base_dir}/config.json' 51 | ckpt = f'{base_dir}/generator_v1' 52 | if os.path.exists(config_path): 53 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 54 | 55 | def spec2wav(self, mel, **kwargs): 56 | device = self.device 57 | with torch.no_grad(): 58 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 59 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 60 | f0 = kwargs.get('f0') 61 | if f0 is not None and hparams.get('use_nsf'): 62 | f0 = torch.FloatTensor(f0[None, :]).to(device) 63 | y = self.model(c, f0).view(-1) 64 | else: 65 | y = self.model(c).view(-1) 66 | wav_out = y.cpu().numpy() 67 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 68 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 69 | return wav_out 70 | 71 | # @staticmethod 72 | # def wav2spec(wav_fn, **kwargs): 73 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 74 | # wav_torch = torch.FloatTensor(wav)[None, :] 75 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 76 | # return wav, mel.T 77 | -------------------------------------------------------------------------------- /vocoders/pwg.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import librosa 4 | import torch 5 | import yaml 6 | from sklearn.preprocessing import StandardScaler 7 | from torch import nn 8 | from modules.parallel_wavegan.models import ParallelWaveGANGenerator 9 | from modules.parallel_wavegan.utils import read_hdf5 10 | from utils.hparams import hparams 11 | from utils.pitch_utils import f0_to_coarse 12 | from vocoders.base_vocoder import BaseVocoder, register_vocoder 13 | import numpy as np 14 | 15 | 16 | def load_pwg_model(config_path, checkpoint_path, stats_path): 17 | # load config 18 | with open(config_path) as f: 19 | config = yaml.load(f, Loader=yaml.Loader) 20 | 21 | # setup 22 | if torch.cuda.is_available(): 23 | device = torch.device("cuda") 24 | else: 25 | device = torch.device("cpu") 26 | model = ParallelWaveGANGenerator(**config["generator_params"]) 27 | 28 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 29 | if 'state_dict' not in ckpt_dict: # official vocoder 30 | model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"]["generator"]) 31 | scaler = StandardScaler() 32 | if config["format"] == "hdf5": 33 | scaler.mean_ = read_hdf5(stats_path, "mean") 34 | scaler.scale_ = read_hdf5(stats_path, "scale") 35 | elif config["format"] == "npy": 36 | scaler.mean_ = np.load(stats_path)[0] 37 | scaler.scale_ = np.load(stats_path)[1] 38 | else: 39 | raise ValueError("support only hdf5 or npy format.") 40 | else: # custom PWG vocoder 41 | fake_task = nn.Module() 42 | fake_task.model_gen = model 43 | fake_task.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["state_dict"], strict=False) 44 | scaler = None 45 | 46 | model.remove_weight_norm() 47 | model = model.eval().to(device) 48 | print(f"| Loaded model parameters from {checkpoint_path}.") 49 | 
print(f"| PWG device: {device}.") 50 | return model, scaler, config, device 51 | 52 | 53 | @register_vocoder 54 | class PWG(BaseVocoder): 55 | def __init__(self): 56 | if hparams['vocoder_ckpt'] == '': # load LJSpeech PWG pretrained model 57 | base_dir = 'wavegan_pretrained' 58 | ckpts = glob.glob(f'{base_dir}/checkpoint-*steps.pkl') 59 | ckpt = sorted(ckpts, key= 60 | lambda x: int(re.findall(f'{base_dir}/checkpoint-(\d+)steps.pkl', x)[0]))[-1] 61 | config_path = f'{base_dir}/config.yaml' 62 | print('| load PWG: ', ckpt) 63 | self.model, self.scaler, self.config, self.device = load_pwg_model( 64 | config_path=config_path, 65 | checkpoint_path=ckpt, 66 | stats_path=f'{base_dir}/stats.h5', 67 | ) 68 | else: 69 | base_dir = hparams['vocoder_ckpt'] 70 | print(base_dir) 71 | config_path = f'{base_dir}/config.yaml' 72 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 73 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 74 | print('| load PWG: ', ckpt) 75 | self.scaler = None 76 | self.model, _, self.config, self.device = load_pwg_model( 77 | config_path=config_path, 78 | checkpoint_path=ckpt, 79 | stats_path=f'{base_dir}/stats.h5', 80 | ) 81 | 82 | def spec2wav(self, mel, **kwargs): 83 | # start generation 84 | config = self.config 85 | device = self.device 86 | pad_size = (config["generator_params"]["aux_context_window"], 87 | config["generator_params"]["aux_context_window"]) 88 | c = mel 89 | if self.scaler is not None: 90 | c = self.scaler.transform(c) 91 | 92 | with torch.no_grad(): 93 | z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device) 94 | c = np.pad(c, (pad_size, (0, 0)), "edge") 95 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 96 | p = kwargs.get('f0') 97 | if p is not None: 98 | p = f0_to_coarse(p) 99 | p = np.pad(p, (pad_size,), "edge") 100 | p = torch.LongTensor(p[None, :]).to(device) 101 | y = self.model(z, c, p).view(-1) 102 | wav_out = y.cpu().numpy() 103 | return wav_out 104 | 105 | @staticmethod 106 | def wav2spec(wav_fn, return_linear=False): 107 | from data_gen.tts.data_gen_utils import process_utterance 108 | res = process_utterance( 109 | wav_fn, fft_size=hparams['fft_size'], 110 | hop_size=hparams['hop_size'], 111 | win_length=hparams['win_size'], 112 | num_mels=hparams['audio_num_mel_bins'], 113 | fmin=hparams['fmin'], 114 | fmax=hparams['fmax'], 115 | sample_rate=hparams['audio_sample_rate'], 116 | loud_norm=hparams['loud_norm'], 117 | min_level_db=hparams['min_level_db'], 118 | return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10))) 119 | if return_linear: 120 | return res[0], res[1].T, res[2].T # [T, 80], [T, n_fft] 121 | else: 122 | return res[0], res[1].T 123 | 124 | @staticmethod 125 | def wav2mfcc(wav_fn): 126 | fft_size = hparams['fft_size'] 127 | hop_size = hparams['hop_size'] 128 | win_length = hparams['win_size'] 129 | sample_rate = hparams['audio_sample_rate'] 130 | wav, _ = librosa.core.load(wav_fn, sr=sample_rate) 131 | mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, 132 | n_fft=fft_size, hop_length=hop_size, 133 | win_length=win_length, pad_mode="constant", power=1.0) 134 | mfcc_delta = librosa.feature.delta(mfcc, order=1) 135 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 136 | mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T 137 | return mfcc 138 | -------------------------------------------------------------------------------- /vocoders/vocoder_utils.py: 
-------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | from utils.hparams import hparams 4 | import numpy as np 5 | 6 | 7 | def denoise(wav, v=0.1): 8 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 9 | win_length=hparams['win_size'], pad_mode='constant') 10 | spec_m = np.abs(spec) 11 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 12 | spec_a = np.angle(spec) 13 | 14 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 15 | win_length=hparams['win_size']) 16 | --------------------------------------------------------------------------------
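A minimal, hypothetical usage sketch for vocoders.vocoder_utils.denoise, appended here for illustration only: it assumes the script is run from the repository root with the project's dependencies installed (librosa, numpy, pyyaml, plus whatever vocoders/__init__.py pulls in), and the STFT settings and the denoise strength below are illustrative values, not the project defaults. Because utils.hparams exposes hparams as a shared module-level dict, mutating it in place is enough for denoise() to pick up the values.

import numpy as np

from utils.hparams import hparams
from vocoders.vocoder_utils import denoise

# denoise() reads these three keys from the shared hparams dict (illustrative values).
hparams.update({'fft_size': 1024, 'hop_size': 256, 'win_size': 1024})

sr = 22050  # illustrative sample rate
t = np.arange(sr, dtype=np.float32) / sr
wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)              # stand-in "vocoder output"
wav += 0.01 * np.random.randn(sr).astype(np.float32)   # add a small noise floor

# Subtract a constant from the magnitude spectrogram and resynthesize with the
# original phase; a larger v removes more of the noise floor but can dull the signal.
wav_denoised = denoise(wav, v=0.01)
print(wav.shape, wav_denoised.shape)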