├── LICENSE ├── README.md ├── README_ZH.md ├── app.py ├── colab.ipynb ├── configs ├── base.yaml ├── singers │ ├── singer0001.npy │ ├── singer0002.npy │ ├── singer0003.npy │ ├── singer0004.npy │ ├── singer0005.npy │ ├── singer0006.npy │ ├── singer0007.npy │ ├── singer0008.npy │ ├── singer0009.npy │ ├── singer0010.npy │ ├── singer0011.npy │ ├── singer0012.npy │ ├── singer0013.npy │ ├── singer0014.npy │ ├── singer0015.npy │ ├── singer0016.npy │ ├── singer0017.npy │ ├── singer0018.npy │ ├── singer0019.npy │ ├── singer0020.npy │ ├── singer0021.npy │ ├── singer0022.npy │ ├── singer0023.npy │ ├── singer0024.npy │ ├── singer0025.npy │ ├── singer0026.npy │ ├── singer0027.npy │ ├── singer0028.npy │ ├── singer0029.npy │ ├── singer0030.npy │ ├── singer0031.npy │ ├── singer0032.npy │ ├── singer0033.npy │ ├── singer0034.npy │ ├── singer0035.npy │ ├── singer0036.npy │ ├── singer0037.npy │ ├── singer0038.npy │ ├── singer0039.npy │ ├── singer0040.npy │ ├── singer0041.npy │ ├── singer0042.npy │ ├── singer0043.npy │ ├── singer0044.npy │ ├── singer0045.npy │ ├── singer0046.npy │ ├── singer0047.npy │ ├── singer0048.npy │ ├── singer0049.npy │ ├── singer0050.npy │ ├── singer0051.npy │ ├── singer0052.npy │ ├── singer0053.npy │ ├── singer0054.npy │ ├── singer0055.npy │ └── singer0056.npy └── singers_sample │ ├── 22-wave-girl │ ├── 031.wav │ ├── 032.wav │ ├── 033.wav │ ├── 034.wav │ └── 035.wav │ ├── 30-wave-boy │ ├── 010.wav │ ├── 011.wav │ ├── 012.wav │ ├── 013.wav │ ├── 014.wav │ └── 015.wav │ ├── 47-wave-girl │ ├── 020.wav │ ├── 021.wav │ ├── 022.wav │ ├── 023.wav │ ├── 024.wav │ └── 025.wav │ └── 51-wave-boy │ ├── 006.wav │ ├── 007.wav │ ├── 008.wav │ ├── 009.wav │ └── 010.wav ├── crepe ├── LICENSE.txt ├── README.md ├── __init__.py ├── __main__.py ├── assets │ └── tiny.pth ├── convert.py ├── core.py ├── decode.py ├── filter.py ├── load.py ├── loudness.py ├── model.py └── threshold.py ├── environment.yml ├── feature_retrieval ├── __init__.py ├── index.py ├── retrieval.py ├── train.py └── transform.py ├── hubert ├── LICENSE.txt ├── __init__.py ├── hubert_model.py └── inference.py ├── hubert_pretrain └── README.md ├── pitch ├── __init__.py ├── core │ ├── LICENCE │ ├── README.md │ ├── __init__.py │ ├── pyin.py │ ├── salience.py │ ├── swipe.py │ ├── swipe_slim.py │ ├── utils.py │ └── yin.py ├── debug.py └── inference.py ├── prepare ├── preprocess_a.py ├── preprocess_cdc.py ├── preprocess_crepe.py ├── preprocess_f0.py ├── preprocess_f0_mouth.py ├── preprocess_hubert.py ├── preprocess_ppg.py ├── preprocess_random.py ├── preprocess_speaker.py ├── preprocess_speaker_ave.py ├── preprocess_spec.py ├── preprocess_train.py ├── preprocess_trim.py └── preprocess_zzz.py ├── requirements.txt ├── speaker ├── README.md ├── __init__.py ├── config.py ├── infer.py ├── models │ ├── __init__.py │ ├── lstm.py │ └── resnet.py ├── umap.png └── utils │ ├── __init__.py │ ├── audio.py │ ├── coqpit.py │ ├── io.py │ └── shared_configs.py ├── speaker_pretrain ├── README.md └── config.json ├── svc_eva.py ├── svc_export.py ├── svc_inference.py ├── svc_inference_batch.py ├── svc_inference_post.py ├── svc_inference_shift.py ├── svc_merge.py ├── svc_preprocessing.py ├── svc_train_retrieval.py ├── svc_trainer.py ├── test.wav ├── vad ├── LICENSE ├── assets │ └── silero_vad.jit └── utils.py ├── vits ├── LICENSE ├── __init__.py ├── attentions.py ├── commons.py ├── data_utils.py ├── losses.py ├── models.py ├── modules.py ├── modules_grl.py ├── spectrogram.py └── utils.py ├── vits_decoder ├── LICENSE.txt ├── __init__.py ├── alias │ ├── LICENSE-alias.txt 
│ ├── LICENSE-snake.txt │ ├── __init__.py │ ├── act.py │ ├── filter.py │ └── resample.py ├── bigv.py ├── discriminator.py ├── generator.py ├── med.py ├── mpd.py ├── mrd.py ├── msd.py └── nsf.py ├── vits_extend ├── __init__.py ├── dataloader.py ├── plotting.py ├── stft.py ├── stft_loss.py ├── train.py ├── validation.py └── writer.py ├── vits_pretrain └── README.md ├── whisper ├── LICENSE ├── README.md ├── __init__.py ├── audio.py ├── decoding.py ├── inference.py ├── model.py ├── tokenizer.py └── utils.py └── whisper_pretrain └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 PlayVoice 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/base.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | model: "sovits" 3 | seed: 1234 4 | epochs: 10000 5 | learning_rate: 5e-5 6 | betas: [0.8, 0.99] 7 | lr_decay: 0.999875 8 | eps: 1e-9 9 | batch_size: 8 10 | accum_step: 2 11 | c_stft: 9 12 | c_mel: 1. 
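# NOTE (assumption from the option names): c_stft, c_mel and c_kl act as relative
# weights for the multi-resolution STFT, mel-spectrogram and KL-divergence loss terms.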
13 | c_kl: 0.2 14 | port: 8001 15 | pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" 16 | ############################# 17 | data: 18 | training_files: "files/train.txt" 19 | validation_files: "files/valid.txt" 20 | segment_size: 8000 # WARNING: base on hop_length 21 | max_wav_value: 32768.0 22 | sampling_rate: 32000 23 | filter_length: 1024 24 | hop_length: 320 25 | win_length: 1024 26 | mel_channels: 100 27 | mel_fmin: 50.0 28 | mel_fmax: 16000.0 29 | ############################# 30 | vits: 31 | ppg_dim: 1280 32 | vec_dim: 256 33 | spk_dim: 256 34 | gin_channels: 256 35 | inter_channels: 192 36 | hidden_channels: 192 37 | filter_channels: 640 38 | ############################# 39 | gen: 40 | upsample_input: 192 41 | upsample_rates: [5,4,4,2,2] 42 | upsample_kernel_sizes: [15,8,8,4,4] 43 | upsample_initial_channel: 320 44 | resblock_kernel_sizes: [3,7,11] 45 | resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] 46 | ############################# 47 | mpd: 48 | periods: [2,3,5,7,11] 49 | kernel_size: 5 50 | stride: 3 51 | use_spectral_norm: False 52 | lReLU_slope: 0.2 53 | ############################# 54 | mrd: 55 | resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length) 56 | use_spectral_norm: False 57 | lReLU_slope: 0.2 58 | ############################# 59 | log: 60 | info_interval: 100 61 | eval_interval: 1 62 | save_interval: 5 63 | num_audio: 6 64 | pth_dir: 'chkpt' 65 | log_dir: 'logs' 66 | keep_ckpts: 0 67 | ############################# 68 | dist_config: 69 | dist_backend: "nccl" 70 | dist_url: "tcp://localhost:54321" 71 | world_size: 1 72 | 73 | -------------------------------------------------------------------------------- /configs/singers/singer0001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0001.npy -------------------------------------------------------------------------------- /configs/singers/singer0002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0002.npy -------------------------------------------------------------------------------- /configs/singers/singer0003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0003.npy -------------------------------------------------------------------------------- /configs/singers/singer0004.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0004.npy -------------------------------------------------------------------------------- /configs/singers/singer0005.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0005.npy -------------------------------------------------------------------------------- /configs/singers/singer0006.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0006.npy -------------------------------------------------------------------------------- /configs/singers/singer0007.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0007.npy -------------------------------------------------------------------------------- /configs/singers/singer0008.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0008.npy -------------------------------------------------------------------------------- /configs/singers/singer0009.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0009.npy -------------------------------------------------------------------------------- /configs/singers/singer0010.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0010.npy -------------------------------------------------------------------------------- /configs/singers/singer0011.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0011.npy -------------------------------------------------------------------------------- /configs/singers/singer0012.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0012.npy -------------------------------------------------------------------------------- /configs/singers/singer0013.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0013.npy -------------------------------------------------------------------------------- /configs/singers/singer0014.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0014.npy -------------------------------------------------------------------------------- /configs/singers/singer0015.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0015.npy -------------------------------------------------------------------------------- /configs/singers/singer0016.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0016.npy -------------------------------------------------------------------------------- /configs/singers/singer0017.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0017.npy -------------------------------------------------------------------------------- /configs/singers/singer0018.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0018.npy -------------------------------------------------------------------------------- /configs/singers/singer0019.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0019.npy -------------------------------------------------------------------------------- /configs/singers/singer0020.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0020.npy -------------------------------------------------------------------------------- /configs/singers/singer0021.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0021.npy -------------------------------------------------------------------------------- /configs/singers/singer0022.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0022.npy -------------------------------------------------------------------------------- /configs/singers/singer0023.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0023.npy -------------------------------------------------------------------------------- /configs/singers/singer0024.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0024.npy -------------------------------------------------------------------------------- /configs/singers/singer0025.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0025.npy -------------------------------------------------------------------------------- /configs/singers/singer0026.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0026.npy -------------------------------------------------------------------------------- /configs/singers/singer0027.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0027.npy 
-------------------------------------------------------------------------------- /configs/singers/singer0028.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0028.npy -------------------------------------------------------------------------------- /configs/singers/singer0029.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0029.npy -------------------------------------------------------------------------------- /configs/singers/singer0030.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0030.npy -------------------------------------------------------------------------------- /configs/singers/singer0031.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0031.npy -------------------------------------------------------------------------------- /configs/singers/singer0032.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0032.npy -------------------------------------------------------------------------------- /configs/singers/singer0033.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0033.npy -------------------------------------------------------------------------------- /configs/singers/singer0034.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0034.npy -------------------------------------------------------------------------------- /configs/singers/singer0035.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0035.npy -------------------------------------------------------------------------------- /configs/singers/singer0036.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0036.npy -------------------------------------------------------------------------------- /configs/singers/singer0037.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0037.npy -------------------------------------------------------------------------------- /configs/singers/singer0038.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0038.npy -------------------------------------------------------------------------------- /configs/singers/singer0039.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0039.npy -------------------------------------------------------------------------------- /configs/singers/singer0040.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0040.npy -------------------------------------------------------------------------------- /configs/singers/singer0041.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0041.npy -------------------------------------------------------------------------------- /configs/singers/singer0042.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0042.npy -------------------------------------------------------------------------------- /configs/singers/singer0043.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0043.npy -------------------------------------------------------------------------------- /configs/singers/singer0044.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0044.npy -------------------------------------------------------------------------------- /configs/singers/singer0045.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0045.npy -------------------------------------------------------------------------------- /configs/singers/singer0046.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0046.npy -------------------------------------------------------------------------------- /configs/singers/singer0047.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0047.npy -------------------------------------------------------------------------------- /configs/singers/singer0048.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0048.npy -------------------------------------------------------------------------------- /configs/singers/singer0049.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0049.npy -------------------------------------------------------------------------------- /configs/singers/singer0050.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0050.npy -------------------------------------------------------------------------------- /configs/singers/singer0051.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0051.npy -------------------------------------------------------------------------------- /configs/singers/singer0052.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0052.npy -------------------------------------------------------------------------------- /configs/singers/singer0053.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0053.npy -------------------------------------------------------------------------------- /configs/singers/singer0054.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0054.npy -------------------------------------------------------------------------------- /configs/singers/singer0055.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0055.npy -------------------------------------------------------------------------------- /configs/singers/singer0056.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0056.npy -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/031.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/031.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/032.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/032.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/033.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/033.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/034.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/034.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/035.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/035.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/010.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/011.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/012.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/013.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/013.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/014.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/015.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/015.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/020.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/020.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/021.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/021.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/022.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/022.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/023.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/023.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/024.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/024.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/025.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/025.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/006.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/007.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/008.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/009.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/010.wav -------------------------------------------------------------------------------- /crepe/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Max 
Morrison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crepe/__init__.py: -------------------------------------------------------------------------------- 1 | from . import decode 2 | from .core import * 3 | from .model import Crepe 4 | from . import convert 5 | from . import filter 6 | from . import load 7 | from . import loudness 8 | from . import threshold 9 | -------------------------------------------------------------------------------- /crepe/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | 5 | import crepe 6 | 7 | 8 | ############################################################################### 9 | # Entry point 10 | ############################################################################### 11 | 12 | 13 | def parse_args(): 14 | """Parse command-line arguments""" 15 | parser = argparse.ArgumentParser() 16 | 17 | # Required arguments 18 | parser.add_argument( 19 | '--audio_files', 20 | nargs='+', 21 | required=True, 22 | help='The audio file to process') 23 | parser.add_argument( 24 | '--output_files', 25 | nargs='+', 26 | required=True, 27 | help='The file to save pitch or embedding') 28 | parser.add_argument( 29 | '--hop_length', 30 | type=int, 31 | help='The hop length of the analysis window') 32 | 33 | # Optionally save harmonicity [DEPRECATED] 34 | parser.add_argument( 35 | '--output_harmonicity_files', 36 | nargs='+', 37 | help='The file to save harmonicity') 38 | # Optionally save periodicity 39 | parser.add_argument( 40 | '--output_periodicity_files', 41 | nargs='+', 42 | help='The files to save periodicity') 43 | 44 | # Optionally create embedding instead of pitch contour 45 | parser.add_argument( 46 | '--embed', 47 | action='store_true', 48 | help='Performs embedding instead of pitch prediction') 49 | 50 | # Optional arguments 51 | parser.add_argument( 52 | '--fmin', 53 | default=50., 54 | type=float, 55 | help='The minimum frequency allowed') 56 | parser.add_argument( 57 | '--fmax', 58 | default=crepe.MAX_FMAX, 59 | type=float, 60 | help='The maximum frequency allowed') 61 | parser.add_argument( 62 | '--model', 63 | default='full', 64 | help='The model capacity. One of "tiny" or "full"') 65 | parser.add_argument( 66 | '--decoder', 67 | default='viterbi', 68 | help='The decoder to use. 
One of "argmax", "viterbi", or ' + 69 | '"weighted_argmax"') 70 | parser.add_argument( 71 | '--batch_size', 72 | type=int, 73 | help='The number of frames per batch') 74 | parser.add_argument( 75 | '--gpu', 76 | type=int, 77 | help='The gpu to perform inference on') 78 | parser.add_argument( 79 | '--no_pad', 80 | action='store_true', 81 | help='Whether to pad the audio') 82 | 83 | return parser.parse_args() 84 | 85 | 86 | def make_parent_directory(file): 87 | """Create parent directory for file if it does not already exist""" 88 | parent = os.path.dirname(os.path.abspath(file)) 89 | os.makedirs(parent, exist_ok=True) 90 | 91 | 92 | def main(): 93 | # Parse command-line arguments 94 | args = parse_args() 95 | 96 | # Deprecate output_harmonicity_files 97 | if args.output_harmonicity_files is not None: 98 | message = ( 99 | 'The crepe output_harmonicity_files argument is deprecated and ' 100 | 'will be removed in a future release. Please use ' 101 | 'output_periodicity_files. Rationale: if network confidence measured ' 102 | 'harmonic content, the value would be low for non-harmonic, periodic ' 103 | 'sounds (e.g., sine waves). But this is not observed.') 104 | warnings.warn(message, DeprecationWarning) 105 | args.output_periodicity_files = args.output_harmonicity_files 106 | 107 | # Ensure output directory exist 108 | [make_parent_directory(file) for file in args.output_files] 109 | if args.output_periodicity_files is not None: 110 | [make_parent_directory(file) for file in args.output_periodicity_files] 111 | 112 | # Get inference device 113 | device = 'cpu' if args.gpu is None else f'cuda:{args.gpu}' 114 | 115 | # Get decoder 116 | if args.decoder == 'argmax': 117 | decoder = crepe.decode.argmax 118 | elif args.decoder == 'weighted_argmax': 119 | decoder = crepe.decode.weighted_argmax 120 | elif args.decoder == 'viterbi': 121 | decoder = crepe.decode.viterbi 122 | 123 | # Infer pitch or embedding and save to disk 124 | if args.embed: 125 | crepe.embed_from_files_to_files(args.audio_files, 126 | args.output_files, 127 | args.hop_length, 128 | args.model, 129 | args.batch_size, 130 | device, 131 | not args.no_pad) 132 | else: 133 | crepe.predict_from_files_to_files(args.audio_files, 134 | args.output_files, 135 | None, 136 | args.output_periodicity_files, 137 | args.hop_length, 138 | args.fmin, 139 | args.fmax, 140 | args.model, 141 | decoder, 142 | args.batch_size, 143 | device, 144 | not args.no_pad) 145 | 146 | 147 | # Run module entry point 148 | main() 149 | -------------------------------------------------------------------------------- /crepe/assets/tiny.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/crepe/assets/tiny.pth -------------------------------------------------------------------------------- /crepe/convert.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import torch 3 | 4 | import crepe 5 | 6 | 7 | ############################################################################### 8 | # Pitch unit conversions 9 | ############################################################################### 10 | 11 | 12 | def bins_to_cents(bins): 13 | """Converts pitch bins to cents""" 14 | cents = crepe.CENTS_PER_BIN * bins + 1997.3794084376191 15 | 16 | # Trade quantization error for noise 17 | return dither(cents) 18 | 19 | 20 | def bins_to_frequency(bins): 21 | """Converts pitch bins to 
frequency in Hz""" 22 | return cents_to_frequency(bins_to_cents(bins)) 23 | 24 | 25 | def cents_to_bins(cents, quantize_fn=torch.floor): 26 | """Converts cents to pitch bins""" 27 | bins = (cents - 1997.3794084376191) / crepe.CENTS_PER_BIN 28 | return quantize_fn(bins).int() 29 | 30 | 31 | def cents_to_frequency(cents): 32 | """Converts cents to frequency in Hz""" 33 | return 10 * 2 ** (cents / 1200) 34 | 35 | 36 | def frequency_to_bins(frequency, quantize_fn=torch.floor): 37 | """Convert frequency in Hz to pitch bins""" 38 | return cents_to_bins(frequency_to_cents(frequency), quantize_fn) 39 | 40 | 41 | def frequency_to_cents(frequency): 42 | """Convert frequency in Hz to cents""" 43 | return 1200 * torch.log2(frequency / 10.) 44 | 45 | 46 | ############################################################################### 47 | # Utilities 48 | ############################################################################### 49 | 50 | 51 | def dither(cents): 52 | """Dither the predicted pitch in cents to remove quantization error""" 53 | noise = scipy.stats.triang.rvs(c=0.5, 54 | loc=-crepe.CENTS_PER_BIN, 55 | scale=2 * crepe.CENTS_PER_BIN, 56 | size=cents.size()) 57 | return cents + cents.new_tensor(noise) 58 | -------------------------------------------------------------------------------- /crepe/decode.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | 5 | import crepe 6 | 7 | 8 | ############################################################################### 9 | # Probability sequence decoding methods 10 | ############################################################################### 11 | 12 | 13 | def argmax(logits): 14 | """Sample observations by taking the argmax""" 15 | bins = logits.argmax(dim=1) 16 | 17 | # Convert to frequency in Hz 18 | return bins, crepe.convert.bins_to_frequency(bins) 19 | 20 | 21 | def weighted_argmax(logits): 22 | """Sample observations using weighted sum near the argmax""" 23 | # Find center of analysis window 24 | bins = logits.argmax(dim=1) 25 | 26 | # Find bounds of analysis window 27 | start = torch.max(torch.tensor(0, device=logits.device), bins - 4) 28 | end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5) 29 | 30 | # Mask out everything outside of window 31 | for batch in range(logits.size(0)): 32 | for time in range(logits.size(2)): 33 | logits[batch, :start[batch, time], time] = -float('inf') 34 | logits[batch, end[batch, time]:, time] = -float('inf') 35 | 36 | # Construct weights 37 | if not hasattr(weighted_argmax, 'weights'): 38 | weights = crepe.convert.bins_to_cents(torch.arange(360)) 39 | weighted_argmax.weights = weights[None, :, None] 40 | 41 | # Ensure devices are the same (no-op if they are) 42 | weighted_argmax.weights = weighted_argmax.weights.to(logits.device) 43 | 44 | # Convert to probabilities 45 | with torch.no_grad(): 46 | probs = torch.sigmoid(logits) 47 | 48 | # Apply weights 49 | cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1) 50 | 51 | # Convert to frequency in Hz 52 | return bins, crepe.convert.cents_to_frequency(cents) 53 | 54 | 55 | def viterbi(logits): 56 | """Sample observations using viterbi decoding""" 57 | # Create viterbi transition matrix 58 | if not hasattr(viterbi, 'transition'): 59 | xx, yy = np.meshgrid(range(360), range(360)) 60 | transition = np.maximum(12 - abs(xx - yy), 0) 61 | transition = transition / transition.sum(axis=1, keepdims=True) 62 | viterbi.transition = 
transition 63 | 64 | # Normalize logits 65 | with torch.no_grad(): 66 | probs = torch.nn.functional.softmax(logits, dim=1) 67 | 68 | # Convert to numpy 69 | sequences = probs.cpu().numpy() 70 | 71 | # Perform viterbi decoding 72 | bins = np.array([ 73 | librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64) 74 | for sequence in sequences]) 75 | 76 | # Convert to pytorch 77 | bins = torch.tensor(bins, device=probs.device) 78 | 79 | # Convert to frequency in Hz 80 | return bins, crepe.convert.bins_to_frequency(bins) 81 | -------------------------------------------------------------------------------- /crepe/load.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | import crepe 6 | from scipy.io import wavfile 7 | 8 | 9 | def audio(filename): 10 | """Load audio from disk""" 11 | sample_rate, audio = wavfile.read(filename) 12 | 13 | # Convert to float32 14 | if audio.dtype == np.int16: 15 | audio = audio.astype(np.float32) / np.iinfo(np.int16).max 16 | 17 | # PyTorch is not compatible with non-writeable arrays, so we make a copy 18 | return torch.tensor(np.copy(audio))[None], sample_rate 19 | 20 | 21 | def model(device, capacity='full'): 22 | """Preloads model from disk""" 23 | # Bind model and capacity 24 | crepe.infer.capacity = capacity 25 | crepe.infer.model = crepe.Crepe(capacity) 26 | 27 | # Load weights 28 | file = os.path.join(os.path.dirname(__file__), 'assets', f'{capacity}.pth') 29 | crepe.infer.model.load_state_dict( 30 | torch.load(file, map_location=device)) 31 | 32 | # Place on device 33 | crepe.infer.model = crepe.infer.model.to(torch.device(device)) 34 | 35 | # Eval mode 36 | crepe.infer.model.eval() 37 | -------------------------------------------------------------------------------- /crepe/loudness.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import librosa 4 | import numpy as np 5 | import resampy 6 | import torch 7 | 8 | import crepe 9 | 10 | 11 | ############################################################################### 12 | # Constants 13 | ############################################################################### 14 | 15 | 16 | # Minimum decibel level 17 | MIN_DB = -100. 18 | 19 | # Reference decibel level 20 | REF_DB = 20. 
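# MIN_DB is the floor applied to the A-weighted magnitudes in a_weighted() below;
# REF_DB is subtracted from the A-weighting curve computed in perceptual_weights().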
21 | 22 | 23 | ############################################################################### 24 | # A-weighted loudness 25 | ############################################################################### 26 | 27 | 28 | def a_weighted(audio, sample_rate, hop_length=None, pad=True): 29 | """Retrieve the per-frame loudness""" 30 | # Save device 31 | device = audio.device 32 | 33 | # Default hop length of 10 ms 34 | hop_length = sample_rate // 100 if hop_length is None else hop_length 35 | 36 | # Convert to numpy 37 | audio = audio.detach().cpu().numpy().squeeze(0) 38 | 39 | # Resample 40 | if sample_rate != crepe.SAMPLE_RATE: 41 | audio = resampy.resample(audio, sample_rate, crepe.SAMPLE_RATE) 42 | hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate) 43 | 44 | # Cache weights 45 | if not hasattr(a_weighted, 'weights'): 46 | a_weighted.weights = perceptual_weights() 47 | 48 | # Take stft 49 | stft = librosa.stft(audio, 50 | n_fft=crepe.WINDOW_SIZE, 51 | hop_length=hop_length, 52 | win_length=crepe.WINDOW_SIZE, 53 | center=pad, 54 | pad_mode='constant') 55 | 56 | # Compute magnitude on db scale 57 | db = librosa.amplitude_to_db(np.abs(stft)) 58 | 59 | # Apply A-weighting 60 | weighted = db + a_weighted.weights 61 | 62 | # Threshold 63 | weighted[weighted < MIN_DB] = MIN_DB 64 | 65 | # Average over weighted frequencies 66 | return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] 67 | 68 | 69 | def perceptual_weights(): 70 | """A-weighted frequency-dependent perceptual loudness weights""" 71 | frequencies = librosa.fft_frequencies(sr=crepe.SAMPLE_RATE, 72 | n_fft=crepe.WINDOW_SIZE) 73 | 74 | # A warning is raised for nearly inaudible frequencies, but it ends up 75 | # defaulting to -100 db. That default is fine for our purposes. 
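# The returned array has shape (n_fft // 2 + 1, 1), so it broadcasts across frames
# when added to the dB-scaled magnitude spectrogram in a_weighted().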
76 | with warnings.catch_warnings(): 77 | warnings.simplefilter('ignore', RuntimeWarning) 78 | return librosa.A_weighting(frequencies)[:, None] - REF_DB 79 | -------------------------------------------------------------------------------- /crepe/model.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | import crepe 7 | 8 | 9 | ########################################################################### 10 | # Model definition 11 | ########################################################################### 12 | 13 | 14 | class Crepe(torch.nn.Module): 15 | """Crepe model definition""" 16 | 17 | def __init__(self, model='full'): 18 | super().__init__() 19 | 20 | # Model-specific layer parameters 21 | if model == 'full': 22 | in_channels = [1, 1024, 128, 128, 128, 256] 23 | out_channels = [1024, 128, 128, 128, 256, 512] 24 | self.in_features = 2048 25 | elif model == 'tiny': 26 | in_channels = [1, 128, 16, 16, 16, 32] 27 | out_channels = [128, 16, 16, 16, 32, 64] 28 | self.in_features = 256 29 | else: 30 | raise ValueError(f'Model {model} is not supported') 31 | 32 | # Shared layer parameters 33 | kernel_sizes = [(512, 1)] + 5 * [(64, 1)] 34 | strides = [(4, 1)] + 5 * [(1, 1)] 35 | 36 | # Overload with eps and momentum conversion given by MMdnn 37 | batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, 38 | eps=0.0010000000474974513, 39 | momentum=0.0) 40 | 41 | # Layer definitions 42 | self.conv1 = torch.nn.Conv2d( 43 | in_channels=in_channels[0], 44 | out_channels=out_channels[0], 45 | kernel_size=kernel_sizes[0], 46 | stride=strides[0]) 47 | self.conv1_BN = batch_norm_fn( 48 | num_features=out_channels[0]) 49 | 50 | self.conv2 = torch.nn.Conv2d( 51 | in_channels=in_channels[1], 52 | out_channels=out_channels[1], 53 | kernel_size=kernel_sizes[1], 54 | stride=strides[1]) 55 | self.conv2_BN = batch_norm_fn( 56 | num_features=out_channels[1]) 57 | 58 | self.conv3 = torch.nn.Conv2d( 59 | in_channels=in_channels[2], 60 | out_channels=out_channels[2], 61 | kernel_size=kernel_sizes[2], 62 | stride=strides[2]) 63 | self.conv3_BN = batch_norm_fn( 64 | num_features=out_channels[2]) 65 | 66 | self.conv4 = torch.nn.Conv2d( 67 | in_channels=in_channels[3], 68 | out_channels=out_channels[3], 69 | kernel_size=kernel_sizes[3], 70 | stride=strides[3]) 71 | self.conv4_BN = batch_norm_fn( 72 | num_features=out_channels[3]) 73 | 74 | self.conv5 = torch.nn.Conv2d( 75 | in_channels=in_channels[4], 76 | out_channels=out_channels[4], 77 | kernel_size=kernel_sizes[4], 78 | stride=strides[4]) 79 | self.conv5_BN = batch_norm_fn( 80 | num_features=out_channels[4]) 81 | 82 | self.conv6 = torch.nn.Conv2d( 83 | in_channels=in_channels[5], 84 | out_channels=out_channels[5], 85 | kernel_size=kernel_sizes[5], 86 | stride=strides[5]) 87 | self.conv6_BN = batch_norm_fn( 88 | num_features=out_channels[5]) 89 | 90 | self.classifier = torch.nn.Linear( 91 | in_features=self.in_features, 92 | out_features=crepe.PITCH_BINS) 93 | 94 | def forward(self, x, embed=False): 95 | # Forward pass through first five layers 96 | x = self.embed(x) 97 | 98 | if embed: 99 | return x 100 | 101 | # Forward pass through layer six 102 | x = self.layer(x, self.conv6, self.conv6_BN) 103 | 104 | # shape=(batch, self.in_features) 105 | x = x.permute(0, 2, 1, 3).reshape(-1, self.in_features) 106 | 107 | # Compute logits 108 | return torch.sigmoid(self.classifier(x)) 109 | 110 | 
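# A minimal usage sketch (assumes one 1024-sample frame of 16 kHz audio, the window
# size this network expects; pretrained weights are normally loaded via crepe.load.model()):
#
#   model = Crepe('tiny')
#   frame = torch.randn(1, 1024)        # a single analysis frame
#   probs = model(frame)                # (1, crepe.PITCH_BINS) sigmoid activations
#   hz = crepe.convert.bins_to_frequency(probs.argmax(dim=1))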
########################################################################### 111 | # Forward pass utilities 112 | ########################################################################### 113 | 114 | def embed(self, x): 115 | """Map input audio to pitch embedding""" 116 | # shape=(batch, 1, 1024, 1) 117 | x = x[:, None, :, None] 118 | 119 | # Forward pass through first five layers 120 | x = self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)) 121 | x = self.layer(x, self.conv2, self.conv2_BN) 122 | x = self.layer(x, self.conv3, self.conv3_BN) 123 | x = self.layer(x, self.conv4, self.conv4_BN) 124 | x = self.layer(x, self.conv5, self.conv5_BN) 125 | 126 | return x 127 | 128 | def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): 129 | """Forward pass through one layer""" 130 | x = F.pad(x, padding) 131 | x = conv(x) 132 | x = F.relu(x) 133 | x = batch_norm(x) 134 | return F.max_pool2d(x, (2, 1), (2, 1)) 135 | -------------------------------------------------------------------------------- /crepe/threshold.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import crepe 5 | 6 | 7 | ############################################################################### 8 | # Pitch thresholding methods 9 | ############################################################################### 10 | 11 | 12 | class At: 13 | """Simple thresholding at a specified probability value""" 14 | 15 | def __init__(self, value): 16 | self.value = value 17 | 18 | def __call__(self, pitch, periodicity): 19 | # Make a copy to prevent in-place modification 20 | pitch = torch.clone(pitch) 21 | 22 | # Threshold 23 | pitch[periodicity < self.value] = crepe.UNVOICED 24 | return pitch 25 | 26 | 27 | class Hysteresis: 28 | """Hysteresis thresholding""" 29 | 30 | def __init__(self, 31 | lower_bound=.19, 32 | upper_bound=.31, 33 | width=.2, 34 | stds=1.7, 35 | return_threshold=False): 36 | self.lower_bound = lower_bound 37 | self.upper_bound = upper_bound 38 | self.width = width 39 | self.stds = stds 40 | self.return_threshold = return_threshold 41 | 42 | def __call__(self, pitch, periodicity): 43 | # Save output device 44 | device = pitch.device 45 | 46 | # Perform hysteresis in log-2 space 47 | pitch = torch.log2(pitch).detach().flatten().cpu().numpy() 48 | 49 | # Flatten periodicity 50 | periodicity = periodicity.flatten().cpu().numpy() 51 | 52 | # Ignore confidently unvoiced pitch 53 | pitch[periodicity < self.lower_bound] = crepe.UNVOICED 54 | 55 | # Whiten pitch 56 | mean, std = np.nanmean(pitch), np.nanstd(pitch) 57 | pitch = (pitch - mean) / std 58 | 59 | # Require high confidence to make predictions far from the mean 60 | parabola = self.width * pitch ** 2 - self.width * self.stds ** 2 61 | threshold = \ 62 | self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound) 63 | threshold[np.isnan(threshold)] = self.lower_bound 64 | 65 | # Apply hysteresis to prevent short, unconfident voiced regions 66 | i = 0 67 | while i < len(periodicity) - 1: 68 | 69 | # Detect unvoiced to voiced transition 70 | if periodicity[i] < threshold[i] and \ 71 | periodicity[i + 1] > threshold[i + 1]: 72 | 73 | # Grow region until next unvoiced or end of array 74 | start, end, keep = i + 1, i + 1, False 75 | while end < len(periodicity) and \ 76 | periodicity[end] > threshold[end]: 77 | if periodicity[end] > self.upper_bound: 78 | keep = True 79 | end += 1 80 | 81 | # Force unvoiced if we didn't pass the confidence required by 82 | # the hysteresis 83 | 
if not keep: 84 | threshold[start:end] = 1 85 | 86 | i = end 87 | 88 | else: 89 | i += 1 90 | 91 | # Remove pitch with low periodicity 92 | pitch[periodicity < threshold] = crepe.UNVOICED 93 | 94 | # Unwhiten 95 | pitch = pitch * std + mean 96 | 97 | # Convert to Hz 98 | pitch = torch.tensor(2 ** pitch, device=device)[None, :] 99 | 100 | # Optionally return threshold 101 | if self.return_threshold: 102 | return pitch, torch.tensor(threshold, device=device) 103 | 104 | return pitch 105 | 106 | 107 | ############################################################################### 108 | # Periodicity thresholding methods 109 | ############################################################################### 110 | 111 | 112 | class Silence: 113 | """Set periodicity to zero in silent regions""" 114 | 115 | def __init__(self, value=-60): 116 | self.value = value 117 | 118 | def __call__(self, 119 | periodicity, 120 | audio, 121 | sample_rate=crepe.SAMPLE_RATE, 122 | hop_length=None, 123 | pad=True): 124 | # Don't modify in-place 125 | periodicity = torch.clone(periodicity) 126 | 127 | # Compute loudness 128 | loudness = crepe.loudness.a_weighted( 129 | audio, sample_rate, hop_length, pad) 130 | 131 | # Threshold silence 132 | periodicity[loudness < self.value] = 0. 133 | 134 | return periodicity 135 | -------------------------------------------------------------------------------- /feature_retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import * 2 | from .train import * 3 | from .transform import * 4 | from .retrieval import * 5 | -------------------------------------------------------------------------------- /feature_retrieval/retrieval.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | 4 | import torch 5 | 6 | from feature_retrieval import FaissRetrievableFeatureIndex 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class IRetrieval(abc.ABC): 12 | @abc.abstractmethod 13 | def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: 14 | raise NotImplementedError 15 | 16 | @abc.abstractmethod 17 | def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: 18 | raise NotImplementedError 19 | 20 | 21 | class DummyRetrieval(IRetrieval): 22 | def retriv_whisper(self, vec: torch.FloatTensor) -> torch.FloatTensor: 23 | logger.debug("start dummy retriv whisper") 24 | return vec.clone().to(torch.device("cpu")) 25 | 26 | def retriv_hubert(self, vec: torch.FloatTensor) -> torch.FloatTensor: 27 | logger.debug("start dummy retriv hubert") 28 | return vec.clone().to(torch.device("cpu")) 29 | 30 | 31 | class FaissIndexRetrieval(IRetrieval): 32 | def __init__(self, hubert_index: FaissRetrievableFeatureIndex, whisper_index: FaissRetrievableFeatureIndex) -> None: 33 | self._hubert_index = hubert_index 34 | self._whisper_index = whisper_index 35 | 36 | def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: 37 | logger.debug("start retriv whisper") 38 | np_vec = self._whisper_index.retriv(vec.numpy()) 39 | return torch.from_numpy(np_vec) 40 | 41 | def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: 42 | logger.debug("start retriv hubert") 43 | np_vec = self._hubert_index.retriv(vec.numpy()) 44 | return torch.from_numpy(np_vec) 45 | -------------------------------------------------------------------------------- /feature_retrieval/train.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 
| from typing import cast 3 | 4 | import numpy as np 5 | 6 | from feature_retrieval import NumpyArray 7 | from feature_retrieval.index import FaissIVFFlatTrainableFeatureIndexBuilder, logger 8 | from feature_retrieval.transform import IFeatureMatrixTransform 9 | 10 | 11 | def train_index( 12 | features_path: Path, 13 | index_save_filepath: Path, 14 | index_builder: FaissIVFFlatTrainableFeatureIndexBuilder, 15 | feature_transform: IFeatureMatrixTransform, 16 | ) -> None: 17 | logger.info("start getting feature vectors from %s", features_path.absolute()) 18 | feature_matrix = get_feature_matrix(features_path) 19 | logger.debug("fetched %s features", feature_matrix.shape[0]) 20 | 21 | logger.info("apply transform to feature matrix") 22 | feature_matrix = feature_transform.transform(feature_matrix) 23 | num_vectors, vector_dim = feature_matrix.shape 24 | logger.debug("features transformed. Current features %s", num_vectors) 25 | 26 | feature_index = index_builder.build(num_vectors=num_vectors, vector_dim=vector_dim) 27 | logger.info("adding features to index with training") 28 | 29 | feature_index.add_with_train(feature_matrix) 30 | feature_index.save(index_save_filepath) 31 | logger.info("index saved to %s", index_save_filepath.absolute()) 32 | 33 | 34 | def get_feature_matrix(features_dir_path: Path) -> NumpyArray: 35 | matrices = [np.load(str(features_path)) for features_path in features_dir_path.rglob("*.npy")] 36 | feature_matrix = np.concatenate(matrices, axis=0) 37 | return cast(NumpyArray, feature_matrix) 38 | -------------------------------------------------------------------------------- /feature_retrieval/transform.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | from typing import cast, Callable 4 | 5 | from sklearn.cluster import MiniBatchKMeans 6 | 7 | from feature_retrieval.index import NumpyArray 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class IFeatureMatrixTransform: 14 | """Interface for transform encoded voice feature from (n_features,vector_dim) to (m_features,vector_dim)""" 15 | 16 | @abc.abstractmethod 17 | def transform(self, matrix: NumpyArray) -> NumpyArray: 18 | """transform given feature matrix from (n_features,vector_dim) to (m_features,vector_dim)""" 19 | raise NotImplementedError 20 | 21 | 22 | class DummyFeatureTransform(IFeatureMatrixTransform): 23 | """do nothing""" 24 | 25 | def transform(self, matrix: NumpyArray) -> NumpyArray: 26 | return matrix 27 | 28 | 29 | class MinibatchKmeansFeatureTransform(IFeatureMatrixTransform): 30 | """replaces number of examples with k-means centroids using minibatch algorythm""" 31 | 32 | def __init__(self, n_clusters: int, n_parallel: int) -> None: 33 | self._n_clusters = n_clusters 34 | self._n_parallel = n_parallel 35 | 36 | @property 37 | def _batch_size(self) -> int: 38 | return self._n_parallel * 256 39 | 40 | def transform(self, matrix: NumpyArray) -> NumpyArray: 41 | """transform given feature matrix from (n_features,vector_dim) to (n_clusters,vector_dim)""" 42 | cluster = MiniBatchKMeans( 43 | n_clusters=self._n_clusters, 44 | verbose=True, 45 | batch_size=self._batch_size, 46 | compute_labels=False, 47 | init="k-means++", 48 | ) 49 | return cast(NumpyArray, cluster.fit(matrix).cluster_centers_) 50 | 51 | 52 | class OnConditionFeatureTransform(IFeatureMatrixTransform): 53 | """call given transform if condition is True else call otherwise transform""" 54 | 55 | def __init__( 56 | self, 57 | condition: Callable[[NumpyArray], 
bool], 58 | on_condition: IFeatureMatrixTransform, 59 | otherwise: IFeatureMatrixTransform, 60 | ) -> None: 61 | self._condition = condition 62 | self._on_condition = on_condition 63 | self._otherwise = otherwise 64 | 65 | def transform(self, matrix: NumpyArray) -> NumpyArray: 66 | if self._condition(matrix): 67 | transform_name = self._on_condition.__class__.__name__ 68 | logger.info(f"pass condition. Transform by rule {transform_name}") 69 | return self._on_condition.transform(matrix) 70 | transform_name = self._otherwise.__class__.__name__ 71 | logger.info(f"condition is not passed. Transform by rule {transform_name}") 72 | return self._otherwise.transform(matrix) 73 | -------------------------------------------------------------------------------- /hubert/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Benjamin van Niekerk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
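As a usage note for the feature-transform classes defined above in `feature_retrieval/transform.py`: `OnConditionFeatureTransform` lets the index-training step cluster only when there are enough vectors, and otherwise fall through to the no-op transform. A minimal sketch; the 20,000-vector threshold and the toy matrix are illustrative assumptions, not values taken from this repository:

```python
import numpy as np

from feature_retrieval.transform import (
    DummyFeatureTransform,
    MinibatchKmeansFeatureTransform,
    OnConditionFeatureTransform,
)

# Cluster down to 10,000 centroids only when the speaker has enough vectors;
# otherwise keep the feature matrix untouched.
transform = OnConditionFeatureTransform(
    condition=lambda matrix: matrix.shape[0] > 20_000,   # assumed threshold
    on_condition=MinibatchKmeansFeatureTransform(n_clusters=10_000, n_parallel=4),
    otherwise=DummyFeatureTransform(),
)

features = np.random.rand(5_000, 256).astype(np.float32)  # toy (n_features, dim) matrix
reduced = transform.transform(features)                    # too small, dummy branch runs
print(reduced.shape)                                       # (5000, 256)
```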
22 | -------------------------------------------------------------------------------- /hubert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/hubert/__init__.py -------------------------------------------------------------------------------- /hubert/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import librosa 7 | 8 | from hubert import hubert_model 9 | 10 | 11 | def load_audio(file: str, sr: int = 16000): 12 | x, sr = librosa.load(file, sr=sr) 13 | return x 14 | 15 | 16 | def load_model(path, device): 17 | model = hubert_model.hubert_soft(path) 18 | model.eval() 19 | if not (device == "cpu"): 20 | model.half() 21 | model.to(device) 22 | return model 23 | 24 | 25 | def pred_vec(model, wavPath, vecPath, device): 26 | audio = load_audio(wavPath) 27 | audln = audio.shape[0] 28 | vec_a = [] 29 | idx_s = 0 30 | while (idx_s + 20 * 16000 < audln): 31 | feats = audio[idx_s:idx_s + 20 * 16000] 32 | feats = torch.from_numpy(feats).to(device) 33 | feats = feats[None, None, :] 34 | if not (device == "cpu"): 35 | feats = feats.half() 36 | with torch.no_grad(): 37 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 38 | vec_a.extend(vec) 39 | idx_s = idx_s + 20 * 16000 40 | if (idx_s < audln): 41 | feats = audio[idx_s:audln] 42 | feats = torch.from_numpy(feats).to(device) 43 | feats = feats[None, None, :] 44 | if not (device == "cpu"): 45 | feats = feats.half() 46 | with torch.no_grad(): 47 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 48 | # print(vec.shape) # [length, dim=256] hop=320 49 | vec_a.extend(vec) 50 | np.save(vecPath, vec_a, allow_pickle=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 56 | parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) 57 | args = parser.parse_args() 58 | print(args.wav) 59 | print(args.vec) 60 | 61 | wavPath = args.wav 62 | vecPath = args.vec 63 | 64 | device = "cuda" if torch.cuda.is_available() else "cpu" 65 | hubert = load_model(os.path.join( 66 | "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device) 67 | pred_vec(hubert, wavPath, vecPath, device) 68 | -------------------------------------------------------------------------------- /hubert_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | hubert-soft-0d54a1f4.pt -------------------------------------------------------------------------------- /pitch/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import load_csv_pitch -------------------------------------------------------------------------------- /pitch/core/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Sebastian Rosenzweig, Simon Schwär, Meinard Müller, International Audio Laboratories Erlangen, Germany. 4 | We thank the German Research Foundation (DFG) for various research grants that 5 | allow us for conducting fundamental research in music processing. 
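Stepping back to `hubert/inference.py` above: `pred_vec()` walks the audio in 20-second windows so long files fit in GPU memory, then encodes whatever remainder is left, so nothing is dropped. A small sketch of the window arithmetic that loop implements; the 45.5-second duration is an arbitrary illustration:

```python
SR = 16000
CHUNK = 20 * SR   # 20-second window used by pred_vec()


def chunk_bounds(num_samples: int):
    """Yield (start, end) sample indices exactly as the while/if logic above does."""
    idx = 0
    while idx + CHUNK < num_samples:
        yield idx, idx + CHUNK
        idx += CHUNK
    if idx < num_samples:
        yield idx, num_samples


print(list(chunk_bounds(int(45.5 * SR))))
# [(0, 320000), (320000, 640000), (640000, 728000)]
```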
6 | The International Audio Laboratories Erlangen are a joint institution of the 7 | Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer 8 | Institute for Integrated Circuits IIS. 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy of 11 | this software and associated documentation files (the "Software"), to deal in 12 | the Software without restriction, including without limitation the rights to 13 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 14 | the Software, and to permit persons to whom the Software is furnished to do so, 15 | subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /pitch/core/README.md: -------------------------------------------------------------------------------- 1 | This repository contains a Python package called libf0 which provides open-source implementations for four popular model-based F0-estimation approaches, YIN (Cheveigné & Kawahara, 2002), pYIN (Mauch & Dixon, 2014), an approach inspired by Melodia (Salamon & Gómez, 2012), and SWIPE (Camacho & Harris, 2008). 2 | 3 | If you use the libf0 in your research, please consider the following references. 4 | 5 | ## References 6 | 7 | Sebastian Rosenzweig, Simon Schwär, and Meinard Müller. 8 | [A Python Library for Fundamental Frequency Estimation.](https://archives.ismir.net/ismir2022/latebreaking/000003.pdf) 9 | In Late Breaking Demos of the International Society for Music Information Retrieval Conference (ISMIR), Bengaluru, India, 2022. 10 | 11 | Alain de Cheveigné and Hideki Kawahara. 12 | YIN, a fundamental frequency estimator for speech and music. Journal of the Acoustical Society of America (JASA), 111(4):1917–1930, 2002. 13 | 14 | Matthias Mauch and Simon Dixon. 15 | pYIN: A fundamental frequency estimator using probabilistic threshold distributions. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 659–663, Florence, Italy, 2014. 16 | 17 | Justin Salamon and Emilia Gómez. 18 | Melody extraction from polyphonic music signals using pitch contour characteristics. IEEE Transactions on Audio, Speech, and Language Processing, 20(6): 19 | 1759–1770, 2012. 20 | 21 | Arturo Camacho and John G. Harris. 22 | A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America, 124(3):1638–1652, 2008. 23 | 24 | Meinard Müller. Fundamentals of Music Processing – Using Python and Jupyter Notebooks. Springer Verlag, 2nd edition, 2021. ISBN 978-3-030-69807-2. doi: 10.1007/978-3-030-69808-9. 25 | 26 | ## Documentation 27 | There is also an API documentation for libf0: 28 | 29 | https://groupmm.github.io/libf0 30 | 31 | ## Contributing 32 | 33 | We are happy for suggestions and contributions. 
We would be grateful for either directly contacting us via email (meinard.mueller@audiolabs-erlangen.de) or for creating an issue in our Github repository. Please do not submit a pull request without prior consultation with us. 34 | 35 | ## Licence 36 | 37 | The code for this toolbox is published under an MIT licence. 38 | 39 | ## Acknowledgements 40 | 41 | This work was supported by the German Research Foundation (MU 2686/13-1, SCHE 280/20-1). We thank Edgar Suárez and Vojtěch Pešek for helping with the implementations. Furthermore, we thank Fatemeh Eftekhar and Maryam Pirmoradi for testing the toolbox. The International Audio Laboratories Erlangen are a joint institution of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer Institute for Integrated Circuits IIS. 42 | -------------------------------------------------------------------------------- /pitch/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/pitch/core/__init__.py -------------------------------------------------------------------------------- /pitch/core/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | | Description: libf0 utility functions 3 | | Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller 4 | | License: The MIT license, https://opensource.org/licenses/MIT 5 | | This file is part of libf0. 6 | """ 7 | import numpy as np 8 | 9 | 10 | def sonify_trajectory_with_sinusoid(f0, t, audio_len, confidence=None, Fs=22050, smooth_len=11): 11 | """ 12 | Sonification of trajectory with sinusoidal. Adapted from FMP notebook: C8/C8S2_FundFreqTracking.ipynb 13 | 14 | Parameters 15 | ---------- 16 | f0 : ndarray 17 | F0-trajectory 18 | t : ndarray 19 | Time axis 20 | audio_len : int 21 | Desired audio length in samples 22 | confidence : None or ndarray 23 | Confidence values for amplitude control 24 | Fs : int 25 | Sampling rate 26 | smooth_len : int 27 | Smoothing filter length to avoid clicks in the sonification 28 | 29 | Returns 30 | ------- 31 | x_soni : ndarray 32 | Sonified F0-trajectory 33 | """ 34 | if confidence is None: 35 | confidence = np.ones_like(f0) 36 | 37 | # initialize 38 | x_soni = np.zeros(audio_len) 39 | amplitude_mod = np.zeros(audio_len) 40 | 41 | # Computation of hop size 42 | sine_len = int(t[1] * Fs) 43 | 44 | t = np.arange(0, sine_len) / Fs 45 | phase = 0 46 | 47 | # loop over all F0 values, ensure continuous phase 48 | for idx in np.arange(0, len(f0)): 49 | cur_f = f0[idx] 50 | cur_amp = confidence[idx] 51 | 52 | if cur_f == 0: 53 | phase = 0 54 | continue 55 | 56 | cur_soni = np.sin(2*np.pi*(cur_f*t+phase)) 57 | diff = np.maximum(0, (idx+1)*sine_len - len(x_soni)) 58 | if diff > 0: 59 | x_soni[idx * sine_len:(idx + 1) * sine_len - diff] = cur_soni[:-diff] 60 | amplitude_mod[idx * sine_len:(idx + 1) * sine_len - diff] = cur_amp 61 | else: 62 | x_soni[idx*sine_len:(idx+1)*sine_len-diff] = cur_soni 63 | amplitude_mod[idx*sine_len:(idx+1)*sine_len-diff] = cur_amp 64 | 65 | phase += cur_f * sine_len / Fs 66 | phase -= 2 * np.round(phase/2) 67 | 68 | # filter amplitudes to avoid transients 69 | amplitude_mod = np.convolve(amplitude_mod, np.hanning(smooth_len)/np.sum(np.hanning(smooth_len)), 'same') 70 | x_soni = x_soni * amplitude_mod 71 | return x_soni 72 | 73 | 74 | def hz_to_cents(F, F_ref=55.0): 75 | """ 76 | Converts frequency in Hz to cents. 
77 | 78 | Parameters 79 | ---------- 80 | F : float or ndarray 81 | Frequency value in Hz 82 | F_ref : float 83 | Reference frequency in Hz (Default value = 55.0) 84 | Returns 85 | ------- 86 | F_cents : float or ndarray 87 | Frequency in cents 88 | """ 89 | 90 | # Avoid division by 0 91 | F_temp = np.array(F).astype(float) 92 | F_temp[F_temp == 0] = np.nan 93 | 94 | F_cents = 1200 * np.log2(F_temp / F_ref) 95 | 96 | return F_cents 97 | 98 | 99 | def cents_to_hz(F_cents, F_ref=55.0): 100 | """ 101 | Converts frequency in cents to Hz. 102 | 103 | Parameters 104 | ---------- 105 | F_cents : float or ndarray 106 | Frequency in cents 107 | F_ref : float 108 | Reference frequency in Hz (Default value = 55.0) 109 | Returns 110 | ------- 111 | F : float or ndarray 112 | Frequency in Hz 113 | """ 114 | F = F_ref * 2 ** (F_cents / 1200) 115 | 116 | # Avoid NaN output 117 | F = np.nan_to_num(F, copy=False, nan=0) 118 | 119 | return F 120 | -------------------------------------------------------------------------------- /pitch/debug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | 5 | def save_csv_pitch(pitch, path): 6 | with open(path, "w", encoding='utf-8') as pitch_file: 7 | for i in range(len(pitch)): 8 | t = i * 10 9 | minute = t // 60000 10 | seconds = (t - minute * 60000) // 1000 11 | millisecond = t % 1000 12 | print( 13 | f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # pit for train 19 | args = parser.parse_args() 20 | print(args.pit) 21 | 22 | pitch = np.load(args.pit) 23 | save_csv_pitch(pitch, 'pitch_debug.csv') 24 | -------------------------------------------------------------------------------- /pitch/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | import crepe 8 | 9 | 10 | def move_average(a, n, mode="same"): 11 | return (np.convolve(a, np.ones((n,))/n, mode=mode)) 12 | 13 | 14 | def compute_f0_mouth(path, device): 15 | # pip install praat-parselmouth 16 | import parselmouth 17 | 18 | x, sr = librosa.load(path, sr=16000) 19 | assert sr == 16000 20 | lpad = 1024 // 160 21 | rpad = lpad 22 | f0 = parselmouth.Sound(x, sr).to_pitch_ac( 23 | time_step=160 / sr, 24 | voicing_threshold=0.5, 25 | pitch_floor=30, 26 | pitch_ceiling=1000).selected_array['frequency'] 27 | f0 = np.pad(f0, [[lpad, rpad]], mode='constant') 28 | return f0 29 | 30 | 31 | def compute_f0_salience(filename, device): 32 | from pitch.core.salience import salience 33 | audio, sr = librosa.load(filename, sr=16000) 34 | assert sr == 16000 35 | f0, t, s = salience( 36 | audio, 37 | Fs=sr, 38 | H=320, 39 | N=2048, 40 | F_min=45.0, 41 | F_max=1760.0) 42 | f0 = np.repeat(f0, 2, -1) # 320 -> 160 * 2 43 | f0 = move_average(f0, 3) 44 | return f0 45 | 46 | 47 | def compute_f0_voice(filename, device): 48 | audio, sr = librosa.load(filename, sr=16000) 49 | assert sr == 16000 50 | audio = torch.tensor(np.copy(audio))[None] 51 | audio = audio + torch.randn_like(audio) * 0.001 52 | # Here we'll use a 10 millisecond hop length 53 | hop_length = 160 54 | fmin = 50 55 | fmax = 1000 56 | model = "full" 57 | batch_size = 512 58 | pitch = 
crepe.predict( 59 | audio, 60 | sr, 61 | hop_length, 62 | fmin, 63 | fmax, 64 | model, 65 | batch_size=batch_size, 66 | device=device, 67 | return_periodicity=False, 68 | ) 69 | pitch = crepe.filter.mean(pitch, 3) 70 | pitch = pitch.squeeze(0) 71 | return pitch 72 | 73 | 74 | def compute_f0_sing(filename, device): 75 | audio, sr = librosa.load(filename, sr=16000) 76 | assert sr == 16000 77 | audio = torch.tensor(np.copy(audio))[None] 78 | audio = audio + torch.randn_like(audio) * 0.001 79 | # Here we'll use a 20 millisecond hop length 80 | hop_length = 320 81 | fmin = 50 82 | fmax = 1000 83 | model = "full" 84 | batch_size = 512 85 | pitch = crepe.predict( 86 | audio, 87 | sr, 88 | hop_length, 89 | fmin, 90 | fmax, 91 | model, 92 | batch_size=batch_size, 93 | device=device, 94 | return_periodicity=False, 95 | ) 96 | pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2 97 | pitch = crepe.filter.mean(pitch, 5) 98 | pitch = pitch.squeeze(0) 99 | return pitch 100 | 101 | 102 | def save_csv_pitch(pitch, path): 103 | with open(path, "w", encoding='utf-8') as pitch_file: 104 | for i in range(len(pitch)): 105 | t = i * 10 106 | minute = t // 60000 107 | seconds = (t - minute * 60000) // 1000 108 | millisecond = t % 1000 109 | print( 110 | f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) 111 | 112 | 113 | def load_csv_pitch(path): 114 | pitch = [] 115 | with open(path, "r", encoding='utf-8') as pitch_file: 116 | for line in pitch_file.readlines(): 117 | pit = line.strip().split(",")[-1] 118 | pitch.append(int(pit)) 119 | return pitch 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 125 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # csv for excel 126 | args = parser.parse_args() 127 | print(args.wav) 128 | print(args.pit) 129 | 130 | device = "cuda" if torch.cuda.is_available() else "cpu" 131 | pitch = compute_f0_sing(args.wav, device) 132 | save_csv_pitch(pitch, args.pit) 133 | # tmp = load_csv_pitch(args.pit) 134 | # save_csv_pitch(tmp, "tmp.csv") 135 | -------------------------------------------------------------------------------- /prepare/preprocess_a.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | from scipy.io import wavfile 8 | 9 | 10 | def resample_wave(wav_in, wav_out, sample_rate): 11 | wav, _ = librosa.load(wav_in, sr=sample_rate) 12 | wav = wav / np.abs(wav).max() * 0.6 13 | wav = wav / max(0.01, np.max(np.abs(wav))) * 32767 * 0.6 14 | wavfile.write(wav_out, sample_rate, wav.astype(np.int16)) 15 | 16 | 17 | def process_file(file, wavPath, spks, outPath, sr): 18 | if file.endswith(".wav"): 19 | file = file[:-4] 20 | resample_wave(f"{wavPath}/{spks}/{file}.wav", f"{outPath}/{spks}/{file}.wav", sr) 21 | 22 | 23 | def process_files_with_thread_pool(wavPath, spks, outPath, sr, thread_num=None): 24 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 25 | 26 | with ThreadPoolExecutor(max_workers=thread_num) as executor: 27 | futures = {executor.submit(process_file, file, wavPath, spks, outPath, sr): file for file in files} 28 | 29 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing {sr} {spks}'): 30 | future.result() 31 | 32 | 33 | if __name__ == "__main__": 34 | 
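A worked example of the two scaling steps in `resample_wave()` above: any non-silent input ends up peak-normalised to 60 % of int16 full scale (about 19660), regardless of its original level. The toy waveform below is illustrative only:

```python
import numpy as np

wav = np.array([0.05, -0.9, 0.3], dtype=np.float32)   # toy waveform, peak 0.9

wav = wav / np.abs(wav).max() * 0.6                   # first step: peak -> 0.6
wav = wav / max(0.01, np.max(np.abs(wav))) * 32767 * 0.6  # second step: peak -> 32767 * 0.6

print(np.abs(wav).max())   # ~19660.2
```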
parser = argparse.ArgumentParser() 35 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 36 | parser.add_argument("-o", "--out", help="out", dest="out", required=True) 37 | parser.add_argument("-s", "--sr", help="sample rate", dest="sr", type=int, required=True) 38 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 39 | 40 | args = parser.parse_args() 41 | print(args.wav) 42 | print(args.out) 43 | print(args.sr) 44 | 45 | os.makedirs(args.out, exist_ok=True) 46 | wavPath = args.wav 47 | outPath = args.out 48 | 49 | assert args.sr == 16000 or args.sr == 32000 50 | 51 | for spks in os.listdir(wavPath): 52 | if os.path.isdir(f"./{wavPath}/{spks}"): 53 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 54 | if args.thread_count == 0: 55 | process_num = os.cpu_count() // 2 + 1 56 | else: 57 | process_num = args.thread_count 58 | process_files_with_thread_pool(wavPath, spks, outPath, args.sr, process_num) 59 | -------------------------------------------------------------------------------- /prepare/preprocess_cdc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torchaudio 5 | 6 | from tqdm import tqdm 7 | from scipy.io.wavfile import read 8 | from scipy.io.wavfile import write 9 | # torch=1.9.0 -> pip install torchaudio==0.9.0 -i https://mirrors.aliyun.com/pypi/simple/ 10 | # this file is for VCTK 11 | 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def cut_direct_content(iWave, oWave): 17 | source, sr = torchaudio.load(iWave) 18 | stft = torch.stft(source, 1024, 256, 1024, torch.hann_window(1024), return_complex=True) 19 | stft[:, 0, :] = 0 20 | stft[:, 1, :] = 0 21 | istft = torch.istft(stft, 1024, 256, 1024, torch.hann_window(1024)) 22 | audio = istft.squeeze() 23 | audio = MAX_WAV_VALUE * audio 24 | audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1) 25 | audio = audio.short() 26 | audio = audio.data.cpu().detach().numpy() 27 | write(oWave, sr, audio) 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-i", help="input path", dest="inPath", required=True) 33 | parser.add_argument("-o", help="output path", dest="outPath", required=True) 34 | 35 | args = parser.parse_args() 36 | print(args.inPath) 37 | print(args.outPath) 38 | 39 | os.makedirs(args.outPath, exist_ok=True) 40 | rootPath = args.inPath 41 | outPath = args.outPath 42 | 43 | for spks in os.listdir(rootPath): 44 | if (os.path.isdir(f"./{rootPath}/{spks}")): 45 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 46 | 47 | files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] 48 | for file in tqdm(files, desc=f'Processing cdc {spks}'): 49 | iWave = f"./{rootPath}/{spks}/{file}" 50 | oWave = f"./{outPath}/{spks}/{file}" 51 | cut_direct_content(iWave, oWave) 52 | -------------------------------------------------------------------------------- /prepare/preprocess_crepe.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import librosa 5 | import torch 6 | import crepe 7 | import argparse 8 | from tqdm import tqdm 9 | 10 | 11 | def compute_f0(filename, save, device): 12 | audio, sr = librosa.load(filename, sr=16000) 13 | assert sr == 16000 14 | # Load audio 15 | audio = 
torch.tensor(np.copy(audio))[None] 16 | audio = audio + torch.randn_like(audio) * 0.001 17 | # Here we'll use a 10 millisecond hop length 18 | hop_length = 160 19 | # Provide a sensible frequency range for your domain (upper limit is 2006 Hz) 20 | # This would be a reasonable range for speech 21 | fmin = 50 22 | fmax = 1000 23 | # Select a model capacity--one of "tiny" or "full" 24 | model = "full" 25 | # Pick a batch size that doesn't cause memory errors on your gpu 26 | batch_size = 512 27 | # Compute pitch using first gpu 28 | pitch, periodicity = crepe.predict( 29 | audio, 30 | sr, 31 | hop_length, 32 | fmin, 33 | fmax, 34 | model, 35 | batch_size=batch_size, 36 | device=device, 37 | return_periodicity=True, 38 | ) 39 | # CREPE was not trained on silent audio. some error on silent need filter.pitPath 40 | periodicity = crepe.filter.median(periodicity, 7) 41 | pitch = crepe.filter.mean(pitch, 5) 42 | pitch[periodicity < 0.5] = 0 43 | pitch = pitch.squeeze(0) 44 | np.save(save, pitch, allow_pickle=False) 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 50 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 51 | 52 | args = parser.parse_args() 53 | print(args.wav) 54 | print(args.pit) 55 | 56 | os.makedirs(args.pit, exist_ok=True) 57 | wavPath = args.wav 58 | pitPath = args.pit 59 | 60 | device = "cuda" if torch.cuda.is_available() else "cpu" 61 | 62 | for spks in os.listdir(wavPath): 63 | if os.path.isdir(f"./{wavPath}/{spks}"): 64 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 65 | 66 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 67 | for file in tqdm(files, desc=f'Processing crepe {spks}'): 68 | file = file[:-4] 69 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit", device) 70 | -------------------------------------------------------------------------------- /prepare/preprocess_f0.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import pyworld 5 | import argparse 6 | from tqdm import tqdm 7 | from concurrent.futures import ProcessPoolExecutor, as_completed 8 | 9 | 10 | def compute_f0(path, save): 11 | x, sr = librosa.load(path, sr=16000) 12 | assert sr == 16000 13 | f0, t = pyworld.dio( 14 | x.astype(np.double), 15 | fs=sr, 16 | f0_ceil=900, 17 | frame_period=1000 * 160 / sr, 18 | ) 19 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs=16000) 20 | for index, pitch in enumerate(f0): 21 | f0[index] = round(pitch, 1) 22 | np.save(save, f0, allow_pickle=False) 23 | 24 | 25 | def process_file(file, wavPath, spks, pitPath): 26 | if file.endswith(".wav"): 27 | file = file[:-4] 28 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") 29 | 30 | 31 | def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): 32 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 33 | 34 | with ProcessPoolExecutor(max_workers=process_num) as executor: 35 | futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} 36 | 37 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): 38 | future.result() 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 44 | 
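Both F0 extractors above (crepe in `preprocess_crepe.py`, pyworld in `preprocess_f0.py`) are configured for the same frame grid: one pitch value every 160 samples of 16 kHz audio, i.e. a 10 ms hop. A quick check of the arithmetic behind `frame_period=1000 * 160 / sr`:

```python
SR = 16000
HOP = 160

frame_period_ms = 1000 * HOP / SR   # value passed to pyworld.dio -> 10.0 ms
frames_per_second = SR / HOP        # 100 F0 values per second of audio

print(frame_period_ms, frames_per_second)  # 10.0 100.0
```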
parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 45 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 46 | 47 | args = parser.parse_args() 48 | print(args.wav) 49 | print(args.pit) 50 | 51 | os.makedirs(args.pit, exist_ok=True) 52 | wavPath = args.wav 53 | pitPath = args.pit 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_process_pool(wavPath, spks, pitPath, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_f0_mouth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import argparse 5 | import parselmouth 6 | # pip install praat-parselmouth 7 | from tqdm import tqdm 8 | from concurrent.futures import ProcessPoolExecutor, as_completed 9 | 10 | 11 | def compute_f0(path, save): 12 | x, sr = librosa.load(path, sr=16000) 13 | assert sr == 16000 14 | lpad = 1024 // 160 15 | rpad = lpad 16 | f0 = parselmouth.Sound(x, sr).to_pitch_ac( 17 | time_step=160 / sr, 18 | voicing_threshold=0.5, 19 | pitch_floor=30, 20 | pitch_ceiling=1000).selected_array['frequency'] 21 | f0 = np.pad(f0, [[lpad, rpad]], mode='constant') 22 | np.save(save, f0, allow_pickle=False) 23 | 24 | 25 | def process_file(file, wavPath, spks, pitPath): 26 | if file.endswith(".wav"): 27 | file = file[:-4] 28 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") 29 | 30 | 31 | def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): 32 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 33 | 34 | with ProcessPoolExecutor(max_workers=process_num) as executor: 35 | futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} 36 | 37 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): 38 | future.result() 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 44 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 45 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 46 | 47 | args = parser.parse_args() 48 | print(args.wav) 49 | print(args.pit) 50 | 51 | os.makedirs(args.pit, exist_ok=True) 52 | wavPath = args.wav 53 | pitPath = args.pit 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_process_pool(wavPath, spks, pitPath, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_hubert.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import librosa 7 | 8 | 
from tqdm import tqdm 9 | from hubert import hubert_model 10 | 11 | 12 | def load_audio(file: str, sr: int = 16000): 13 | x, sr = librosa.load(file, sr=sr) 14 | return x 15 | 16 | 17 | def load_model(path, device): 18 | model = hubert_model.hubert_soft(path) 19 | model.eval() 20 | model.half() 21 | model.to(device) 22 | return model 23 | 24 | 25 | def pred_vec(model, wavPath, vecPath, device): 26 | feats = load_audio(wavPath) 27 | feats = torch.from_numpy(feats).to(device) 28 | feats = feats[None, None, :].half() 29 | with torch.no_grad(): 30 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 31 | # print(vec.shape) # [length, dim=256] hop=320 32 | np.save(vecPath, vec, allow_pickle=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 38 | parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) 39 | 40 | args = parser.parse_args() 41 | print(args.wav) 42 | print(args.vec) 43 | os.makedirs(args.vec, exist_ok=True) 44 | 45 | wavPath = args.wav 46 | vecPath = args.vec 47 | 48 | device = "cuda" if torch.cuda.is_available() else "cpu" 49 | hubert = load_model(os.path.join("hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device) 50 | 51 | for spks in os.listdir(wavPath): 52 | if os.path.isdir(f"./{wavPath}/{spks}"): 53 | os.makedirs(f"./{vecPath}/{spks}", exist_ok=True) 54 | 55 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 56 | for file in tqdm(files, desc=f'Processing vec {spks}'): 57 | file = file[:-4] 58 | pred_vec(hubert, f"{wavPath}/{spks}/{file}.wav", f"{vecPath}/{spks}/{file}.vec", device) 59 | -------------------------------------------------------------------------------- /prepare/preprocess_ppg.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import random 7 | from tqdm import tqdm 8 | from whisper.model import Whisper, ModelDimensions 9 | from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram 10 | 11 | 12 | def load_model(path) -> Whisper: 13 | device = "cuda" if torch.cuda.is_available() else "cpu" 14 | checkpoint = torch.load(path, map_location="cpu") 15 | dims = ModelDimensions(**checkpoint["dims"]) 16 | print(dims) 17 | model = Whisper(dims) 18 | del model.decoder 19 | cut = len(model.encoder.blocks) // 4 20 | cut = -1 * cut 21 | del model.encoder.blocks[cut:] 22 | model.load_state_dict(checkpoint["model_state_dict"], strict=False) 23 | model.eval() 24 | model.half() 25 | model.to(device) 26 | return model 27 | 28 | 29 | def pred_ppg(whisper: Whisper, wavPath, ppgPath): 30 | audio = load_audio(wavPath) 31 | audln = audio.shape[0] 32 | ppgln = audln // 320 33 | audio = pad_or_trim(audio) 34 | mel = log_mel_spectrogram(audio).half().to(whisper.device) 35 | with torch.no_grad(): 36 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 37 | ppg = ppg[:ppgln,] # [length, dim=1280] 38 | # print(ppg.shape) 39 | np.save(ppgPath, ppg, allow_pickle=False) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 45 | parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) 46 | args = parser.parse_args() 47 | print(args.wav) 48 | print(args.ppg) 49 | 50 | 
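Two details of the whisper feature extraction above are worth spelling out: `load_model()` drops the decoder and the top quarter of the encoder blocks before inference, and `pred_ppg()` pads or trims the input to whisper's fixed analysis window (30 seconds by default, stated here as an assumption) but then keeps only one 1280-dim frame per 320 samples of the original clip. A minimal sketch of the length bookkeeping:

```python
SR = 16000
PPG_HOP = 320   # samples of original audio per saved PPG frame


def expected_ppg_frames(clip_seconds: float) -> int:
    """How many frames pred_ppg() keeps for a clip of the given duration."""
    audln = int(clip_seconds * SR)
    return audln // PPG_HOP


print(expected_ppg_frames(7.3))   # 365 frames for a 7.3-second clip
```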
os.makedirs(args.ppg, exist_ok=True) 51 | wavPath = args.wav 52 | ppgPath = args.ppg 53 | 54 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt")) 55 | spkPaths = os.listdir(wavPath) 56 | random.shuffle(spkPaths) 57 | 58 | for spks in spkPaths: 59 | if os.path.isdir(f"./{wavPath}/{spks}"): 60 | os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True) 61 | 62 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 63 | for file in tqdm(files, desc=f'Processing ppg {spks}'): 64 | if file.endswith(".wav"): 65 | # print(file) 66 | file = file[:-4] 67 | path_wav = f"{wavPath}/{spks}/{file}.wav" 68 | path_ppg = f"{ppgPath}/{spks}/{file}.ppg" 69 | if os.path.isfile(f"{path_ppg}.npy"): 70 | continue 71 | pred_ppg(whisper, path_wav, path_ppg) 72 | -------------------------------------------------------------------------------- /prepare/preprocess_random.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | if __name__ == "__main__": 5 | all_items = [] 6 | fo = open("./files/train_all.txt", "r+", encoding='utf-8') 7 | while (True): 8 | try: 9 | item = fo.readline().strip() 10 | except Exception as e: 11 | print('nothing of except:', e) 12 | break 13 | if (item == None or item == ""): 14 | break 15 | all_items.append(item) 16 | fo.close() 17 | 18 | random.shuffle(all_items) 19 | 20 | fw = open("./files/train_all.txt", "w", encoding="utf-8") 21 | for strs in all_items: 22 | print(strs, file=fw) 23 | fw.close() 24 | -------------------------------------------------------------------------------- /prepare/preprocess_speaker.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import numpy as np 5 | import argparse 6 | 7 | from tqdm import tqdm 8 | from functools import partial 9 | from argparse import RawTextHelpFormatter 10 | from multiprocessing.pool import ThreadPool 11 | 12 | from speaker.models.lstm import LSTMSpeakerEncoder 13 | from speaker.config import SpeakerEncoderConfig 14 | from speaker.utils.audio import AudioProcessor 15 | from speaker.infer import read_json 16 | 17 | 18 | def get_spk_wavs(dataset_path, output_path): 19 | wav_files = [] 20 | os.makedirs(f"./{output_path}", exist_ok=True) 21 | for spks in os.listdir(dataset_path): 22 | if os.path.isdir(f"./{dataset_path}/{spks}"): 23 | os.makedirs(f"./{output_path}/{spks}", exist_ok=True) 24 | for file in os.listdir(f"./{dataset_path}/{spks}"): 25 | if file.endswith(".wav"): 26 | wav_files.append(f"./{dataset_path}/{spks}/{file}") 27 | elif spks.endswith(".wav"): 28 | wav_files.append(f"./{dataset_path}/{spks}") 29 | return wav_files 30 | 31 | 32 | def process_wav(wav_file, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder): 33 | waveform = speaker_encoder_ap.load_wav( 34 | wav_file, sr=speaker_encoder_ap.sample_rate 35 | ) 36 | spec = speaker_encoder_ap.melspectrogram(waveform) 37 | spec = torch.from_numpy(spec.T) 38 | if args.use_cuda: 39 | spec = spec.cuda() 40 | spec = spec.unsqueeze(0) 41 | embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() 42 | embed = embed.squeeze() 43 | embed_path = wav_file.replace(dataset_path, output_path) 44 | embed_path = embed_path.replace(".wav", ".spk") 45 | np.save(embed_path, embed, allow_pickle=False) 46 | 47 | 48 | def extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, 
speaker_encoder, concurrency): 49 | bound_process_wav = partial(process_wav, dataset_path=dataset_path, output_path=output_path, args=args, speaker_encoder_ap=speaker_encoder_ap, speaker_encoder=speaker_encoder) 50 | 51 | with ThreadPool(concurrency) as pool: 52 | list(tqdm(pool.imap(bound_process_wav, wav_files), total=len(wav_files))) 53 | 54 | 55 | if __name__ == "__main__": 56 | 57 | parser = argparse.ArgumentParser( 58 | description="""Compute embedding vectors for each wav file in a dataset.""", 59 | formatter_class=RawTextHelpFormatter, 60 | ) 61 | parser.add_argument("dataset_path", type=str, help="Path to dataset waves.") 62 | parser.add_argument( 63 | "output_path", type=str, help="path for output speaker/speaker_wavs.npy." 64 | ) 65 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) 66 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 67 | args = parser.parse_args() 68 | dataset_path = args.dataset_path 69 | output_path = args.output_path 70 | thread_count = args.thread_count 71 | # model 72 | args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar") 73 | args.config_path = os.path.join("speaker_pretrain", "config.json") 74 | # config 75 | config_dict = read_json(args.config_path) 76 | 77 | # model 78 | config = SpeakerEncoderConfig(config_dict) 79 | config.from_dict(config_dict) 80 | 81 | speaker_encoder = LSTMSpeakerEncoder( 82 | config.model_params["input_dim"], 83 | config.model_params["proj_dim"], 84 | config.model_params["lstm_dim"], 85 | config.model_params["num_lstm_layers"], 86 | ) 87 | 88 | speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) 89 | 90 | # preprocess 91 | speaker_encoder_ap = AudioProcessor(**config.audio) 92 | # normalize the input audio level and trim silences 93 | speaker_encoder_ap.do_sound_norm = True 94 | speaker_encoder_ap.do_trim_silence = True 95 | 96 | wav_files = get_spk_wavs(dataset_path, output_path) 97 | 98 | if thread_count == 0: 99 | process_num = os.cpu_count() 100 | else: 101 | process_num = thread_count 102 | 103 | extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, process_num) -------------------------------------------------------------------------------- /prepare/preprocess_speaker_ave.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("dataset_speaker", type=str) 11 | parser.add_argument("dataset_singer", type=str) 12 | 13 | data_speaker = parser.parse_args().dataset_speaker 14 | data_singer = parser.parse_args().dataset_singer 15 | 16 | os.makedirs(data_singer, exist_ok=True) 17 | 18 | for speaker in os.listdir(data_speaker): 19 | subfile_num = 0 20 | speaker_ave = 0 21 | 22 | for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"average {speaker}"): 23 | if not file.endswith(".npy"): 24 | continue 25 | source_embed = np.load(os.path.join(data_speaker, speaker, file)) 26 | source_embed = source_embed.astype(np.float32) 27 | speaker_ave = speaker_ave + source_embed 28 | subfile_num = subfile_num + 1 29 | if subfile_num == 0: 30 | continue 31 | speaker_ave = speaker_ave / subfile_num 32 | 33 | np.save(os.path.join(data_singer, f"{speaker}.spk.npy"), 
34 | speaker_ave, allow_pickle=False) 35 | 36 | # rewrite timbre code by average, if similarity is larger than cmp_val 37 | rewrite_timbre_code = False 38 | if not rewrite_timbre_code: 39 | continue 40 | cmp_src = torch.FloatTensor(speaker_ave) 41 | cmp_num = 0 42 | cmp_val = 0.85 43 | for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"rewrite {speaker}"): 44 | if not file.endswith(".npy"): 45 | continue 46 | cmp_tmp = np.load(os.path.join(data_speaker, speaker, file)) 47 | cmp_tmp = cmp_tmp.astype(np.float32) 48 | cmp_tmp = torch.FloatTensor(cmp_tmp) 49 | cmp_cos = torch.cosine_similarity(cmp_src, cmp_tmp, dim=0) 50 | if (cmp_cos > cmp_val): 51 | cmp_num += 1 52 | np.save(os.path.join(data_speaker, speaker, file), 53 | speaker_ave, allow_pickle=False) 54 | print(f"rewrite timbre for {speaker} with :", cmp_num) 55 | -------------------------------------------------------------------------------- /prepare/preprocess_spec.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import argparse 5 | import multiprocessing 6 | from concurrent.futures import ThreadPoolExecutor 7 | from tqdm import tqdm 8 | from vits import spectrogram 9 | from vits import utils 10 | from omegaconf import OmegaConf 11 | 12 | 13 | def compute_spec(hps, filename, specname): 14 | audio, sampling_rate = utils.load_wav_to_torch(filename) 15 | assert sampling_rate == hps.sampling_rate, f"{sampling_rate} is not {hps.sampling_rate}" 16 | audio_norm = audio / hps.max_wav_value 17 | audio_norm = audio_norm.unsqueeze(0) 18 | n_fft = hps.filter_length 19 | sampling_rate = hps.sampling_rate 20 | hop_size = hps.hop_length 21 | win_size = hps.win_length 22 | spec = spectrogram.spectrogram_torch( 23 | audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False) 24 | spec = torch.squeeze(spec, 0) 25 | torch.save(spec, specname) 26 | 27 | 28 | def process_file(file): 29 | if file.endswith(".wav"): 30 | file = file[:-4] 31 | compute_spec(hps.data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt") 32 | 33 | 34 | def process_files_with_thread_pool(wavPath, spks, thread_num): 35 | files = os.listdir(f"./{wavPath}/{spks}") 36 | with ThreadPoolExecutor(max_workers=thread_num) as executor: 37 | list(tqdm(executor.map(process_file, files), total=len(files), desc=f'Processing spec {spks}')) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 43 | parser.add_argument("-s", "--spe", help="spe", dest="spe", required=True) 44 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 45 | 46 | args = parser.parse_args() 47 | print(args.wav) 48 | print(args.spe) 49 | 50 | os.makedirs(args.spe, exist_ok=True) 51 | wavPath = args.wav 52 | spePath = args.spe 53 | hps = OmegaConf.load("./configs/base.yaml") 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{spePath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_thread_pool(wavPath, spks, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_train.py: 
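The script that follows assembles the training filelists: each record it writes is a single pipe-delimited line pointing at the per-utterance artifacts produced by the preprocessing steps above, in the order wave, spec, pitch, hubert, whisper, speaker embedding. A sketch of one such record and how it can be split back apart; the speaker and file names are illustrative placeholders:

```python
# One line of files/train.txt as emitted by preprocess_train.py below:
record = (
    "./data_svc/waves-32k/speaker0/000001.wav"
    "|./data_svc/specs/speaker0/000001.pt"
    "|./data_svc/pitch/speaker0/000001.pit.npy"
    "|./data_svc/hubert/speaker0/000001.vec.npy"
    "|./data_svc/whisper/speaker0/000001.ppg.npy"
    "|./data_svc/speaker/speaker0/000001.spk.npy"
)

wave, spec, pitch, hubert, whisper, spk = record.split("|")
print(wave, spk)
```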
-------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | 5 | def print_error(info): 6 | print(f"\033[31m File isn't existed: {info}\033[0m") 7 | 8 | 9 | IndexBySinger = False 10 | if __name__ == "__main__": 11 | os.makedirs("./files/", exist_ok=True) 12 | 13 | rootPath = "./data_svc/waves-32k/" 14 | all_items = [] 15 | for spks in os.listdir(f"./{rootPath}"): 16 | if not os.path.isdir(f"./{rootPath}/{spks}"): 17 | continue 18 | print(f"./{rootPath}/{spks}") 19 | for file in os.listdir(f"./{rootPath}/{spks}"): 20 | if file.endswith(".wav"): 21 | file = file[:-4] 22 | 23 | if (IndexBySinger == False): 24 | path_spk = f"./data_svc/speaker/{spks}/{file}.spk.npy" 25 | else: 26 | path_spk = f"./data_svc/singer/{spks}.spk.npy" 27 | 28 | path_wave = f"./data_svc/waves-32k/{spks}/{file}.wav" 29 | path_spec = f"./data_svc/specs/{spks}/{file}.pt" 30 | path_pitch = f"./data_svc/pitch/{spks}/{file}.pit.npy" 31 | path_hubert = f"./data_svc/hubert/{spks}/{file}.vec.npy" 32 | path_whisper = f"./data_svc/whisper/{spks}/{file}.ppg.npy" 33 | has_error = 0 34 | if not os.path.isfile(path_spk): 35 | print_error(path_spk) 36 | has_error = 1 37 | if not os.path.isfile(path_wave): 38 | print_error(path_wave) 39 | has_error = 1 40 | if not os.path.isfile(path_spec): 41 | print_error(path_spec) 42 | has_error = 1 43 | if not os.path.isfile(path_pitch): 44 | print_error(path_pitch) 45 | has_error = 1 46 | if not os.path.isfile(path_hubert): 47 | print_error(path_hubert) 48 | has_error = 1 49 | if not os.path.isfile(path_whisper): 50 | print_error(path_whisper) 51 | has_error = 1 52 | if has_error == 0: 53 | all_items.append( 54 | f"{path_wave}|{path_spec}|{path_pitch}|{path_hubert}|{path_whisper}|{path_spk}") 55 | 56 | random.shuffle(all_items) 57 | valids = all_items[:10] 58 | valids.sort() 59 | trains = all_items[10:] 60 | # trains.sort() 61 | fw = open("./files/valid.txt", "w", encoding="utf-8") 62 | for strs in valids: 63 | print(strs, file=fw) 64 | fw.close() 65 | fw = open("./files/train.txt", "w", encoding="utf-8") 66 | for strs in trains: 67 | print(strs, file=fw) 68 | fw.close() 69 | -------------------------------------------------------------------------------- /prepare/preprocess_trim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | from tqdm import tqdm 5 | from pydub import AudioSegment 6 | from pydub.silence import split_on_silence 7 | from pydub import effects 8 | # this file is for VCTK, use after CDC 9 | 10 | 11 | def trim_silence(iWave, oWave): 12 | try: 13 | audio = AudioSegment.from_wav(iWave) 14 | # audio = effects.normalize(audio, 6)# max - 6dB 15 | audio_chunks = split_on_silence( 16 | audio, 17 | min_silence_len=200, 18 | silence_thresh=-45, 19 | keep_silence=200, 20 | ) 21 | for chunk in audio_chunks[1:]: 22 | audio_chunks[0] += chunk 23 | audio_chunks[0].export(oWave, format="wav") 24 | except Exception as e: 25 | print(str(e)) 26 | print(iWave) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("-i", help="input path", dest="inPath", required=True) 32 | parser.add_argument("-o", help="output path", dest="outPath", required=True) 33 | 34 | args = parser.parse_args() 35 | print(args.inPath) 36 | print(args.outPath) 37 | 38 | os.makedirs(args.outPath, exist_ok=True) 39 | rootPath = args.inPath 40 | outPath = args.outPath 41 | 42 | for spks in os.listdir(rootPath): 43 | if 
(os.path.isdir(f"./{rootPath}/{spks}")): 44 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 45 | 46 | files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] 47 | for file in tqdm(files, desc=f'Processing sil {spks}'): 48 | iWave = f"./{rootPath}/{spks}/{file}" 49 | oWave = f"./{outPath}/{spks}/{file}" 50 | trim_silence(iWave, oWave) 51 | -------------------------------------------------------------------------------- /prepare/preprocess_zzz.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | from tqdm import tqdm 4 | from torch.utils.data import DataLoader 5 | from omegaconf import OmegaConf 6 | from vits.data_utils import TextAudioSpeakerSet 7 | from vits.data_utils import TextAudioSpeakerCollate 8 | from vits.data_utils import DistributedBucketSampler 9 | 10 | 11 | hps = OmegaConf.load("./configs/base.yaml") 12 | dataset = TextAudioSpeakerSet("files/valid.txt", hps.data) 13 | 14 | for _ in tqdm(dataset): 15 | pass 16 | 17 | 18 | sampler = DistributedBucketSampler( 19 | dataset, 20 | 4, 21 | [150, 300, 450], 22 | num_replicas=1, 23 | rank=0, 24 | shuffle=True) 25 | collate_fn = TextAudioSpeakerCollate() 26 | loader = DataLoader(dataset, num_workers=0, shuffle=False, pin_memory=True, 27 | collate_fn=collate_fn, batch_sampler=sampler) 28 | 29 | 30 | for _ in tqdm(loader): 31 | pass 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fsspec 2 | pyworld 3 | matplotlib 4 | soundfile 5 | scikit-learn 6 | scipy 7 | tensorboard 8 | transformers 9 | tqdm 10 | librosa 11 | omegaconf 12 | gradio==3.36.1 13 | ruamel.yaml 14 | resampy 15 | numpy==1.24 16 | chardet 17 | faiss-cpu==1.7.4 18 | -------------------------------------------------------------------------------- /speaker/README.md: -------------------------------------------------------------------------------- 1 | ### Speaker Encoder 2 | 3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. 4 | 5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. 6 | 7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 8 | 9 | ![](umap.png) 10 | 11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. 12 | 13 | To run the code, you need to follow the same flow as in TTS. 14 | 15 | - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. 16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` 17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. 
18 | - Watch training on Tensorboard as in TTS 19 | -------------------------------------------------------------------------------- /speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/__init__.py -------------------------------------------------------------------------------- /speaker/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Dict, List 3 | 4 | from .utils.coqpit import MISSING 5 | from .utils.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig 6 | 7 | 8 | @dataclass 9 | class SpeakerEncoderConfig(BaseTrainingConfig): 10 | """Defines parameters for Speaker Encoder model.""" 11 | 12 | model: str = "speaker_encoder" 13 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) 14 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) 15 | # model params 16 | model_params: Dict = field( 17 | default_factory=lambda: { 18 | "model_name": "lstm", 19 | "input_dim": 80, 20 | "proj_dim": 256, 21 | "lstm_dim": 768, 22 | "num_lstm_layers": 3, 23 | "use_lstm_with_projection": True, 24 | } 25 | ) 26 | 27 | audio_augmentation: Dict = field(default_factory=lambda: {}) 28 | 29 | storage: Dict = field( 30 | default_factory=lambda: { 31 | "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage 32 | "storage_size": 15, # the size of the in-memory storage with respect to a single batch 33 | } 34 | ) 35 | 36 | # training params 37 | max_train_step: int = 1000000 # end training when number of training steps reaches this value. 38 | loss: str = "angleproto" 39 | grad_clip: float = 3.0 40 | lr: float = 0.0001 41 | lr_decay: bool = False 42 | warmup_steps: int = 4000 43 | wd: float = 1e-6 44 | 45 | # logging params 46 | tb_model_param_stats: bool = False 47 | steps_plot_stats: int = 10 48 | checkpoint: bool = True 49 | save_step: int = 1000 50 | print_step: int = 20 51 | 52 | # data loader 53 | num_speakers_in_batch: int = MISSING 54 | num_utters_per_speaker: int = MISSING 55 | num_loader_workers: int = MISSING 56 | skip_speakers: bool = False 57 | voice_len: float = 1.6 58 | 59 | def check_values(self): 60 | super().check_values() 61 | c = asdict(self) 62 | assert ( 63 | c["model_params"]["input_dim"] == self.audio.num_mels 64 | ), " [!] model input dimendion must be equal to melspectrogram dimension." 65 | -------------------------------------------------------------------------------- /speaker/infer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import fsspec 4 | import torch 5 | import numpy as np 6 | import argparse 7 | 8 | from argparse import RawTextHelpFormatter 9 | from .models.lstm import LSTMSpeakerEncoder 10 | from .config import SpeakerEncoderConfig 11 | from .utils.audio import AudioProcessor 12 | 13 | 14 | def read_json(json_path): 15 | config_dict = {} 16 | try: 17 | with fsspec.open(json_path, "r", encoding="utf-8") as f: 18 | data = json.load(f) 19 | except json.decoder.JSONDecodeError: 20 | # backwards compat. 
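With the default `model_params` shown above, the speaker encoder maps an 80-bin mel spectrogram to a 256-dimensional embedding through a 3-layer, 768-unit LSTM stack, which is the vector each `.spk.npy` file stores. A minimal sketch using the `LSTMSpeakerEncoder` defined below in `models/lstm.py`; the input shape is a toy value, not taken from the repository:

```python
import torch

from speaker.models.lstm import LSTMSpeakerEncoder

# Encoder built with the default model_params from SpeakerEncoderConfig above.
encoder = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)

mel = torch.randn(1, 250, 80)            # [batch, frames, mel_bins], toy input
embed = encoder.compute_embedding(mel)   # mean over num_eval=10 crops
print(embed.shape)                       # torch.Size([1, 256])
```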
21 | data = read_json_with_comments(json_path) 22 | config_dict.update(data) 23 | return config_dict 24 | 25 | 26 | def read_json_with_comments(json_path): 27 | """for backward compat.""" 28 | # fallback to json 29 | with fsspec.open(json_path, "r", encoding="utf-8") as f: 30 | input_str = f.read() 31 | # handle comments 32 | input_str = re.sub(r"\\\n", "", input_str) 33 | input_str = re.sub(r"//.*\n", "\n", input_str) 34 | data = json.loads(input_str) 35 | return data 36 | 37 | 38 | if __name__ == "__main__": 39 | 40 | parser = argparse.ArgumentParser( 41 | description="""Compute embedding vectors for each wav file in a dataset.""", 42 | formatter_class=RawTextHelpFormatter, 43 | ) 44 | parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") 45 | parser.add_argument( 46 | "config_path", 47 | type=str, 48 | help="Path to model config file.", 49 | ) 50 | 51 | parser.add_argument("-s", "--source", help="input wave", dest="source") 52 | parser.add_argument( 53 | "-t", "--target", help="output 256d speaker embeddimg", dest="target" 54 | ) 55 | 56 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) 57 | parser.add_argument("--eval", type=bool, help="compute eval.", default=True) 58 | 59 | args = parser.parse_args() 60 | source_file = args.source 61 | target_file = args.target 62 | 63 | # config 64 | config_dict = read_json(args.config_path) 65 | # print(config_dict) 66 | 67 | # model 68 | config = SpeakerEncoderConfig(config_dict) 69 | config.from_dict(config_dict) 70 | 71 | speaker_encoder = LSTMSpeakerEncoder( 72 | config.model_params["input_dim"], 73 | config.model_params["proj_dim"], 74 | config.model_params["lstm_dim"], 75 | config.model_params["num_lstm_layers"], 76 | ) 77 | 78 | speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) 79 | 80 | # preprocess 81 | speaker_encoder_ap = AudioProcessor(**config.audio) 82 | # normalize the input audio level and trim silences 83 | speaker_encoder_ap.do_sound_norm = True 84 | speaker_encoder_ap.do_trim_silence = True 85 | 86 | # compute speaker embeddings 87 | 88 | # extract the embedding 89 | waveform = speaker_encoder_ap.load_wav( 90 | source_file, sr=speaker_encoder_ap.sample_rate 91 | ) 92 | spec = speaker_encoder_ap.melspectrogram(waveform) 93 | spec = torch.from_numpy(spec.T) 94 | if args.use_cuda: 95 | spec = spec.cuda() 96 | spec = spec.unsqueeze(0) 97 | embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() 98 | embed = embed.squeeze() 99 | # print(embed) 100 | # print(embed.size) 101 | np.save(target_file, embed, allow_pickle=False) 102 | 103 | 104 | if hasattr(speaker_encoder, 'module'): 105 | state_dict = speaker_encoder.module.state_dict() 106 | else: 107 | state_dict = speaker_encoder.state_dict() 108 | torch.save({'model': state_dict}, "model_small.pth") 109 | -------------------------------------------------------------------------------- /speaker/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/models/__init__.py -------------------------------------------------------------------------------- /speaker/models/lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | from ..utils.io import load_fsspec 6 | 7 | 8 | class LSTMWithProjection(nn.Module): 9 | def 
__init__(self, input_size, hidden_size, proj_size): 10 | super().__init__() 11 | self.input_size = input_size 12 | self.hidden_size = hidden_size 13 | self.proj_size = proj_size 14 | self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) 15 | self.linear = nn.Linear(hidden_size, proj_size, bias=False) 16 | 17 | def forward(self, x): 18 | self.lstm.flatten_parameters() 19 | o, (_, _) = self.lstm(x) 20 | return self.linear(o) 21 | 22 | 23 | class LSTMWithoutProjection(nn.Module): 24 | def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): 25 | super().__init__() 26 | self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) 27 | self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) 28 | self.relu = nn.ReLU() 29 | 30 | def forward(self, x): 31 | _, (hidden, _) = self.lstm(x) 32 | return self.relu(self.linear(hidden[-1])) 33 | 34 | 35 | class LSTMSpeakerEncoder(nn.Module): 36 | def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): 37 | super().__init__() 38 | self.use_lstm_with_projection = use_lstm_with_projection 39 | layers = [] 40 | # choise LSTM layer 41 | if use_lstm_with_projection: 42 | layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) 43 | for _ in range(num_lstm_layers - 1): 44 | layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) 45 | self.layers = nn.Sequential(*layers) 46 | else: 47 | self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) 48 | 49 | self._init_layers() 50 | 51 | def _init_layers(self): 52 | for name, param in self.layers.named_parameters(): 53 | if "bias" in name: 54 | nn.init.constant_(param, 0.0) 55 | elif "weight" in name: 56 | nn.init.xavier_normal_(param) 57 | 58 | def forward(self, x): 59 | # TODO: implement state passing for lstms 60 | d = self.layers(x) 61 | if self.use_lstm_with_projection: 62 | d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) 63 | else: 64 | d = torch.nn.functional.normalize(d, p=2, dim=1) 65 | return d 66 | 67 | @torch.no_grad() 68 | def inference(self, x): 69 | d = self.layers.forward(x) 70 | if self.use_lstm_with_projection: 71 | d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) 72 | else: 73 | d = torch.nn.functional.normalize(d, p=2, dim=1) 74 | return d 75 | 76 | def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): 77 | """ 78 | Generate embeddings for a batch of utterances 79 | x: 1xTxD 80 | """ 81 | max_len = x.shape[1] 82 | 83 | if max_len < num_frames: 84 | num_frames = max_len 85 | 86 | offsets = np.linspace(0, max_len - num_frames, num=num_eval) 87 | 88 | frames_batch = [] 89 | for offset in offsets: 90 | offset = int(offset) 91 | end_offset = int(offset + num_frames) 92 | frames = x[:, offset:end_offset] 93 | frames_batch.append(frames) 94 | 95 | frames_batch = torch.cat(frames_batch, dim=0) 96 | embeddings = self.inference(frames_batch) 97 | 98 | if return_mean: 99 | embeddings = torch.mean(embeddings, dim=0, keepdim=True) 100 | 101 | return embeddings 102 | 103 | def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): 104 | """ 105 | Generate embeddings for a batch of utterances 106 | x: BxTxD 107 | """ 108 | num_overlap = num_frames * overlap 109 | max_len = x.shape[1] 110 | embed = None 111 | num_iters = seq_lens / (num_frames - num_overlap) 112 | cur_iter = 0 113 | for offset in range(0, max_len, num_frames - num_overlap): 114 | cur_iter += 1 115 | end_offset = min(x.shape[1], 
offset + num_frames) 116 | frames = x[:, offset:end_offset] 117 | if embed is None: 118 | embed = self.inference(frames) 119 | else: 120 | embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) 121 | return embed / num_iters 122 | 123 | # pylint: disable=unused-argument, redefined-builtin 124 | def load_checkpoint(self, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): 125 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) 126 | self.load_state_dict(state["model"]) 127 | if use_cuda: 128 | self.cuda() 129 | if eval: 130 | self.eval() 131 | assert not self.training 132 | -------------------------------------------------------------------------------- /speaker/umap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/umap.png -------------------------------------------------------------------------------- /speaker/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/utils/__init__.py -------------------------------------------------------------------------------- /speaker_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | best_model.pth.tar 4 | 5 | config.json 6 | -------------------------------------------------------------------------------- /speaker_pretrain/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "lstm", 3 | "run_name": "mueller91", 4 | "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", 5 | "audio":{ 6 | // Audio processing parameters 7 | "num_mels": 80, // size of the mel spec frame. 8 | "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. 9 | "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. 10 | "win_length": 1024, // stft window length in ms. 11 | "hop_length": 256, // stft window hop-lengh in ms. 12 | "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. 13 | "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. 14 | "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. 15 | "min_level_db": -100, // normalization range 16 | "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 17 | "power": 1.5, // value to sharpen wav signals after GL algorithm. 18 | "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 19 | // Normalization parameters 20 | "signal_norm": true, // normalize the spec values in range [0, 1] 21 | "symmetric_norm": true, // move normalization to range [-1, 1] 22 | "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] 23 | "clip_norm": true, // clip normalized values into the range. 24 | "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! 25 | "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! 26 | "do_trim_silence": true, // enable trimming of slience of audio as you load it. 
LJspeech (false), TWEB (false), Nancy (true) 27 | "trim_db": 60 // threshold for timming silence. Set this according to your dataset. 28 | }, 29 | "reinit_layers": [], 30 | "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) 31 | "grad_clip": 3.0, // upper limit for gradients for clipping. 32 | "epochs": 1000, // total number of epochs to train. 33 | "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. 34 | "lr_decay": false, // if true, Noam learning rate decaying is applied through training. 35 | "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" 36 | "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 37 | "steps_plot_stats": 10, // number of steps to plot embeddings. 38 | "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 39 | "voice_len": 2.0, // size of the voice 40 | "num_utters_per_speaker": 10, // 41 | "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. 42 | "wd": 0.000001, // Weight decay weight. 43 | "checkpoint": true, // If true, it saves checkpoints per "save_step" 44 | "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. 45 | "print_step": 20, // Number of steps to log traning on console. 46 | "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 47 | "model": { 48 | "input_dim": 80, 49 | "proj_dim": 256, 50 | "lstm_dim": 768, 51 | "num_lstm_layers": 3, 52 | "use_lstm_with_projection": true 53 | }, 54 | "storage": { 55 | "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage 56 | "storage_size": 25, // the size of the in-memory storage with respect to a single batch 57 | "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness 58 | }, 59 | "datasets": 60 | [ 61 | { 62 | "name": "vctk_slim", 63 | "path": "../../../audio-datasets/en/VCTK-Corpus/", 64 | "meta_file_train": null, 65 | "meta_file_val": null 66 | }, 67 | { 68 | "name": "libri_tts", 69 | "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", 70 | "meta_file_train": null, 71 | "meta_file_val": null 72 | }, 73 | { 74 | "name": "libri_tts", 75 | "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", 76 | "meta_file_train": null, 77 | "meta_file_val": null 78 | }, 79 | { 80 | "name": "libri_tts", 81 | "path": "../../../audio-datasets/en/LibriTTS/train-other-500", 82 | "meta_file_train": null, 83 | "meta_file_val": null 84 | }, 85 | { 86 | "name": "voxceleb1", 87 | "path": "../../../audio-datasets/en/voxceleb1/", 88 | "meta_file_train": null, 89 | "meta_file_val": null 90 | }, 91 | { 92 | "name": "voxceleb2", 93 | "path": "../../../audio-datasets/en/voxceleb2/", 94 | "meta_file_train": null, 95 | "meta_file_val": null 96 | }, 97 | { 98 | "name": "common_voice", 99 | "path": "../../../audio-datasets/en/MozillaCommonVoice", 100 | "meta_file_train": "train.tsv", 101 | "meta_file_val": "test.tsv" 102 | } 103 | ] 104 | } -------------------------------------------------------------------------------- /svc_eva.py: -------------------------------------------------------------------------------- 1 | import os 2 | 
import numpy as np 3 | 4 | # average -> ave -> eva :haha 5 | 6 | eva_conf = { 7 | './configs/singers/singer0022.npy': 0, 8 | './configs/singers/singer0030.npy': 0, 9 | './configs/singers/singer0047.npy': 0.5, 10 | './configs/singers/singer0051.npy': 0.5, 11 | } 12 | 13 | if __name__ == "__main__": 14 | 15 | eva = np.zeros(256) 16 | for k, v in eva_conf.items(): 17 | assert os.path.isfile(k), k 18 | spk = np.load(k) 19 | eva = eva + spk * v 20 | np.save("eva.spk.npy", eva, allow_pickle=False) 21 | -------------------------------------------------------------------------------- /svc_export.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import argparse 5 | from omegaconf import OmegaConf 6 | 7 | from vits.models import SynthesizerInfer 8 | 9 | 10 | def load_model(checkpoint_path, model): 11 | assert os.path.isfile(checkpoint_path) 12 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 13 | saved_state_dict = checkpoint_dict["model_g"] 14 | if hasattr(model, "module"): 15 | state_dict = model.module.state_dict() 16 | else: 17 | state_dict = model.state_dict() 18 | new_state_dict = {} 19 | for k, v in state_dict.items(): 20 | try: 21 | new_state_dict[k] = saved_state_dict[k] 22 | except: 23 | new_state_dict[k] = v 24 | if hasattr(model, "module"): 25 | model.module.load_state_dict(new_state_dict) 26 | else: 27 | model.load_state_dict(new_state_dict) 28 | return model 29 | 30 | 31 | def save_pretrain(checkpoint_path, save_path): 32 | assert os.path.isfile(checkpoint_path) 33 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 34 | torch.save({ 35 | 'model_g': checkpoint_dict['model_g'], 36 | 'model_d': checkpoint_dict['model_d'], 37 | }, save_path) 38 | 39 | 40 | def save_model(model, checkpoint_path): 41 | if hasattr(model, 'module'): 42 | state_dict = model.module.state_dict() 43 | else: 44 | state_dict = model.state_dict() 45 | torch.save({'model_g': state_dict}, checkpoint_path) 46 | 47 | 48 | def main(args): 49 | hp = OmegaConf.load(args.config) 50 | model = SynthesizerInfer( 51 | hp.data.filter_length // 2 + 1, 52 | hp.data.segment_size // hp.data.hop_length, 53 | hp) 54 | 55 | # save_pretrain(args.checkpoint_path, "sovits5.0.pretrain.pth") 56 | load_model(args.checkpoint_path, model) 57 | save_model(model, "sovits5.0.pth") 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('-c', '--config', type=str, required=True, 63 | help="yaml file for config. 
will use hp_str from checkpoint if not given.") 64 | parser.add_argument('-p', '--checkpoint_path', type=str, required=True, 65 | help="path of checkpoint pt file for evaluation") 66 | args = parser.parse_args() 67 | 68 | main(args) 69 | -------------------------------------------------------------------------------- /svc_inference_batch.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import tqdm 4 | import torch 5 | import argparse 6 | 7 | from whisper.inference import load_model, pred_ppg 8 | 9 | # How to use 10 | # python svc_inference_batch.py --config configs/base.yaml --model vits_pretrain/sovits5.0.pth --wave test_waves/ --spk configs/singers/singer0047.npy 11 | 12 | out_path = "./_svc_out" 13 | os.makedirs(out_path, exist_ok=True) 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--config', type=str, required=True, 18 | help="yaml file for config.") 19 | parser.add_argument('--model', type=str, required=True, 20 | help="path of model for evaluation") 21 | parser.add_argument('--wave', type=str, required=True, 22 | help="Path of raw audio.") 23 | parser.add_argument('--spk', type=str, required=True, 24 | help="Path of speaker.") 25 | parser.add_argument('--shift', type=int, default=0, 26 | help="Pitch shift key.") 27 | args = parser.parse_args() 28 | wave_path = args.wave 29 | assert os.path.isdir(wave_path), f"{wave_path} is not folder" 30 | waves = [file for file in os.listdir(wave_path) if file.endswith(".wav")] 31 | for file in waves: 32 | print(file) 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device=device) 35 | for file in tqdm.tqdm(waves, desc="whisper"): 36 | pred_ppg(whisper, f"{wave_path}/{file}", f"{out_path}/{file}.ppg.npy", device=device) 37 | del whisper 38 | 39 | for file in tqdm.tqdm(waves, desc="svc"): 40 | os.system( 41 | f"python svc_inference.py --config {args.config} --model {args.model} --wave {wave_path}/{file} --ppg {out_path}/{file}.ppg.npy --spk {args.spk} --shift {args.shift}") 42 | os.system(f"mv svc_out.wav {out_path}/{file}") 43 | os.system(f"rm {out_path}/{file}.ppg.npy") 44 | -------------------------------------------------------------------------------- /svc_inference_post.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | from scipy.io.wavfile import write 8 | from vad.utils import init_jit_model, get_speech_timestamps 9 | 10 | 11 | def load_audio(file: str, sr: int = 16000): 12 | x, sr = librosa.load(file, sr=sr) 13 | return x 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--ref', type=str, required=True, 20 | help="Path of ref audio.") 21 | parser.add_argument('--svc', type=str, required=True, 22 | help="Path of svc audio.") 23 | parser.add_argument('--out', type=str, required=True, 24 | help="Path of out audio.") 25 | 26 | args = parser.parse_args() 27 | print("svc in wave :", args.ref) 28 | print("svc out wave :", args.svc) 29 | print("svc post wave :", args.out) 30 | 31 | model = init_jit_model(os.path.join('vad/assets', 'silero_vad.jit')) 32 | model.eval() 33 | 34 | ref_wave = load_audio(args.ref, 
sr=16000) 35 | tmp_wave = torch.from_numpy(ref_wave).squeeze(0) 36 | tag_wave = get_speech_timestamps( 37 | tmp_wave, model, threshold=0.2, sampling_rate=16000) 38 | 39 | ref_wave[:] = 0 40 | for tag in tag_wave: 41 | ref_wave[tag["start"]:tag["end"]] = 1 42 | 43 | ref_wave = np.repeat(ref_wave, 2, -1) 44 | svc_wave = load_audio(args.svc, sr=32000) 45 | 46 | min_len = min(len(ref_wave), len(svc_wave)) 47 | ref_wave = ref_wave[:min_len] 48 | svc_wave = svc_wave[:min_len] 49 | svc_wave[ref_wave == 0] = 0 50 | 51 | write(args.out, 32000, svc_wave) 52 | -------------------------------------------------------------------------------- /svc_inference_shift.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import argparse 5 | import numpy as np 6 | 7 | from omegaconf import OmegaConf 8 | from scipy.io.wavfile import write 9 | from pitch import load_csv_pitch 10 | from vits.models import SynthesizerInfer 11 | from svc_inference import load_svc_model, svc_infer 12 | 13 | 14 | def main(args): 15 | if (args.ppg == None): 16 | args.ppg = "svc_tmp.ppg.npy" 17 | print( 18 | f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}") 19 | os.system(f"python whisper/inference.py -w {args.wave} -p {args.ppg}") 20 | 21 | if (args.vec == None): 22 | args.vec = "svc_tmp.vec.npy" 23 | print( 24 | f"Auto run : python hubert/inference.py -w {args.wave} -v {args.vec}") 25 | os.system(f"python hubert/inference.py -w {args.wave} -v {args.vec}") 26 | 27 | if (args.pit == None): 28 | args.pit = "svc_tmp.pit.csv" 29 | print( 30 | f"Auto run : python pitch/inference.py -w {args.wave} -p {args.pit}") 31 | os.system(f"python pitch/inference.py -w {args.wave} -p {args.pit}") 32 | 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | hp = OmegaConf.load(args.config) 35 | model = SynthesizerInfer( 36 | hp.data.filter_length // 2 + 1, 37 | hp.data.segment_size // hp.data.hop_length, 38 | hp) 39 | load_svc_model(args.model, model) 40 | model.eval() 41 | model.to(device) 42 | 43 | spk = np.load(args.spk) 44 | spk = torch.FloatTensor(spk) 45 | 46 | ppg = np.load(args.ppg) 47 | ppg = np.repeat(ppg, 2, 0) 48 | ppg = torch.FloatTensor(ppg) 49 | 50 | vec = np.load(args.vec) 51 | vec = np.repeat(vec, 2, 0) 52 | vec = torch.FloatTensor(vec) 53 | 54 | pit = load_csv_pitch(args.pit) 55 | 56 | shift_l = args.shift_l 57 | shift_r = args.shift_r 58 | 59 | print(f"pitch shift: [{shift_l}, {shift_r}]") 60 | 61 | for shift in range(shift_l, shift_r + 1): 62 | print(shift) 63 | tmp = np.array(pit) 64 | tmp = tmp * (2 ** (shift / 12)) 65 | tmp = torch.FloatTensor(tmp) 66 | 67 | out_audio = svc_infer(model, spk, tmp, ppg, vec, hp, device) 68 | write(os.path.join("./_svc_out", f"svc_out_{shift}.wav"), 69 | hp.data.sampling_rate, out_audio) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--config', type=str, required=True, 75 | help="yaml file for config.") 76 | parser.add_argument('--model', type=str, required=True, 77 | help="path of model for evaluation") 78 | parser.add_argument('--wave', type=str, required=True, 79 | help="Path of raw audio.") 80 | parser.add_argument('--spk', type=str, required=True, 81 | help="Path of speaker.") 82 | parser.add_argument('--ppg', type=str, 83 | help="Path of content vector.") 84 | parser.add_argument('--vec', type=str, 85 | help="Path of hubert vector.") 86 | 
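# Note: --shift_l/--shift_r below are semitone offsets; main() turns each step
# into a frequency ratio of 2 ** (shift / 12) before scaling the F0 curve, so
# +12 doubles the pitch and -12 halves it. Quick check of the ratios:
#   >>> [round(2 ** (s / 12), 3) for s in (-12, 0, 7, 12)]
#   [0.5, 1.0, 1.498, 2.0]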
parser.add_argument('--pit', type=str, 87 | help="Path of pitch csv file.") 88 | parser.add_argument('--shift_l', type=int, default=0, 89 | help="Pitch shift key for [shift_l, shift_r]") 90 | parser.add_argument('--shift_r', type=int, default=0, 91 | help="Pitch shift key for [shift_l, shift_r]") 92 | args = parser.parse_args() 93 | 94 | assert args.shift_l >= -12 95 | assert args.shift_r >= -12 96 | assert args.shift_l <= 12 97 | assert args.shift_r <= 12 98 | assert args.shift_l <= args.shift_r 99 | 100 | os.makedirs("./_svc_out", exist_ok=True) 101 | 102 | main(args) 103 | -------------------------------------------------------------------------------- /svc_merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import collections 5 | 6 | 7 | def load_model(checkpoint_path): 8 | assert os.path.isfile(checkpoint_path) 9 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 10 | saved_state_dict = checkpoint_dict["model_g"] 11 | return saved_state_dict 12 | 13 | 14 | def save_model(state_dict, checkpoint_path): 15 | torch.save({'model_g': state_dict}, checkpoint_path) 16 | 17 | 18 | def average_model(model_list): 19 | model_keys = list(model_list[0].keys()) 20 | model_average = collections.OrderedDict() 21 | for key in model_keys: 22 | key_sum = 0 23 | for i in range(len(model_list)): 24 | key_sum = (key_sum + model_list[i][key]) 25 | model_average[key] = torch.div(key_sum, float(len(model_list))) 26 | return model_average 27 | # ss_list = [] 28 | # ss_list.append(s1) 29 | # ss_list.append(s2) 30 | # ss_merge = average_model(ss_list) 31 | 32 | 33 | def merge_model(model1, model2, rate): 34 | model_keys = model1.keys() 35 | model_merge = collections.OrderedDict() 36 | for key in model_keys: 37 | key_merge = rate * model1[key] + (1 - rate) * model2[key] 38 | model_merge[key] = key_merge 39 | return model_merge 40 | 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('-m1', '--model1', type=str, required=True) 45 | parser.add_argument('-m2', '--model2', type=str, required=True) 46 | parser.add_argument('-r1', '--rate', type=float, required=True) 47 | args = parser.parse_args() 48 | 49 | print(args.model1) 50 | print(args.model2) 51 | print(args.rate) 52 | 53 | assert args.rate > 0 and args.rate < 1, f"{args.rate} should be in range (0, 1)" 54 | s1 = load_model(args.model1) 55 | s2 = load_model(args.model2) 56 | 57 | merge = merge_model(s1, s2, args.rate) 58 | save_model(merge, "sovits5.0_merge.pth") 59 | -------------------------------------------------------------------------------- /svc_preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import subprocess 5 | 6 | assert torch.cuda.is_available(), "\033[31m You need GPU to Train! 
\033[0m" 7 | print("CPU Count is :", os.cpu_count()) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-t", type=int, default=0, help="thread count") 11 | args = parser.parse_args() 12 | 13 | 14 | commands = [ 15 | "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0", 16 | "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0", 17 | "python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch", 18 | "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper", 19 | "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert", 20 | "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0", 21 | "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer", 22 | "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0", 23 | "python prepare/preprocess_train.py", 24 | "python prepare/preprocess_zzz.py", 25 | ] 26 | 27 | 28 | for command in commands: 29 | print(f"Command: {command}") 30 | 31 | process = subprocess.Popen(command, shell=True) 32 | outcode = process.wait() 33 | if (outcode): 34 | break 35 | -------------------------------------------------------------------------------- /svc_train_retrieval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import multiprocessing 4 | from functools import partial 5 | from pathlib import Path 6 | 7 | import faiss 8 | 9 | from feature_retrieval import ( 10 | train_index, 11 | FaissIVFFlatTrainableFeatureIndexBuilder, 12 | OnConditionFeatureTransform, 13 | MinibatchKmeansFeatureTransform, 14 | DummyFeatureTransform, 15 | ) 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def get_speaker_list(base_path: Path): 21 | speakers_path = base_path / "waves-16k" 22 | if not speakers_path.exists(): 23 | raise FileNotFoundError(f"path {speakers_path} does not exists") 24 | return [speaker_dir.name for speaker_dir in speakers_path.iterdir() if speaker_dir.is_dir()] 25 | 26 | 27 | def create_indexes_path(base_path: Path) -> Path: 28 | indexes_path = base_path / "indexes" 29 | logger.info("create indexes folder %s", indexes_path) 30 | indexes_path.mkdir(exist_ok=True) 31 | return indexes_path 32 | 33 | 34 | def create_index( 35 | feature_name: str, 36 | prefix: str, 37 | speaker: str, 38 | base_path: Path, 39 | indexes_path: Path, 40 | compress_features_after: int, 41 | n_clusters: int, 42 | n_parallel: int, 43 | train_batch_size: int = 8192, 44 | ) -> None: 45 | features_path = base_path / feature_name / speaker 46 | if not features_path.exists(): 47 | raise ValueError(f'features not found by path {features_path}') 48 | index_path = indexes_path / speaker 49 | index_path.mkdir(exist_ok=True) 50 | index_filename = f"{prefix}{feature_name}.index" 51 | index_filepath = index_path / index_filename 52 | logger.debug('index will be save to %s', index_filepath) 53 | 54 | builder = FaissIVFFlatTrainableFeatureIndexBuilder(train_batch_size, distance=faiss.METRIC_L2) 55 | transform = OnConditionFeatureTransform( 56 | condition=lambda matrix: matrix.shape[0] > compress_features_after, 57 | on_condition=MinibatchKmeansFeatureTransform(n_clusters, n_parallel), 58 | otherwise=DummyFeatureTransform() 59 | ) 60 | train_index(features_path, index_filepath, builder, transform) 61 | 62 | 63 | def main() -> None: 64 | arg_parser = argparse.ArgumentParser("crate faiss indexes for feature 
retrieval") 65 | arg_parser.add_argument("--debug", action="store_true") 66 | arg_parser.add_argument("--prefix", default='', help="add prefix to index filename") 67 | arg_parser.add_argument('--speakers', nargs="+", 68 | help="speaker names to create an index. By default all speakers are from data_svc") 69 | arg_parser.add_argument("--compress-features-after", type=int, default=200_000, 70 | help="If the number of features is greater than the value compress " 71 | "feature vectors using MiniBatchKMeans.") 72 | arg_parser.add_argument("--n-clusters", type=int, default=10_000, 73 | help="Number of centroids to which features will be compressed") 74 | 75 | arg_parser.add_argument("--n-parallel", type=int, default=multiprocessing.cpu_count()-1, 76 | help="Nuber of parallel job of MinibatchKmeans. Default is cpus-1") 77 | args = arg_parser.parse_args() 78 | 79 | if args.debug: 80 | logging.basicConfig(level=logging.DEBUG) 81 | else: 82 | logging.basicConfig(level=logging.INFO) 83 | 84 | base_path = Path(".").absolute() / "data_svc" 85 | if args.speakers: 86 | speakers = args.speakers 87 | else: 88 | speakers = get_speaker_list(base_path) 89 | 90 | logger.info("got %s speakers: %s", len(speakers), speakers) 91 | indexes_path = create_indexes_path(base_path) 92 | 93 | create_index_func = partial( 94 | create_index, 95 | prefix=args.prefix, 96 | base_path=base_path, 97 | indexes_path=indexes_path, 98 | compress_features_after=args.compress_features_after, 99 | n_clusters=args.n_clusters, 100 | n_parallel=args.n_parallel, 101 | ) 102 | 103 | for speaker in speakers: 104 | logger.info("create hubert index for speaker %s", speaker) 105 | create_index_func(feature_name="hubert", speaker=speaker) 106 | 107 | logger.info("create whisper index for speaker %s", speaker) 108 | create_index_func(feature_name="whisper", speaker=speaker) 109 | 110 | logger.info("done!") 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /svc_trainer.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import argparse 4 | import torch 5 | import torch.multiprocessing as mp 6 | from omegaconf import OmegaConf 7 | 8 | from vits_extend.train import train 9 | 10 | torch.backends.cudnn.benchmark = True 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-c', '--config', type=str, required=True, 16 | help="yaml file for configuration") 17 | parser.add_argument('-p', '--checkpoint_path', type=str, default=None, 18 | help="path of checkpoint pt file to resume training") 19 | parser.add_argument('-n', '--name', type=str, required=True, 20 | help="name of the model for logging, saving checkpoint") 21 | args = parser.parse_args() 22 | 23 | hp = OmegaConf.load(args.config) 24 | with open(args.config, 'r') as f: 25 | hp_str = ''.join(f.readlines()) 26 | 27 | assert hp.data.hop_length == 320, \ 28 | 'hp.data.hop_length must be equal to 320, got %d' % hp.data.hop_length 29 | 30 | args.num_gpus = 0 31 | torch.manual_seed(hp.train.seed) 32 | if torch.cuda.is_available(): 33 | torch.cuda.manual_seed(hp.train.seed) 34 | args.num_gpus = torch.cuda.device_count() 35 | print('Batch size per GPU :', hp.train.batch_size) 36 | 37 | if args.num_gpus > 1: 38 | mp.spawn(train, nprocs=args.num_gpus, 39 | args=(args, args.checkpoint_path, hp, hp_str,)) 40 | else: 41 | train(0, args, 
args.checkpoint_path, hp, hp_str) 42 | else: 43 | print('No GPU find!') 44 | -------------------------------------------------------------------------------- /test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/test.wav -------------------------------------------------------------------------------- /vad/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-present Silero Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /vad/assets/silero_vad.jit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vad/assets/silero_vad.jit -------------------------------------------------------------------------------- /vits/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jaehyeon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vits/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vits/__init__.py -------------------------------------------------------------------------------- /vits/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, total_logdet, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | total_logdet: [b] - total_logdet summed over each batch 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | # add total_logdet (Negative LL) 59 | kl -= torch.sum(total_logdet) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def kl_loss_back(z_p, logs_q, m_p, logs_p, z_mask): 65 | """ 66 | z_p, logs_q: [b, h, t_t] 67 | m_p, logs_p: [b, h, t_t] 68 | """ 69 | z_p = z_p.float() 70 | logs_q = logs_q.float() 71 | m_p = m_p.float() 72 | logs_p = logs_p.float() 73 | z_mask = z_mask.float() 74 | 75 | kl = logs_p - logs_q - 0.5 76 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 77 | kl = torch.sum(kl * z_mask) 78 | l = kl / torch.sum(z_mask) 79 | return l 80 | -------------------------------------------------------------------------------- /vits/modules_grl.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0 2 | # Unsupervised Domain Adaptation by Backpropagation 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from torch.autograd import Function 8 | from torch.nn.utils import weight_norm 9 | 10 | 11 | class GradientReversalFunction(Function): 12 | @staticmethod 13 | def forward(ctx, x, lambda_): 14 | ctx.lambda_ = lambda_ 15 | return x.clone() 16 | 17 | @staticmethod 18 | def backward(ctx, grads): 19 | lambda_ = ctx.lambda_ 20 | lambda_ = grads.new_tensor(lambda_) 21 | dx = -lambda_ * grads 22 | return dx, None 23 | 24 | 25 | class GradientReversal(torch.nn.Module): 26 | ''' Gradient Reversal Layer 27 | Y. Ganin, V. Lempitsky, 28 | "Unsupervised Domain Adaptation by Backpropagation", 29 | in ICML, 2015. 
30 | Forward pass is the identity function 31 | In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradient are reversed) 32 | ''' 33 | 34 | def __init__(self, lambda_reversal=1): 35 | super(GradientReversal, self).__init__() 36 | self.lambda_ = lambda_reversal 37 | 38 | def forward(self, x): 39 | return GradientReversalFunction.apply(x, self.lambda_) 40 | 41 | 42 | class SpeakerClassifier(nn.Module): 43 | 44 | def __init__(self, embed_dim, spk_dim): 45 | super(SpeakerClassifier, self).__init__() 46 | self.classifier = nn.Sequential( 47 | GradientReversal(lambda_reversal=1), 48 | weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), 49 | nn.ReLU(), 50 | weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), 51 | nn.ReLU(), 52 | weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2)) 53 | ) 54 | 55 | def forward(self, x): 56 | ''' Forward function of Speaker Classifier: 57 | x = (B, embed_dim, len) 58 | ''' 59 | # pass through classifier 60 | outputs = self.classifier(x) # (B, nb_speakers) 61 | outputs = torch.mean(outputs, dim=-1) 62 | return outputs 63 | -------------------------------------------------------------------------------- /vits/spectrogram.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | 4 | from librosa.filters import mel as librosa_mel_fn 5 | 6 | MAX_WAV_VALUE = 32768.0 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | """ 11 | PARAMS 12 | ------ 13 | C: compression factor 14 | """ 15 | return torch.log(torch.clamp(x, min=clip_val) * C) 16 | 17 | 18 | def dynamic_range_decompression_torch(x, C=1): 19 | """ 20 | PARAMS 21 | ------ 22 | C: compression factor used to compress 23 | """ 24 | return torch.exp(x) / C 25 | 26 | 27 | def spectral_normalize_torch(magnitudes): 28 | output = dynamic_range_compression_torch(magnitudes) 29 | return output 30 | 31 | 32 | def spectral_de_normalize_torch(magnitudes): 33 | output = dynamic_range_decompression_torch(magnitudes) 34 | return output 35 | 36 | 37 | mel_basis = {} 38 | hann_window = {} 39 | 40 | 41 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 42 | if torch.min(y) < -1.0: 43 | print("min value is ", torch.min(y)) 44 | if torch.max(y) > 1.0: 45 | print("max value is ", torch.max(y)) 46 | 47 | global hann_window 48 | dtype_device = str(y.dtype) + "_" + str(y.device) 49 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 50 | if wnsize_dtype_device not in hann_window: 51 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 52 | dtype=y.dtype, device=y.device 53 | ) 54 | 55 | y = torch.nn.functional.pad( 56 | y.unsqueeze(1), 57 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 58 | mode="reflect", 59 | ) 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft( 63 | y, 64 | n_fft, 65 | hop_length=hop_size, 66 | win_length=win_size, 67 | window=hann_window[wnsize_dtype_device], 68 | center=center, 69 | pad_mode="reflect", 70 | normalized=False, 71 | onesided=True, 72 | return_complex=False, 73 | ) 74 | 75 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 76 | return spec 77 | 78 | 79 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 80 | global mel_basis 81 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 82 | fmax_dtype_device = str(fmax) + "_" + dtype_device 83 | if fmax_dtype_device not in mel_basis: 84 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 
n_mels=num_mels, fmin=fmin, fmax=fmax) 85 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 86 | dtype=spec.dtype, device=spec.device 87 | ) 88 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 89 | spec = spectral_normalize_torch(spec) 90 | return spec 91 | 92 | 93 | def mel_spectrogram_torch( 94 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 95 | ): 96 | if torch.min(y) < -1.0: 97 | print("min value is ", torch.min(y)) 98 | if torch.max(y) > 1.0: 99 | print("max value is ", torch.max(y)) 100 | 101 | global mel_basis, hann_window 102 | dtype_device = str(y.dtype) + "_" + str(y.device) 103 | fmax_dtype_device = str(fmax) + "_" + dtype_device 104 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 105 | if fmax_dtype_device not in mel_basis: 106 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 107 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 108 | dtype=y.dtype, device=y.device 109 | ) 110 | if wnsize_dtype_device not in hann_window: 111 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 112 | dtype=y.dtype, device=y.device 113 | ) 114 | 115 | y = torch.nn.functional.pad( 116 | y.unsqueeze(1), 117 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 118 | mode="reflect", 119 | ) 120 | y = y.squeeze(1) 121 | 122 | spec = torch.stft( 123 | y, 124 | n_fft, 125 | hop_length=hop_size, 126 | win_length=win_size, 127 | window=hann_window[wnsize_dtype_device], 128 | center=center, 129 | pad_mode="reflect", 130 | normalized=False, 131 | onesided=True, 132 | return_complex=False, 133 | ) 134 | 135 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 136 | 137 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 138 | spec = spectral_normalize_torch(spec) 139 | 140 | return spec 141 | -------------------------------------------------------------------------------- /vits/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile import read 4 | 5 | MATPLOTLIB_FLAG = False 6 | 7 | 8 | def load_wav_to_torch(full_path): 9 | sampling_rate, data = read(full_path) 10 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 11 | 12 | 13 | f0_bin = 256 14 | f0_max = 1100.0 15 | f0_min = 50.0 16 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 17 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 18 | 19 | 20 | def f0_to_coarse(f0): 21 | is_torch = isinstance(f0, torch.Tensor) 22 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * \ 23 | np.log(1 + f0 / 700) 24 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \ 25 | (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = ( 30 | f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 31 | assert f0_coarse.max() <= 255 and f0_coarse.min( 32 | ) >= 1, (f0_coarse.max(), f0_coarse.min()) 33 | return f0_coarse 34 | -------------------------------------------------------------------------------- /vits_decoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /vits_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .alias.act import SnakeAlias -------------------------------------------------------------------------------- /vits_decoder/alias/LICENSE-snake.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vits_decoder/alias/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /vits_decoder/alias/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
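# Overview: the activations below follow the Snake family. SnakeBeta computes
# x + (1/beta) * sin(alpha * x)**2 per channel, and SnakeAlias wraps it in 2x
# up/downsampling to reduce aliasing of the periodic nonlinearity. A scalar
# sanity check of the formula with illustrative values alpha = beta = 1:
#   >>> import math
#   >>> round(0.5 + (1.0 / 1.0) * math.sin(1.0 * 0.5) ** 2, 4)
#   0.7298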
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from torch import sin, pow 9 | from torch.nn import Parameter 10 | from .resample import UpSample1d, DownSample1d 11 | 12 | 13 | class Activation1d(nn.Module): 14 | def __init__(self, 15 | activation, 16 | up_ratio: int = 2, 17 | down_ratio: int = 2, 18 | up_kernel_size: int = 12, 19 | down_kernel_size: int = 12): 20 | super().__init__() 21 | self.up_ratio = up_ratio 22 | self.down_ratio = down_ratio 23 | self.act = activation 24 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 25 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 26 | 27 | # x: [B,C,T] 28 | def forward(self, x): 29 | x = self.upsample(x) 30 | x = self.act(x) 31 | x = self.downsample(x) 32 | 33 | return x 34 | 35 | 36 | class SnakeBeta(nn.Module): 37 | ''' 38 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 39 | Shape: 40 | - Input: (B, C, T) 41 | - Output: (B, C, T), same shape as the input 42 | Parameters: 43 | - alpha - trainable parameter that controls frequency 44 | - beta - trainable parameter that controls magnitude 45 | References: 46 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 47 | https://arxiv.org/abs/2006.08195 48 | Examples: 49 | >>> a1 = snakebeta(256) 50 | >>> x = torch.randn(256) 51 | >>> x = a1(x) 52 | ''' 53 | 54 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 55 | ''' 56 | Initialization. 57 | INPUT: 58 | - in_features: shape of the input 59 | - alpha - trainable parameter that controls frequency 60 | - beta - trainable parameter that controls magnitude 61 | alpha is initialized to 1 by default, higher values = higher-frequency. 62 | beta is initialized to 1 by default, higher values = higher-magnitude. 63 | alpha will be trained along with the rest of your model. 64 | ''' 65 | super(SnakeBeta, self).__init__() 66 | self.in_features = in_features 67 | # initialize alpha 68 | self.alpha_logscale = alpha_logscale 69 | if self.alpha_logscale: # log scale alphas initialized to zeros 70 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 71 | self.beta = Parameter(torch.zeros(in_features) * alpha) 72 | else: # linear scale alphas initialized to ones 73 | self.alpha = Parameter(torch.ones(in_features) * alpha) 74 | self.beta = Parameter(torch.ones(in_features) * alpha) 75 | self.alpha.requires_grad = alpha_trainable 76 | self.beta.requires_grad = alpha_trainable 77 | self.no_div_by_zero = 0.000000001 78 | 79 | def forward(self, x): 80 | ''' 81 | Forward pass of the function. 82 | Applies the function to the input elementwise. 83 | SnakeBeta = x + 1/b * sin^2 (xa) 84 | ''' 85 | alpha = self.alpha.unsqueeze( 86 | 0).unsqueeze(-1) # line up with x to [B, C, T] 87 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 88 | if self.alpha_logscale: 89 | alpha = torch.exp(alpha) 90 | beta = torch.exp(beta) 91 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 92 | return x 93 | 94 | 95 | class Mish(nn.Module): 96 | """ 97 | Mish activation function is proposed in "Mish: A Self 98 | Regularized Non-Monotonic Neural Activation Function" 99 | paper, https://arxiv.org/abs/1908.08681. 
100 | """ 101 | 102 | def __init__(self): 103 | super().__init__() 104 | 105 | def forward(self, x): 106 | return x * torch.tanh(F.softplus(x)) 107 | 108 | 109 | class SnakeAlias(nn.Module): 110 | def __init__(self, 111 | channels, 112 | up_ratio: int = 2, 113 | down_ratio: int = 2, 114 | up_kernel_size: int = 12, 115 | down_kernel_size: int = 12): 116 | super().__init__() 117 | self.up_ratio = up_ratio 118 | self.down_ratio = down_ratio 119 | self.act = SnakeBeta(channels, alpha_logscale=True) 120 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 121 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 122 | 123 | # x: [B,C,T] 124 | def forward(self, x): 125 | x = self.upsample(x) 126 | x = self.act(x) 127 | x = self.downsample(x) 128 | 129 | return x -------------------------------------------------------------------------------- /vits_decoder/alias/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if 'sinc' in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where(x == 0, 21 | torch.tensor(1., device=x.device, dtype=x.dtype), 22 | torch.sin(math.pi * x) / math.pi / x) 23 | 24 | 25 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 26 | # https://adefossez.github.io/julius/julius/lowpass.html 27 | # LICENSE is in incl_licenses directory. 28 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 29 | even = (kernel_size % 2 == 0) 30 | half_size = kernel_size // 2 31 | 32 | #For kaiser window 33 | delta_f = 4 * half_width 34 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 35 | if A > 50.: 36 | beta = 0.1102 * (A - 8.7) 37 | elif A >= 21.: 38 | beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) 39 | else: 40 | beta = 0. 41 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 42 | 43 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 44 | if even: 45 | time = (torch.arange(-half_size, half_size) + 0.5) 46 | else: 47 | time = torch.arange(kernel_size) - half_size 48 | if cutoff == 0: 49 | filter_ = torch.zeros_like(time) 50 | else: 51 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 52 | # Normalize filter to have sum = 1, otherwise we will have a small leakage 53 | # of the constant component in the input signal. 54 | filter_ /= filter_.sum() 55 | filter = filter_.view(1, 1, kernel_size) 56 | 57 | return filter 58 | 59 | 60 | class LowPassFilter1d(nn.Module): 61 | def __init__(self, 62 | cutoff=0.5, 63 | half_width=0.6, 64 | stride: int = 1, 65 | padding: bool = True, 66 | padding_mode: str = 'replicate', 67 | kernel_size: int = 12): 68 | # kernel_size should be even number for stylegan3 setup, 69 | # in this implementation, odd number is also possible. 
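# Cutoff and half_width are normalized to the sampling rate (0.5 == Nyquist);
# the resamplers in resample.py use cutoff = 0.5 / ratio and half_width = 0.6 / ratio,
# so a ratio-2 stage gets cutoff 0.25. Illustrative shape check of the FIR taps:
#   >>> kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12).shape
#   torch.Size([1, 1, 12])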
70 | super().__init__() 71 | if cutoff < -0.: 72 | raise ValueError("Minimum cutoff must be larger than zero.") 73 | if cutoff > 0.5: 74 | raise ValueError("A cutoff above 0.5 does not make sense.") 75 | self.kernel_size = kernel_size 76 | self.even = (kernel_size % 2 == 0) 77 | self.pad_left = kernel_size // 2 - int(self.even) 78 | self.pad_right = kernel_size // 2 79 | self.stride = stride 80 | self.padding = padding 81 | self.padding_mode = padding_mode 82 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 83 | self.register_buffer("filter", filter) 84 | 85 | #input [B, C, T] 86 | def forward(self, x): 87 | _, C, _ = x.shape 88 | 89 | if self.padding: 90 | x = F.pad(x, (self.pad_left, self.pad_right), 91 | mode=self.padding_mode) 92 | out = F.conv1d(x, self.filter.expand(C, -1, -1), 93 | stride=self.stride, groups=C) 94 | 95 | return out -------------------------------------------------------------------------------- /vits_decoder/alias/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /vits_decoder/bigv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torch.nn import Conv1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | from .alias.act import SnakeAlias 7 | 8 | 9 | def init_weights(m, mean=0.0, std=0.01): 10 | classname = m.__class__.__name__ 11 | if classname.find("Conv") != -1: 12 | m.weight.data.normal_(mean, std) 13 | 14 | 15 | def get_padding(kernel_size, dilation=1): 16 | return int((kernel_size*dilation - dilation)/2) 17 | 18 | 19 | class AMPBlock(torch.nn.Module): 20 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 21 | super(AMPBlock, 
self).__init__() 22 | self.convs1 = nn.ModuleList([ 23 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 24 | padding=get_padding(kernel_size, dilation[0]))), 25 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 26 | padding=get_padding(kernel_size, dilation[1]))), 27 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 28 | padding=get_padding(kernel_size, dilation[2]))) 29 | ]) 30 | self.convs1.apply(init_weights) 31 | 32 | self.convs2 = nn.ModuleList([ 33 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 34 | padding=get_padding(kernel_size, 1))), 35 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 36 | padding=get_padding(kernel_size, 1))), 37 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 38 | padding=get_padding(kernel_size, 1))) 39 | ]) 40 | self.convs2.apply(init_weights) 41 | 42 | # total number of conv layers 43 | self.num_layers = len(self.convs1) + len(self.convs2) 44 | 45 | # periodic nonlinearity with snakebeta function and anti-aliasing 46 | self.activations = nn.ModuleList([ 47 | SnakeAlias(channels) for _ in range(self.num_layers) 48 | ]) 49 | 50 | def forward(self, x): 51 | acts1, acts2 = self.activations[::2], self.activations[1::2] 52 | for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): 53 | xt = a1(x) 54 | xt = c1(xt) 55 | xt = a2(xt) 56 | xt = c2(xt) 57 | x = xt + x 58 | return x 59 | 60 | def remove_weight_norm(self): 61 | for l in self.convs1: 62 | remove_weight_norm(l) 63 | for l in self.convs2: 64 | remove_weight_norm(l) -------------------------------------------------------------------------------- /vits_decoder/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from omegaconf import OmegaConf 5 | from .msd import ScaleDiscriminator 6 | from .mpd import MultiPeriodDiscriminator 7 | from .mrd import MultiResolutionDiscriminator 8 | 9 | 10 | class Discriminator(nn.Module): 11 | def __init__(self, hp): 12 | super(Discriminator, self).__init__() 13 | self.MRD = MultiResolutionDiscriminator(hp) 14 | self.MPD = MultiPeriodDiscriminator(hp) 15 | self.MSD = ScaleDiscriminator() 16 | 17 | def forward(self, x): 18 | r = self.MRD(x) 19 | p = self.MPD(x) 20 | s = self.MSD(x) 21 | return r + p + s 22 | 23 | 24 | if __name__ == '__main__': 25 | hp = OmegaConf.load('../config/base.yaml') 26 | model = Discriminator(hp) 27 | 28 | x = torch.randn(3, 1, 16384) 29 | print(x.shape) 30 | 31 | output = model(x) 32 | for features, score in output: 33 | for feat in features: 34 | print(feat.shape) 35 | print(score.shape) 36 | 37 | pytorch_total_params = sum(p.numel() 38 | for p in model.parameters() if p.requires_grad) 39 | print(pytorch_total_params) 40 | -------------------------------------------------------------------------------- /vits_decoder/med.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import typing as T 4 | 5 | 6 | class MelspecDiscriminator(torch.nn.Module): 7 | """mel spectrogram (frequency domain) discriminator""" 8 | 9 | def __init__(self) -> None: 10 | super().__init__() 11 | self.SAMPLE_RATE = 48000 12 | # mel filterbank transform 13 | self._melspec = torchaudio.transforms.MelSpectrogram( 14 | sample_rate=self.SAMPLE_RATE, 15 | n_fft=2048, 16 | win_length=int(0.025 * self.SAMPLE_RATE), 17 | hop_length=int(0.010 * self.SAMPLE_RATE), 
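            # 25 ms window / 10 ms hop -> 1200 / 480 samples at the 48 kHz rate above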
18 | n_mels=128, 19 | power=1, 20 | ) 21 | 22 | # time-frequency 2D convolutions 23 | kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)] 24 | strides = [(1, 2), (1, 2), (1, 2), (1, 2)] 25 | self._convs = torch.nn.ModuleList( 26 | [ 27 | torch.nn.Sequential( 28 | torch.nn.Conv2d( 29 | in_channels=1 if i == 0 else 32, 30 | out_channels=64, 31 | kernel_size=k, 32 | stride=s, 33 | padding=(1, 2), 34 | bias=False, 35 | ), 36 | torch.nn.BatchNorm2d(num_features=64), 37 | torch.nn.GLU(dim=1), 38 | ) 39 | for i, (k, s) in enumerate(zip(kernel_sizes, strides)) 40 | ] 41 | ) 42 | 43 | # output adversarial projection 44 | self._postnet = torch.nn.Conv2d( 45 | in_channels=32, 46 | out_channels=1, 47 | kernel_size=(15, 3), 48 | stride=(1, 2), 49 | ) 50 | 51 | def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]: 52 | # apply the log-scale mel spectrogram transform 53 | x = torch.log(self._melspec(x) + 1e-5) 54 | 55 | # compute hidden layers and feature maps 56 | f = [] 57 | for c in self._convs: 58 | x = c(x) 59 | f.append(x) 60 | 61 | # apply the output projection and global average pooling 62 | x = self._postnet(x) 63 | x = x.mean(dim=[-2, -1]) 64 | 65 | return [(f, x)] 66 | -------------------------------------------------------------------------------- /vits_decoder/mpd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm, spectral_norm 5 | 6 | class DiscriminatorP(nn.Module): 7 | def __init__(self, hp, period): 8 | super(DiscriminatorP, self).__init__() 9 | 10 | self.LRELU_SLOPE = hp.mpd.lReLU_slope 11 | self.period = period 12 | 13 | kernel_size = hp.mpd.kernel_size 14 | stride = hp.mpd.stride 15 | norm_f = weight_norm if hp.mpd.use_spectral_norm == False else spectral_norm 16 | 17 | self.convs = nn.ModuleList([ 18 | norm_f(nn.Conv2d(1, 64, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 19 | norm_f(nn.Conv2d(64, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 20 | norm_f(nn.Conv2d(128, 256, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 21 | norm_f(nn.Conv2d(256, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 22 | norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), 1, padding=(kernel_size // 2, 0))), 23 | ]) 24 | self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 25 | 26 | def forward(self, x): 27 | fmap = [] 28 | 29 | # 1d to 2d 30 | b, c, t = x.shape 31 | if t % self.period != 0: # pad first 32 | n_pad = self.period - (t % self.period) 33 | x = F.pad(x, (0, n_pad), "reflect") 34 | t = t + n_pad 35 | x = x.view(b, c, t // self.period, self.period) 36 | 37 | for l in self.convs: 38 | x = l(x) 39 | x = F.leaky_relu(x, self.LRELU_SLOPE) 40 | fmap.append(x) 41 | x = self.conv_post(x) 42 | fmap.append(x) 43 | x = torch.flatten(x, 1, -1) 44 | 45 | return fmap, x 46 | 47 | 48 | class MultiPeriodDiscriminator(nn.Module): 49 | def __init__(self, hp): 50 | super(MultiPeriodDiscriminator, self).__init__() 51 | 52 | self.discriminators = nn.ModuleList( 53 | [DiscriminatorP(hp, period) for period in hp.mpd.periods] 54 | ) 55 | 56 | def forward(self, x): 57 | ret = list() 58 | for disc in self.discriminators: 59 | ret.append(disc(x)) 60 | 61 | return ret # [(feat, score), (feat, score), (feat, score), (feat, score), (feat, score)] 62 | -------------------------------------------------------------------------------- /vits_decoder/mrd.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm, spectral_norm 5 | 6 | class DiscriminatorR(torch.nn.Module): 7 | def __init__(self, hp, resolution): 8 | super(DiscriminatorR, self).__init__() 9 | 10 | self.resolution = resolution 11 | self.LRELU_SLOPE = hp.mpd.lReLU_slope 12 | 13 | norm_f = weight_norm if hp.mrd.use_spectral_norm == False else spectral_norm 14 | 15 | self.convs = nn.ModuleList([ 16 | norm_f(nn.Conv2d(1, 32, (3, 9), padding=(1, 4))), 17 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 18 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 19 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 20 | norm_f(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))), 21 | ]) 22 | self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1))) 23 | 24 | def forward(self, x): 25 | fmap = [] 26 | 27 | x = self.spectrogram(x) 28 | x = x.unsqueeze(1) 29 | for l in self.convs: 30 | x = l(x) 31 | x = F.leaky_relu(x, self.LRELU_SLOPE) 32 | fmap.append(x) 33 | x = self.conv_post(x) 34 | fmap.append(x) 35 | x = torch.flatten(x, 1, -1) 36 | 37 | return fmap, x 38 | 39 | def spectrogram(self, x): 40 | n_fft, hop_length, win_length = self.resolution 41 | x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') 42 | x = x.squeeze(1) 43 | x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=False) #[B, F, TT, 2] 44 | mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] 45 | 46 | return mag 47 | 48 | 49 | class MultiResolutionDiscriminator(torch.nn.Module): 50 | def __init__(self, hp): 51 | super(MultiResolutionDiscriminator, self).__init__() 52 | self.resolutions = eval(hp.mrd.resolutions) 53 | self.discriminators = nn.ModuleList( 54 | [DiscriminatorR(hp, resolution) for resolution in self.resolutions] 55 | ) 56 | 57 | def forward(self, x): 58 | ret = list() 59 | for disc in self.discriminators: 60 | ret.append(disc(x)) 61 | 62 | return ret # [(feat, score), (feat, score), (feat, score)] 63 | -------------------------------------------------------------------------------- /vits_decoder/msd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm 5 | 6 | 7 | class ScaleDiscriminator(torch.nn.Module): 8 | def __init__(self): 9 | super(ScaleDiscriminator, self).__init__() 10 | self.convs = nn.ModuleList([ 11 | weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)), 12 | weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), 13 | weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), 14 | weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 15 | weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 16 | weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)), 17 | ]) 18 | self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1)) 19 | 20 | def forward(self, x): 21 | fmap = [] 22 | for l in self.convs: 23 | x = l(x) 24 | x = F.leaky_relu(x, 0.1) 25 | fmap.append(x) 26 | x = self.conv_post(x) 27 | fmap.append(x) 28 | x = torch.flatten(x, 1, -1) 29 | return [(fmap, x)] 30 | -------------------------------------------------------------------------------- /vits_extend/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vits_extend/__init__.py -------------------------------------------------------------------------------- /vits_extend/dataloader.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from vits.data_utils import DistributedBucketSampler 3 | from vits.data_utils import TextAudioSpeakerCollate 4 | from vits.data_utils import TextAudioSpeakerSet 5 | 6 | 7 | def create_dataloader_train(hps, n_gpus, rank): 8 | collate_fn = TextAudioSpeakerCollate() 9 | train_dataset = TextAudioSpeakerSet(hps.data.training_files, hps.data) 10 | train_sampler = DistributedBucketSampler( 11 | train_dataset, 12 | hps.train.batch_size, 13 | [150, 300, 450], 14 | num_replicas=n_gpus, 15 | rank=rank, 16 | shuffle=True) 17 | train_loader = DataLoader( 18 | train_dataset, 19 | num_workers=4, 20 | shuffle=False, 21 | pin_memory=True, 22 | collate_fn=collate_fn, 23 | batch_sampler=train_sampler) 24 | return train_loader 25 | 26 | 27 | def create_dataloader_eval(hps): 28 | collate_fn = TextAudioSpeakerCollate() 29 | eval_dataset = TextAudioSpeakerSet(hps.data.validation_files, hps.data) 30 | eval_loader = DataLoader( 31 | eval_dataset, 32 | num_workers=2, 33 | shuffle=False, 34 | batch_size=hps.train.batch_size, 35 | pin_memory=True, 36 | drop_last=False, 37 | collate_fn=collate_fn) 38 | return eval_loader 39 | -------------------------------------------------------------------------------- /vits_extend/plotting.py: -------------------------------------------------------------------------------- 1 | import logging 2 | mpl_logger = logging.getLogger('matplotlib') # must before import matplotlib 3 | mpl_logger.setLevel(logging.WARNING) 4 | import matplotlib 5 | matplotlib.use("Agg") 6 | 7 | import numpy as np 8 | import matplotlib.pylab as plt 9 | 10 | 11 | def save_figure_to_numpy(fig): 12 | # save it to a numpy array. 
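    # tostring_rgb() yields the rendered canvas as raw RGB bytes; the lines below
    # reshape them to (H, W, 3) and transpose to (3, H, W) for TensorBoard's add_image.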
13 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 14 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 15 | data = np.transpose(data, (2, 0, 1)) 16 | return data 17 | 18 | 19 | def plot_waveform_to_numpy(waveform): 20 | fig, ax = plt.subplots(figsize=(12, 4)) 21 | ax.plot() 22 | ax.plot(range(len(waveform)), waveform, 23 | linewidth=0.1, alpha=0.7, color='blue') 24 | 25 | plt.xlabel("Samples") 26 | plt.ylabel("Amplitude") 27 | plt.ylim(-1, 1) 28 | plt.tight_layout() 29 | 30 | fig.canvas.draw() 31 | data = save_figure_to_numpy(fig) 32 | plt.close() 33 | 34 | return data 35 | 36 | 37 | def plot_spectrogram_to_numpy(spectrogram): 38 | fig, ax = plt.subplots(figsize=(12, 4)) 39 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 40 | interpolation='none') 41 | plt.colorbar(im, ax=ax) 42 | plt.xlabel("Frames") 43 | plt.ylabel("Channels") 44 | plt.tight_layout() 45 | 46 | fig.canvas.draw() 47 | data = save_figure_to_numpy(fig) 48 | plt.close() 49 | return data 50 | -------------------------------------------------------------------------------- /vits_extend/stft.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Jungil Kong 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import math 24 | import os 25 | import random 26 | import torch 27 | import torch.utils.data 28 | import numpy as np 29 | from librosa.util import normalize 30 | from scipy.io.wavfile import read 31 | from librosa.filters import mel as librosa_mel_fn 32 | 33 | 34 | class TacotronSTFT(torch.nn.Module): 35 | def __init__(self, filter_length=512, hop_length=160, win_length=512, 36 | n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0, 37 | mel_fmax=None, center=False, device='cpu'): 38 | super(TacotronSTFT, self).__init__() 39 | self.n_mel_channels = n_mel_channels 40 | self.sampling_rate = sampling_rate 41 | self.n_fft = filter_length 42 | self.hop_size = hop_length 43 | self.win_size = win_length 44 | self.fmin = mel_fmin 45 | self.fmax = mel_fmax 46 | self.center = center 47 | 48 | mel = librosa_mel_fn( 49 | sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax) 50 | 51 | mel_basis = torch.from_numpy(mel).float().to(device) 52 | hann_window = torch.hann_window(win_length).to(device) 53 | 54 | self.register_buffer('mel_basis', mel_basis) 55 | self.register_buffer('hann_window', hann_window) 56 | 57 | def linear_spectrogram(self, y): 58 | assert (torch.min(y.data) >= -1) 59 | assert (torch.max(y.data) <= 1) 60 | 61 | y = torch.nn.functional.pad(y.unsqueeze(1), 62 | (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), 63 | mode='reflect') 64 | y = y.squeeze(1) 65 | spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, 66 | center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 67 | spec = torch.norm(spec, p=2, dim=-1) 68 | 69 | return spec 70 | 71 | def mel_spectrogram(self, y): 72 | """Computes mel-spectrograms from a batch of waves 73 | PARAMS 74 | ------ 75 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 76 | 77 | RETURNS 78 | ------- 79 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 80 | """ 81 | assert(torch.min(y.data) >= -1) 82 | assert(torch.max(y.data) <= 1) 83 | 84 | y = torch.nn.functional.pad(y.unsqueeze(1), 85 | (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), 86 | mode='reflect') 87 | y = y.squeeze(1) 88 | 89 | spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, 90 | center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 91 | 92 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 93 | 94 | spec = torch.matmul(self.mel_basis, spec) 95 | spec = self.spectral_normalize_torch(spec) 96 | 97 | return spec 98 | 99 | def spectral_normalize_torch(self, magnitudes): 100 | output = self.dynamic_range_compression_torch(magnitudes) 101 | return output 102 | 103 | def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5): 104 | return torch.log(torch.clamp(x, min=clip_val) * C) 105 | -------------------------------------------------------------------------------- /vits_extend/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 
14 | Args: 15 | x (Tensor): Input signal tensor (B, T). 16 | fft_size (int): FFT size. 17 | hop_size (int): Hop size. 18 | win_length (int): Window length. 19 | window (str): Window function type. 20 | Returns: 21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 22 | """ 23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=False) 24 | real = x_stft[..., 0] 25 | imag = x_stft[..., 1] 26 | 27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 29 | 30 | 31 | class SpectralConvergengeLoss(torch.nn.Module): 32 | """Spectral convergence loss module.""" 33 | 34 | def __init__(self): 35 | """Initilize spectral convergence loss module.""" 36 | super(SpectralConvergengeLoss, self).__init__() 37 | 38 | def forward(self, x_mag, y_mag): 39 | """Calculate forward propagation. 40 | Args: 41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 43 | Returns: 44 | Tensor: Spectral convergence loss value. 45 | """ 46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 47 | 48 | 49 | class LogSTFTMagnitudeLoss(torch.nn.Module): 50 | """Log STFT magnitude loss module.""" 51 | 52 | def __init__(self): 53 | """Initilize los STFT magnitude loss module.""" 54 | super(LogSTFTMagnitudeLoss, self).__init__() 55 | 56 | def forward(self, x_mag, y_mag): 57 | """Calculate forward propagation. 58 | Args: 59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 61 | Returns: 62 | Tensor: Log STFT magnitude loss value. 63 | """ 64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 65 | 66 | 67 | class STFTLoss(torch.nn.Module): 68 | """STFT loss module.""" 69 | 70 | def __init__(self, device, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 71 | """Initialize STFT loss module.""" 72 | super(STFTLoss, self).__init__() 73 | self.fft_size = fft_size 74 | self.shift_size = shift_size 75 | self.win_length = win_length 76 | self.window = getattr(torch, window)(win_length).to(device) 77 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 79 | 80 | def forward(self, x, y): 81 | """Calculate forward propagation. 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | Returns: 86 | Tensor: Spectral convergence loss value. 87 | Tensor: Log STFT magnitude loss value. 88 | """ 89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 93 | 94 | return sc_loss, mag_loss 95 | 96 | 97 | class MultiResolutionSTFTLoss(torch.nn.Module): 98 | """Multi resolution STFT loss module.""" 99 | 100 | def __init__(self, 101 | device, 102 | resolutions, 103 | window="hann_window"): 104 | """Initialize Multi resolution STFT loss module. 105 | Args: 106 | resolutions (list): List of (FFT size, hop size, window length). 107 | window (str): Window function type. 
108 | """ 109 | super(MultiResolutionSTFTLoss, self).__init__() 110 | self.stft_losses = torch.nn.ModuleList() 111 | for fs, ss, wl in resolutions: 112 | self.stft_losses += [STFTLoss(device, fs, ss, wl, window)] 113 | 114 | def forward(self, x, y): 115 | """Calculate forward propagation. 116 | Args: 117 | x (Tensor): Predicted signal (B, T). 118 | y (Tensor): Groundtruth signal (B, T). 119 | Returns: 120 | Tensor: Multi resolution spectral convergence loss value. 121 | Tensor: Multi resolution log STFT magnitude loss value. 122 | """ 123 | sc_loss = 0.0 124 | mag_loss = 0.0 125 | for f in self.stft_losses: 126 | sc_l, mag_l = f(x, y) 127 | sc_loss += sc_l 128 | mag_loss += mag_l 129 | 130 | sc_loss /= len(self.stft_losses) 131 | mag_loss /= len(self.stft_losses) 132 | 133 | return sc_loss, mag_loss 134 | -------------------------------------------------------------------------------- /vits_extend/validation.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def validate(hp, args, generator, discriminator, valloader, stft, writer, step, device): 7 | generator.eval() 8 | discriminator.eval() 9 | torch.backends.cudnn.benchmark = False 10 | 11 | loader = tqdm.tqdm(valloader, desc='Validation loop') 12 | mel_loss = 0.0 13 | for idx, (ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l) in enumerate(loader): 14 | ppg = ppg.to(device) 15 | vec = vec.to(device) 16 | pit = pit.to(device) 17 | spk = spk.to(device) 18 | ppg_l = ppg_l.to(device) 19 | audio = audio.to(device) 20 | 21 | if hasattr(generator, 'module'): 22 | fake_audio = generator.module.infer(ppg, vec, pit, spk, ppg_l)[ 23 | :, :, :audio.size(2)] 24 | else: 25 | fake_audio = generator.infer(ppg, vec, pit, spk, ppg_l)[ 26 | :, :, :audio.size(2)] 27 | 28 | mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1)) 29 | mel_real = stft.mel_spectrogram(audio.squeeze(1)) 30 | 31 | mel_loss += F.l1_loss(mel_fake, mel_real).item() 32 | 33 | if idx < hp.log.num_audio: 34 | spec_fake = stft.linear_spectrogram(fake_audio.squeeze(1)) 35 | spec_real = stft.linear_spectrogram(audio.squeeze(1)) 36 | 37 | audio = audio[0][0].cpu().detach().numpy() 38 | fake_audio = fake_audio[0][0].cpu().detach().numpy() 39 | spec_fake = spec_fake[0].cpu().detach().numpy() 40 | spec_real = spec_real[0].cpu().detach().numpy() 41 | writer.log_fig_audio( 42 | audio, fake_audio, spec_fake, spec_real, idx, step) 43 | 44 | mel_loss = mel_loss / len(valloader.dataset) 45 | 46 | writer.log_validation(mel_loss, generator, discriminator, step) 47 | 48 | torch.backends.cudnn.benchmark = True 49 | -------------------------------------------------------------------------------- /vits_extend/writer.py: -------------------------------------------------------------------------------- 1 | from torch.utils.tensorboard import SummaryWriter 2 | import numpy as np 3 | import librosa 4 | 5 | from .plotting import plot_waveform_to_numpy, plot_spectrogram_to_numpy 6 | 7 | class MyWriter(SummaryWriter): 8 | def __init__(self, hp, logdir): 9 | super(MyWriter, self).__init__(logdir) 10 | self.sample_rate = hp.data.sampling_rate 11 | 12 | def log_training(self, g_loss, d_loss, mel_loss, stft_loss, k_loss, r_loss, score_loss, step): 13 | self.add_scalar('train/g_loss', g_loss, step) 14 | self.add_scalar('train/d_loss', d_loss, step) 15 | 16 | self.add_scalar('train/score_loss', score_loss, step) 17 | self.add_scalar('train/stft_loss', stft_loss, step) 18 | 
self.add_scalar('train/mel_loss', mel_loss, step) 19 | self.add_scalar('train/kl_f_loss', k_loss, step) 20 | self.add_scalar('train/kl_r_loss', r_loss, step) 21 | 22 | def log_validation(self, mel_loss, generator, discriminator, step): 23 | self.add_scalar('validation/mel_loss', mel_loss, step) 24 | 25 | def log_fig_audio(self, real, fake, spec_fake, spec_real, idx, step): 26 | if idx == 0: 27 | spec_fake = librosa.amplitude_to_db(spec_fake, ref=np.max,top_db=80.) 28 | spec_real = librosa.amplitude_to_db(spec_real, ref=np.max,top_db=80.) 29 | self.add_image(f'spec_fake/{step}', plot_spectrogram_to_numpy(spec_fake), step) 30 | self.add_image(f'wave_fake/{step}', plot_waveform_to_numpy(fake), step) 31 | self.add_image(f'spec_real/{step}', plot_spectrogram_to_numpy(spec_real), step) 32 | self.add_image(f'wave_real/{step}', plot_waveform_to_numpy(real), step) 33 | 34 | self.add_audio(f'fake/{step}', fake, step, self.sample_rate) 35 | self.add_audio(f'real/{step}', real, step, self.sample_rate) 36 | 37 | def log_histogram(self, model, step): 38 | for tag, value in model.named_parameters(): 39 | self.add_histogram(tag.replace('.', '/'), value.cpu().detach().numpy(), step) 40 | -------------------------------------------------------------------------------- /vits_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | sovits5.0_bigvgan_mix_v2.pth 4 | -------------------------------------------------------------------------------- /whisper/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/whisper/__init__.py -------------------------------------------------------------------------------- /whisper/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import lru_cache 3 | from typing import Union 4 | 5 | import librosa 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | from .utils import exact_div 11 | 12 | from librosa.filters import mel as librosa_mel_fn 13 | 14 | # hard-coded audio hyperparameters 15 | SAMPLE_RATE = 16000 16 | N_FFT = 400 17 | N_MELS = 80 18 | HOP_LENGTH = 160 19 | CHUNK_LENGTH = 30 20 | N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk 21 | N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input 22 | 23 | 24 | def load_audio(file: str, sr: int = SAMPLE_RATE): 25 | x, sr = librosa.load(file, sr=sr) 26 | return x 27 | 28 | 29 | def pad_or_trim(array, length_max: int = N_SAMPLES, length_min: int = N_SAMPLES // 2, *, axis: int = -1): 30 | """ 31 | Pad or trim the audio array to N_SAMPLES, as expected by the encoder. 32 | """ 33 | if torch.is_tensor(array): 34 | if array.shape[axis] > length_max: 35 | array = array.index_select(dim=axis, index=torch.arange(length_max, device=array.device)) 36 | 37 | if array.shape[axis] < length_min: 38 | pad_widths = [(0, 0)] * array.ndim 39 | pad_widths[axis] = (0, length_min - array.shape[axis]) 40 | array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes]) 41 | else: 42 | if array.shape[axis] > length_max: 43 | array = array.take(indices=range(length_max), axis=axis) 44 | 45 | if array.shape[axis] < length_min: 46 | pad_widths = [(0, 0)] * array.ndim 47 | pad_widths[axis] = (0, length_min - array.shape[axis]) 48 | array = np.pad(array, pad_widths) 49 | 50 | return array 51 | 52 | 53 | @lru_cache(maxsize=None) 54 | def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: 55 | """ 56 | load the mel filterbank matrix for projecting STFT into a Mel spectrogram. 
57 | Allows decoupling librosa dependency; saved using: 58 | 59 | np.savez_compressed( 60 | "mel_filters.npz", 61 | mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), 62 | ) 63 | """ 64 | assert n_mels == 80, f"Unsupported n_mels: {n_mels}" 65 | return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device) 66 | 67 | 68 | def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS): 69 | """ 70 | Compute the log-Mel spectrogram of 71 | 72 | Parameters 73 | ---------- 74 | audio: Union[str, np.ndarray, torch.Tensor], shape = (*) 75 | The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz 76 | 77 | n_mels: int 78 | The number of Mel-frequency filters, only 80 is supported 79 | 80 | Returns 81 | ------- 82 | torch.Tensor, shape = (80, n_frames) 83 | A Tensor that contains the Mel spectrogram 84 | """ 85 | if not torch.is_tensor(audio): 86 | if isinstance(audio, str): 87 | audio = load_audio(audio) 88 | audio = torch.from_numpy(audio) 89 | 90 | window = torch.hann_window(N_FFT).to(audio.device) 91 | stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True) 92 | magnitudes = stft[..., :-1].abs() ** 2 93 | 94 | filters = mel_filters(audio.device, n_mels) 95 | mel_spec = filters @ magnitudes 96 | 97 | log_spec = torch.clamp(mel_spec, min=1e-10).log10() 98 | log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) 99 | log_spec = (log_spec + 4.0) / 4.0 100 | return log_spec 101 | -------------------------------------------------------------------------------- /whisper/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | 7 | from whisper.model import Whisper, ModelDimensions 8 | from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram 9 | 10 | 11 | def load_model(path, device) -> Whisper: 12 | checkpoint = torch.load(path, map_location="cpu") 13 | dims = ModelDimensions(**checkpoint["dims"]) 14 | # print(dims) 15 | model = Whisper(dims) 16 | del model.decoder 17 | cut = len(model.encoder.blocks) // 4 18 | cut = -1 * cut 19 | del model.encoder.blocks[cut:] 20 | model.load_state_dict(checkpoint["model_state_dict"], strict=False) 21 | model.eval() 22 | if not (device == "cpu"): 23 | model.half() 24 | model.to(device) 25 | # torch.save({ 26 | # 'dims': checkpoint["dims"], 27 | # 'model_state_dict': model.state_dict(), 28 | # }, "large-v2.pt") 29 | return model 30 | 31 | 32 | def pred_ppg(whisper: Whisper, wavPath, ppgPath, device): 33 | audio = load_audio(wavPath) 34 | audln = audio.shape[0] 35 | ppg_a = [] 36 | idx_s = 0 37 | while (idx_s + 15 * 16000 < audln): 38 | short = audio[idx_s:idx_s + 15 * 16000] 39 | idx_s = idx_s + 15 * 16000 40 | ppgln = 15 * 16000 // 320 41 | # short = pad_or_trim(short) 42 | mel = log_mel_spectrogram(short).to(device) 43 | if not (device == "cpu"): 44 | mel = mel.half() 45 | with torch.no_grad(): 46 | mel = mel + torch.randn_like(mel) * 0.1 47 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 48 | ppg = ppg[:ppgln,] # [length, dim=1024] 49 | ppg_a.extend(ppg) 50 | if (idx_s < audln): 51 | short = audio[idx_s:audln] 52 | ppgln = (audln - idx_s) // 320 53 | # short = pad_or_trim(short) 54 | mel = log_mel_spectrogram(short).to(device) 55 | if not (device == "cpu"): 56 | mel = mel.half() 57 | with 
torch.no_grad(): 58 | mel = mel + torch.randn_like(mel) * 0.1 59 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 60 | ppg = ppg[:ppgln,] # [length, dim=1024] 61 | ppg_a.extend(ppg) 62 | np.save(ppgPath, ppg_a, allow_pickle=False) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 68 | parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) 69 | args = parser.parse_args() 70 | print(args.wav) 71 | print(args.ppg) 72 | 73 | wavPath = args.wav 74 | ppgPath = args.ppg 75 | 76 | device = "cuda" if torch.cuda.is_available() else "cpu" 77 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device) 78 | pred_ppg(whisper, wavPath, ppgPath, device) 79 | -------------------------------------------------------------------------------- /whisper/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import zlib 5 | from typing import Callable, TextIO 6 | 7 | system_encoding = sys.getdefaultencoding() 8 | 9 | if system_encoding != "utf-8": 10 | def make_safe(string): 11 | # replaces any character not representable using the system default encoding with an '?', 12 | # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729). 13 | return string.encode(system_encoding, errors="replace").decode(system_encoding) 14 | else: 15 | def make_safe(string): 16 | # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding 17 | return string 18 | 19 | 20 | def exact_div(x, y): 21 | assert x % y == 0 22 | return x // y 23 | 24 | 25 | def str2bool(string): 26 | str2val = {"True": True, "False": False} 27 | if string in str2val: 28 | return str2val[string] 29 | else: 30 | raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") 31 | 32 | 33 | def optional_int(string): 34 | return None if string == "None" else int(string) 35 | 36 | 37 | def optional_float(string): 38 | return None if string == "None" else float(string) 39 | 40 | 41 | def compression_ratio(text) -> float: 42 | text_bytes = text.encode("utf-8") 43 | return len(text_bytes) / len(zlib.compress(text_bytes)) 44 | 45 | 46 | def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): 47 | assert seconds >= 0, "non-negative timestamp expected" 48 | milliseconds = round(seconds * 1000.0) 49 | 50 | hours = milliseconds // 3_600_000 51 | milliseconds -= hours * 3_600_000 52 | 53 | minutes = milliseconds // 60_000 54 | milliseconds -= minutes * 60_000 55 | 56 | seconds = milliseconds // 1_000 57 | milliseconds -= seconds * 1_000 58 | 59 | hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" 60 | return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" 61 | 62 | 63 | class ResultWriter: 64 | extension: str 65 | 66 | def __init__(self, output_dir: str): 67 | self.output_dir = output_dir 68 | 69 | def __call__(self, result: dict, audio_path: str): 70 | audio_basename = os.path.basename(audio_path) 71 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 72 | 73 | with open(output_path, "w", encoding="utf-8") as f: 74 | self.write_result(result, file=f) 75 | 76 | def write_result(self, result: dict, file: TextIO): 77 | raise NotImplementedError 78 | 79 | 80 | class WriteTXT(ResultWriter): 81 | extension: str = "txt" 82 | 83 | def write_result(self, result: dict, file: TextIO): 84 | for segment in result["segments"]: 85 | print(segment['text'].strip(), file=file, flush=True) 86 | 87 | 88 | class WriteVTT(ResultWriter): 89 | extension: str = "vtt" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | print("WEBVTT\n", file=file) 93 | for segment in result["segments"]: 94 | print( 95 | f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" 96 | f"{segment['text'].strip().replace('-->', '->')}\n", 97 | file=file, 98 | flush=True, 99 | ) 100 | 101 | 102 | class WriteSRT(ResultWriter): 103 | extension: str = "srt" 104 | 105 | def write_result(self, result: dict, file: TextIO): 106 | for i, segment in enumerate(result["segments"], start=1): 107 | # write srt lines 108 | print( 109 | f"{i}\n" 110 | f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " 111 | f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" 112 | f"{segment['text'].strip().replace('-->', '->')}\n", 113 | file=file, 114 | flush=True, 115 | ) 116 | 117 | 118 | class WriteTSV(ResultWriter): 119 | """ 120 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 121 | \t\t 122 | 123 | Using integer milliseconds as start and end times means there's no chance of interference from 124 | an environment setting a language encoding that causes the decimal in a floating point number 125 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 126 | """ 127 | extension: str = "tsv" 128 | 129 | def write_result(self, result: dict, file: TextIO): 130 | print("start", "end", "text", sep="\t", file=file) 131 | for segment in result["segments"]: 132 | print(round(1000 * segment['start']), file=file, end="\t") 133 | print(round(1000 * segment['end']), file=file, end="\t") 134 | print(segment['text'].strip().replace("\t", " "), file=file, flush=True) 135 | 136 | 137 | class WriteJSON(ResultWriter): 138 | extension: str = "json" 139 | 140 | def write_result(self, result: dict, file: TextIO): 141 | json.dump(result, file) 142 | 143 | 144 | def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]: 145 | writers = { 146 | "txt": WriteTXT, 147 | "vtt": WriteVTT, 148 | "srt": WriteSRT, 149 | "tsv": WriteTSV, 150 | "json": WriteJSON, 151 | } 152 | 153 | if output_format == "all": 154 | all_writers = [writer(output_dir) for writer in writers.values()] 155 | 156 | def write_all(result: dict, file: TextIO): 157 | for writer in all_writers: 158 | writer(result, file) 159 | 160 | return write_all 161 | 162 | return writers[output_format](output_dir) 163 | 164 | -------------------------------------------------------------------------------- /whisper_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | large-v2.pt 4 | --------------------------------------------------------------------------------
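A minimal usage sketch for the whisper/inference.py helpers shown above, assuming large-v2.pt has been placed under whisper_pretrain/ as this README notes; the wav and output paths are illustrative:

import os
import torch

from whisper.inference import load_model, pred_ppg

device = "cuda" if torch.cuda.is_available() else "cpu"
# checkpoint location matches the default used in whisper/inference.py's __main__ block
whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device)
# extracts a PPG for a single wav file and saves it as a .npy array
pred_ppg(whisper, "some_input.wav", "some_input.ppg.npy", device)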