├── LICENSE ├── README.md ├── README_ZH.md ├── app.py ├── colab.ipynb ├── configs ├── base.yaml ├── singers │ ├── singer0001.npy │ ├── singer0002.npy │ ├── singer0003.npy │ ├── singer0004.npy │ ├── singer0005.npy │ ├── singer0006.npy │ ├── singer0007.npy │ ├── singer0008.npy │ ├── singer0009.npy │ ├── singer0010.npy │ ├── singer0011.npy │ ├── singer0012.npy │ ├── singer0013.npy │ ├── singer0014.npy │ ├── singer0015.npy │ ├── singer0016.npy │ ├── singer0017.npy │ ├── singer0018.npy │ ├── singer0019.npy │ ├── singer0020.npy │ ├── singer0021.npy │ ├── singer0022.npy │ ├── singer0023.npy │ ├── singer0024.npy │ ├── singer0025.npy │ ├── singer0026.npy │ ├── singer0027.npy │ ├── singer0028.npy │ ├── singer0029.npy │ ├── singer0030.npy │ ├── singer0031.npy │ ├── singer0032.npy │ ├── singer0033.npy │ ├── singer0034.npy │ ├── singer0035.npy │ ├── singer0036.npy │ ├── singer0037.npy │ ├── singer0038.npy │ ├── singer0039.npy │ ├── singer0040.npy │ ├── singer0041.npy │ ├── singer0042.npy │ ├── singer0043.npy │ ├── singer0044.npy │ ├── singer0045.npy │ ├── singer0046.npy │ ├── singer0047.npy │ ├── singer0048.npy │ ├── singer0049.npy │ ├── singer0050.npy │ ├── singer0051.npy │ ├── singer0052.npy │ ├── singer0053.npy │ ├── singer0054.npy │ ├── singer0055.npy │ └── singer0056.npy └── singers_sample │ ├── 22-wave-girl │ ├── 031.wav │ ├── 032.wav │ ├── 033.wav │ ├── 034.wav │ └── 035.wav │ ├── 30-wave-boy │ ├── 010.wav │ ├── 011.wav │ ├── 012.wav │ ├── 013.wav │ ├── 014.wav │ └── 015.wav │ ├── 47-wave-girl │ ├── 020.wav │ ├── 021.wav │ ├── 022.wav │ ├── 023.wav │ ├── 024.wav │ └── 025.wav │ └── 51-wave-boy │ ├── 006.wav │ ├── 007.wav │ ├── 008.wav │ ├── 009.wav │ └── 010.wav ├── crepe ├── LICENSE.txt ├── README.md ├── __init__.py ├── __main__.py ├── assets │ └── tiny.pth ├── convert.py ├── core.py ├── decode.py ├── filter.py ├── load.py ├── loudness.py ├── model.py └── threshold.py ├── environment.yml ├── feature_retrieval ├── __init__.py ├── index.py ├── retrieval.py ├── train.py └── transform.py ├── hubert ├── LICENSE.txt ├── __init__.py ├── hubert_model.py └── inference.py ├── hubert_pretrain └── README.md ├── pitch ├── __init__.py ├── core │ ├── LICENCE │ ├── README.md │ ├── __init__.py │ ├── pyin.py │ ├── salience.py │ ├── swipe.py │ ├── swipe_slim.py │ ├── utils.py │ └── yin.py ├── debug.py └── inference.py ├── prepare ├── preprocess_a.py ├── preprocess_cdc.py ├── preprocess_crepe.py ├── preprocess_f0.py ├── preprocess_f0_mouth.py ├── preprocess_hubert.py ├── preprocess_ppg.py ├── preprocess_random.py ├── preprocess_speaker.py ├── preprocess_speaker_ave.py ├── preprocess_spec.py ├── preprocess_train.py ├── preprocess_trim.py └── preprocess_zzz.py ├── requirements.txt ├── speaker ├── README.md ├── __init__.py ├── config.py ├── infer.py ├── models │ ├── __init__.py │ ├── lstm.py │ └── resnet.py ├── umap.png └── utils │ ├── __init__.py │ ├── audio.py │ ├── coqpit.py │ ├── io.py │ └── shared_configs.py ├── speaker_pretrain ├── README.md └── config.json ├── svc_eva.py ├── svc_export.py ├── svc_inference.py ├── svc_inference_batch.py ├── svc_inference_post.py ├── svc_inference_shift.py ├── svc_merge.py ├── svc_preprocessing.py ├── svc_train_retrieval.py ├── svc_trainer.py ├── test.wav ├── vad ├── LICENSE ├── assets │ └── silero_vad.jit └── utils.py ├── vits ├── LICENSE ├── __init__.py ├── attentions.py ├── commons.py ├── data_utils.py ├── losses.py ├── models.py ├── modules.py ├── modules_grl.py ├── spectrogram.py └── utils.py ├── vits_decoder ├── LICENSE.txt ├── __init__.py ├── alias │ ├── LICENSE-alias.txt 
│ ├── LICENSE-snake.txt │ ├── __init__.py │ ├── act.py │ ├── filter.py │ └── resample.py ├── bigv.py ├── discriminator.py ├── generator.py ├── med.py ├── mpd.py ├── mrd.py ├── msd.py └── nsf.py ├── vits_extend ├── __init__.py ├── dataloader.py ├── plotting.py ├── stft.py ├── stft_loss.py ├── train.py ├── validation.py └── writer.py ├── vits_pretrain └── README.md ├── whisper ├── LICENSE ├── README.md ├── __init__.py ├── audio.py ├── decoding.py ├── inference.py ├── model.py ├── tokenizer.py └── utils.py └── whisper_pretrain └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 PlayVoice 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/base.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | model: "sovits" 3 | seed: 1234 4 | epochs: 10000 5 | learning_rate: 5e-5 6 | betas: [0.8, 0.99] 7 | lr_decay: 0.999875 8 | eps: 1e-9 9 | batch_size: 8 10 | accum_step: 2 11 | c_stft: 9 12 | c_mel: 1. 
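# NOTE (assumption from the option names): c_stft, c_mel and c_kl act as relative
# weights for the multi-resolution STFT, mel-spectrogram and KL-divergence loss terms.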
13 | c_kl: 0.2 14 | port: 8001 15 | pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" 16 | ############################# 17 | data: 18 | training_files: "files/train.txt" 19 | validation_files: "files/valid.txt" 20 | segment_size: 8000 # WARNING: base on hop_length 21 | max_wav_value: 32768.0 22 | sampling_rate: 32000 23 | filter_length: 1024 24 | hop_length: 320 25 | win_length: 1024 26 | mel_channels: 100 27 | mel_fmin: 50.0 28 | mel_fmax: 16000.0 29 | ############################# 30 | vits: 31 | ppg_dim: 1280 32 | vec_dim: 256 33 | spk_dim: 256 34 | gin_channels: 256 35 | inter_channels: 192 36 | hidden_channels: 192 37 | filter_channels: 640 38 | ############################# 39 | gen: 40 | upsample_input: 192 41 | upsample_rates: [5,4,4,2,2] 42 | upsample_kernel_sizes: [15,8,8,4,4] 43 | upsample_initial_channel: 320 44 | resblock_kernel_sizes: [3,7,11] 45 | resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] 46 | ############################# 47 | mpd: 48 | periods: [2,3,5,7,11] 49 | kernel_size: 5 50 | stride: 3 51 | use_spectral_norm: False 52 | lReLU_slope: 0.2 53 | ############################# 54 | mrd: 55 | resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length) 56 | use_spectral_norm: False 57 | lReLU_slope: 0.2 58 | ############################# 59 | log: 60 | info_interval: 100 61 | eval_interval: 1 62 | save_interval: 5 63 | num_audio: 6 64 | pth_dir: 'chkpt' 65 | log_dir: 'logs' 66 | keep_ckpts: 0 67 | ############################# 68 | dist_config: 69 | dist_backend: "nccl" 70 | dist_url: "tcp://localhost:54321" 71 | world_size: 1 72 | 73 | -------------------------------------------------------------------------------- /configs/singers/singer0001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0001.npy -------------------------------------------------------------------------------- /configs/singers/singer0002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0002.npy -------------------------------------------------------------------------------- /configs/singers/singer0003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0003.npy -------------------------------------------------------------------------------- /configs/singers/singer0004.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0004.npy -------------------------------------------------------------------------------- /configs/singers/singer0005.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0005.npy -------------------------------------------------------------------------------- /configs/singers/singer0006.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0006.npy -------------------------------------------------------------------------------- /configs/singers/singer0007.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0007.npy -------------------------------------------------------------------------------- /configs/singers/singer0008.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0008.npy -------------------------------------------------------------------------------- /configs/singers/singer0009.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0009.npy -------------------------------------------------------------------------------- /configs/singers/singer0010.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0010.npy -------------------------------------------------------------------------------- /configs/singers/singer0011.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0011.npy -------------------------------------------------------------------------------- /configs/singers/singer0012.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0012.npy -------------------------------------------------------------------------------- /configs/singers/singer0013.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0013.npy -------------------------------------------------------------------------------- /configs/singers/singer0014.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0014.npy -------------------------------------------------------------------------------- /configs/singers/singer0015.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0015.npy -------------------------------------------------------------------------------- /configs/singers/singer0016.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0016.npy -------------------------------------------------------------------------------- /configs/singers/singer0017.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0017.npy -------------------------------------------------------------------------------- /configs/singers/singer0018.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0018.npy -------------------------------------------------------------------------------- /configs/singers/singer0019.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0019.npy -------------------------------------------------------------------------------- /configs/singers/singer0020.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0020.npy -------------------------------------------------------------------------------- /configs/singers/singer0021.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0021.npy -------------------------------------------------------------------------------- /configs/singers/singer0022.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0022.npy -------------------------------------------------------------------------------- /configs/singers/singer0023.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0023.npy -------------------------------------------------------------------------------- /configs/singers/singer0024.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0024.npy -------------------------------------------------------------------------------- /configs/singers/singer0025.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0025.npy -------------------------------------------------------------------------------- /configs/singers/singer0026.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0026.npy -------------------------------------------------------------------------------- /configs/singers/singer0027.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0027.npy 
-------------------------------------------------------------------------------- /configs/singers/singer0028.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0028.npy -------------------------------------------------------------------------------- /configs/singers/singer0029.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0029.npy -------------------------------------------------------------------------------- /configs/singers/singer0030.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0030.npy -------------------------------------------------------------------------------- /configs/singers/singer0031.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0031.npy -------------------------------------------------------------------------------- /configs/singers/singer0032.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0032.npy -------------------------------------------------------------------------------- /configs/singers/singer0033.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0033.npy -------------------------------------------------------------------------------- /configs/singers/singer0034.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0034.npy -------------------------------------------------------------------------------- /configs/singers/singer0035.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0035.npy -------------------------------------------------------------------------------- /configs/singers/singer0036.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0036.npy -------------------------------------------------------------------------------- /configs/singers/singer0037.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0037.npy -------------------------------------------------------------------------------- /configs/singers/singer0038.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0038.npy -------------------------------------------------------------------------------- /configs/singers/singer0039.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0039.npy -------------------------------------------------------------------------------- /configs/singers/singer0040.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0040.npy -------------------------------------------------------------------------------- /configs/singers/singer0041.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0041.npy -------------------------------------------------------------------------------- /configs/singers/singer0042.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0042.npy -------------------------------------------------------------------------------- /configs/singers/singer0043.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0043.npy -------------------------------------------------------------------------------- /configs/singers/singer0044.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0044.npy -------------------------------------------------------------------------------- /configs/singers/singer0045.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0045.npy -------------------------------------------------------------------------------- /configs/singers/singer0046.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0046.npy -------------------------------------------------------------------------------- /configs/singers/singer0047.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0047.npy -------------------------------------------------------------------------------- /configs/singers/singer0048.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0048.npy -------------------------------------------------------------------------------- /configs/singers/singer0049.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0049.npy -------------------------------------------------------------------------------- /configs/singers/singer0050.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0050.npy -------------------------------------------------------------------------------- /configs/singers/singer0051.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0051.npy -------------------------------------------------------------------------------- /configs/singers/singer0052.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0052.npy -------------------------------------------------------------------------------- /configs/singers/singer0053.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0053.npy -------------------------------------------------------------------------------- /configs/singers/singer0054.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0054.npy -------------------------------------------------------------------------------- /configs/singers/singer0055.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0055.npy -------------------------------------------------------------------------------- /configs/singers/singer0056.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers/singer0056.npy -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/031.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/031.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/032.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/032.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/033.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/033.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/034.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/034.wav -------------------------------------------------------------------------------- /configs/singers_sample/22-wave-girl/035.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/22-wave-girl/035.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/010.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/011.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/012.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/013.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/013.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/014.wav -------------------------------------------------------------------------------- /configs/singers_sample/30-wave-boy/015.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/30-wave-boy/015.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/020.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/020.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/021.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/021.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/022.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/022.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/023.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/023.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/024.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/024.wav -------------------------------------------------------------------------------- /configs/singers_sample/47-wave-girl/025.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/47-wave-girl/025.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/006.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/007.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/008.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/009.wav -------------------------------------------------------------------------------- /configs/singers_sample/51-wave-boy/010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/configs/singers_sample/51-wave-boy/010.wav -------------------------------------------------------------------------------- /crepe/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Max 
Morrison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crepe/__init__.py: -------------------------------------------------------------------------------- 1 | from . import decode 2 | from .core import * 3 | from .model import Crepe 4 | from . import convert 5 | from . import filter 6 | from . import load 7 | from . import loudness 8 | from . import threshold 9 | -------------------------------------------------------------------------------- /crepe/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | 5 | import crepe 6 | 7 | 8 | ############################################################################### 9 | # Entry point 10 | ############################################################################### 11 | 12 | 13 | def parse_args(): 14 | """Parse command-line arguments""" 15 | parser = argparse.ArgumentParser() 16 | 17 | # Required arguments 18 | parser.add_argument( 19 | '--audio_files', 20 | nargs='+', 21 | required=True, 22 | help='The audio file to process') 23 | parser.add_argument( 24 | '--output_files', 25 | nargs='+', 26 | required=True, 27 | help='The file to save pitch or embedding') 28 | parser.add_argument( 29 | '--hop_length', 30 | type=int, 31 | help='The hop length of the analysis window') 32 | 33 | # Optionally save harmonicity [DEPRECATED] 34 | parser.add_argument( 35 | '--output_harmonicity_files', 36 | nargs='+', 37 | help='The file to save harmonicity') 38 | # Optionally save periodicity 39 | parser.add_argument( 40 | '--output_periodicity_files', 41 | nargs='+', 42 | help='The files to save periodicity') 43 | 44 | # Optionally create embedding instead of pitch contour 45 | parser.add_argument( 46 | '--embed', 47 | action='store_true', 48 | help='Performs embedding instead of pitch prediction') 49 | 50 | # Optional arguments 51 | parser.add_argument( 52 | '--fmin', 53 | default=50., 54 | type=float, 55 | help='The minimum frequency allowed') 56 | parser.add_argument( 57 | '--fmax', 58 | default=crepe.MAX_FMAX, 59 | type=float, 60 | help='The maximum frequency allowed') 61 | parser.add_argument( 62 | '--model', 63 | default='full', 64 | help='The model capacity. One of "tiny" or "full"') 65 | parser.add_argument( 66 | '--decoder', 67 | default='viterbi', 68 | help='The decoder to use. 
One of "argmax", "viterbi", or ' + 69 | '"weighted_argmax"') 70 | parser.add_argument( 71 | '--batch_size', 72 | type=int, 73 | help='The number of frames per batch') 74 | parser.add_argument( 75 | '--gpu', 76 | type=int, 77 | help='The gpu to perform inference on') 78 | parser.add_argument( 79 | '--no_pad', 80 | action='store_true', 81 | help='Whether to pad the audio') 82 | 83 | return parser.parse_args() 84 | 85 | 86 | def make_parent_directory(file): 87 | """Create parent directory for file if it does not already exist""" 88 | parent = os.path.dirname(os.path.abspath(file)) 89 | os.makedirs(parent, exist_ok=True) 90 | 91 | 92 | def main(): 93 | # Parse command-line arguments 94 | args = parse_args() 95 | 96 | # Deprecate output_harmonicity_files 97 | if args.output_harmonicity_files is not None: 98 | message = ( 99 | 'The crepe output_harmonicity_files argument is deprecated and ' 100 | 'will be removed in a future release. Please use ' 101 | 'output_periodicity_files. Rationale: if network confidence measured ' 102 | 'harmonic content, the value would be low for non-harmonic, periodic ' 103 | 'sounds (e.g., sine waves). But this is not observed.') 104 | warnings.warn(message, DeprecationWarning) 105 | args.output_periodicity_files = args.output_harmonicity_files 106 | 107 | # Ensure output directory exist 108 | [make_parent_directory(file) for file in args.output_files] 109 | if args.output_periodicity_files is not None: 110 | [make_parent_directory(file) for file in args.output_periodicity_files] 111 | 112 | # Get inference device 113 | device = 'cpu' if args.gpu is None else f'cuda:{args.gpu}' 114 | 115 | # Get decoder 116 | if args.decoder == 'argmax': 117 | decoder = crepe.decode.argmax 118 | elif args.decoder == 'weighted_argmax': 119 | decoder = crepe.decode.weighted_argmax 120 | elif args.decoder == 'viterbi': 121 | decoder = crepe.decode.viterbi 122 | 123 | # Infer pitch or embedding and save to disk 124 | if args.embed: 125 | crepe.embed_from_files_to_files(args.audio_files, 126 | args.output_files, 127 | args.hop_length, 128 | args.model, 129 | args.batch_size, 130 | device, 131 | not args.no_pad) 132 | else: 133 | crepe.predict_from_files_to_files(args.audio_files, 134 | args.output_files, 135 | None, 136 | args.output_periodicity_files, 137 | args.hop_length, 138 | args.fmin, 139 | args.fmax, 140 | args.model, 141 | decoder, 142 | args.batch_size, 143 | device, 144 | not args.no_pad) 145 | 146 | 147 | # Run module entry point 148 | main() 149 | -------------------------------------------------------------------------------- /crepe/assets/tiny.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/crepe/assets/tiny.pth -------------------------------------------------------------------------------- /crepe/convert.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import torch 3 | 4 | import crepe 5 | 6 | 7 | ############################################################################### 8 | # Pitch unit conversions 9 | ############################################################################### 10 | 11 | 12 | def bins_to_cents(bins): 13 | """Converts pitch bins to cents""" 14 | cents = crepe.CENTS_PER_BIN * bins + 1997.3794084376191 15 | 16 | # Trade quantization error for noise 17 | return dither(cents) 18 | 19 | 20 | def bins_to_frequency(bins): 21 | """Converts pitch bins to 
frequency in Hz""" 22 | return cents_to_frequency(bins_to_cents(bins)) 23 | 24 | 25 | def cents_to_bins(cents, quantize_fn=torch.floor): 26 | """Converts cents to pitch bins""" 27 | bins = (cents - 1997.3794084376191) / crepe.CENTS_PER_BIN 28 | return quantize_fn(bins).int() 29 | 30 | 31 | def cents_to_frequency(cents): 32 | """Converts cents to frequency in Hz""" 33 | return 10 * 2 ** (cents / 1200) 34 | 35 | 36 | def frequency_to_bins(frequency, quantize_fn=torch.floor): 37 | """Convert frequency in Hz to pitch bins""" 38 | return cents_to_bins(frequency_to_cents(frequency), quantize_fn) 39 | 40 | 41 | def frequency_to_cents(frequency): 42 | """Convert frequency in Hz to cents""" 43 | return 1200 * torch.log2(frequency / 10.) 44 | 45 | 46 | ############################################################################### 47 | # Utilities 48 | ############################################################################### 49 | 50 | 51 | def dither(cents): 52 | """Dither the predicted pitch in cents to remove quantization error""" 53 | noise = scipy.stats.triang.rvs(c=0.5, 54 | loc=-crepe.CENTS_PER_BIN, 55 | scale=2 * crepe.CENTS_PER_BIN, 56 | size=cents.size()) 57 | return cents + cents.new_tensor(noise) 58 | -------------------------------------------------------------------------------- /crepe/decode.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | 5 | import crepe 6 | 7 | 8 | ############################################################################### 9 | # Probability sequence decoding methods 10 | ############################################################################### 11 | 12 | 13 | def argmax(logits): 14 | """Sample observations by taking the argmax""" 15 | bins = logits.argmax(dim=1) 16 | 17 | # Convert to frequency in Hz 18 | return bins, crepe.convert.bins_to_frequency(bins) 19 | 20 | 21 | def weighted_argmax(logits): 22 | """Sample observations using weighted sum near the argmax""" 23 | # Find center of analysis window 24 | bins = logits.argmax(dim=1) 25 | 26 | # Find bounds of analysis window 27 | start = torch.max(torch.tensor(0, device=logits.device), bins - 4) 28 | end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5) 29 | 30 | # Mask out everything outside of window 31 | for batch in range(logits.size(0)): 32 | for time in range(logits.size(2)): 33 | logits[batch, :start[batch, time], time] = -float('inf') 34 | logits[batch, end[batch, time]:, time] = -float('inf') 35 | 36 | # Construct weights 37 | if not hasattr(weighted_argmax, 'weights'): 38 | weights = crepe.convert.bins_to_cents(torch.arange(360)) 39 | weighted_argmax.weights = weights[None, :, None] 40 | 41 | # Ensure devices are the same (no-op if they are) 42 | weighted_argmax.weights = weighted_argmax.weights.to(logits.device) 43 | 44 | # Convert to probabilities 45 | with torch.no_grad(): 46 | probs = torch.sigmoid(logits) 47 | 48 | # Apply weights 49 | cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1) 50 | 51 | # Convert to frequency in Hz 52 | return bins, crepe.convert.cents_to_frequency(cents) 53 | 54 | 55 | def viterbi(logits): 56 | """Sample observations using viterbi decoding""" 57 | # Create viterbi transition matrix 58 | if not hasattr(viterbi, 'transition'): 59 | xx, yy = np.meshgrid(range(360), range(360)) 60 | transition = np.maximum(12 - abs(xx - yy), 0) 61 | transition = transition / transition.sum(axis=1, keepdims=True) 62 | viterbi.transition = 
transition 63 | 64 | # Normalize logits 65 | with torch.no_grad(): 66 | probs = torch.nn.functional.softmax(logits, dim=1) 67 | 68 | # Convert to numpy 69 | sequences = probs.cpu().numpy() 70 | 71 | # Perform viterbi decoding 72 | bins = np.array([ 73 | librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64) 74 | for sequence in sequences]) 75 | 76 | # Convert to pytorch 77 | bins = torch.tensor(bins, device=probs.device) 78 | 79 | # Convert to frequency in Hz 80 | return bins, crepe.convert.bins_to_frequency(bins) 81 | -------------------------------------------------------------------------------- /crepe/load.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | import crepe 6 | from scipy.io import wavfile 7 | 8 | 9 | def audio(filename): 10 | """Load audio from disk""" 11 | sample_rate, audio = wavfile.read(filename) 12 | 13 | # Convert to float32 14 | if audio.dtype == np.int16: 15 | audio = audio.astype(np.float32) / np.iinfo(np.int16).max 16 | 17 | # PyTorch is not compatible with non-writeable arrays, so we make a copy 18 | return torch.tensor(np.copy(audio))[None], sample_rate 19 | 20 | 21 | def model(device, capacity='full'): 22 | """Preloads model from disk""" 23 | # Bind model and capacity 24 | crepe.infer.capacity = capacity 25 | crepe.infer.model = crepe.Crepe(capacity) 26 | 27 | # Load weights 28 | file = os.path.join(os.path.dirname(__file__), 'assets', f'{capacity}.pth') 29 | crepe.infer.model.load_state_dict( 30 | torch.load(file, map_location=device)) 31 | 32 | # Place on device 33 | crepe.infer.model = crepe.infer.model.to(torch.device(device)) 34 | 35 | # Eval mode 36 | crepe.infer.model.eval() 37 | -------------------------------------------------------------------------------- /crepe/loudness.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import librosa 4 | import numpy as np 5 | import resampy 6 | import torch 7 | 8 | import crepe 9 | 10 | 11 | ############################################################################### 12 | # Constants 13 | ############################################################################### 14 | 15 | 16 | # Minimum decibel level 17 | MIN_DB = -100. 18 | 19 | # Reference decibel level 20 | REF_DB = 20. 
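# MIN_DB is the floor applied to the A-weighted magnitudes in a_weighted() below;
# REF_DB is subtracted from the A-weighting curve computed in perceptual_weights().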
21 | 22 | 23 | ############################################################################### 24 | # A-weighted loudness 25 | ############################################################################### 26 | 27 | 28 | def a_weighted(audio, sample_rate, hop_length=None, pad=True): 29 | """Retrieve the per-frame loudness""" 30 | # Save device 31 | device = audio.device 32 | 33 | # Default hop length of 10 ms 34 | hop_length = sample_rate // 100 if hop_length is None else hop_length 35 | 36 | # Convert to numpy 37 | audio = audio.detach().cpu().numpy().squeeze(0) 38 | 39 | # Resample 40 | if sample_rate != crepe.SAMPLE_RATE: 41 | audio = resampy.resample(audio, sample_rate, crepe.SAMPLE_RATE) 42 | hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate) 43 | 44 | # Cache weights 45 | if not hasattr(a_weighted, 'weights'): 46 | a_weighted.weights = perceptual_weights() 47 | 48 | # Take stft 49 | stft = librosa.stft(audio, 50 | n_fft=crepe.WINDOW_SIZE, 51 | hop_length=hop_length, 52 | win_length=crepe.WINDOW_SIZE, 53 | center=pad, 54 | pad_mode='constant') 55 | 56 | # Compute magnitude on db scale 57 | db = librosa.amplitude_to_db(np.abs(stft)) 58 | 59 | # Apply A-weighting 60 | weighted = db + a_weighted.weights 61 | 62 | # Threshold 63 | weighted[weighted < MIN_DB] = MIN_DB 64 | 65 | # Average over weighted frequencies 66 | return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] 67 | 68 | 69 | def perceptual_weights(): 70 | """A-weighted frequency-dependent perceptual loudness weights""" 71 | frequencies = librosa.fft_frequencies(sr=crepe.SAMPLE_RATE, 72 | n_fft=crepe.WINDOW_SIZE) 73 | 74 | # A warning is raised for nearly inaudible frequencies, but it ends up 75 | # defaulting to -100 db. That default is fine for our purposes. 
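# The returned array has shape (n_fft // 2 + 1, 1), so it broadcasts across frames
# when added to the dB-scaled magnitude spectrogram in a_weighted().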
76 | with warnings.catch_warnings(): 77 | warnings.simplefilter('ignore', RuntimeWarning) 78 | return librosa.A_weighting(frequencies)[:, None] - REF_DB 79 | -------------------------------------------------------------------------------- /crepe/model.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | import crepe 7 | 8 | 9 | ########################################################################### 10 | # Model definition 11 | ########################################################################### 12 | 13 | 14 | class Crepe(torch.nn.Module): 15 | """Crepe model definition""" 16 | 17 | def __init__(self, model='full'): 18 | super().__init__() 19 | 20 | # Model-specific layer parameters 21 | if model == 'full': 22 | in_channels = [1, 1024, 128, 128, 128, 256] 23 | out_channels = [1024, 128, 128, 128, 256, 512] 24 | self.in_features = 2048 25 | elif model == 'tiny': 26 | in_channels = [1, 128, 16, 16, 16, 32] 27 | out_channels = [128, 16, 16, 16, 32, 64] 28 | self.in_features = 256 29 | else: 30 | raise ValueError(f'Model {model} is not supported') 31 | 32 | # Shared layer parameters 33 | kernel_sizes = [(512, 1)] + 5 * [(64, 1)] 34 | strides = [(4, 1)] + 5 * [(1, 1)] 35 | 36 | # Overload with eps and momentum conversion given by MMdnn 37 | batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, 38 | eps=0.0010000000474974513, 39 | momentum=0.0) 40 | 41 | # Layer definitions 42 | self.conv1 = torch.nn.Conv2d( 43 | in_channels=in_channels[0], 44 | out_channels=out_channels[0], 45 | kernel_size=kernel_sizes[0], 46 | stride=strides[0]) 47 | self.conv1_BN = batch_norm_fn( 48 | num_features=out_channels[0]) 49 | 50 | self.conv2 = torch.nn.Conv2d( 51 | in_channels=in_channels[1], 52 | out_channels=out_channels[1], 53 | kernel_size=kernel_sizes[1], 54 | stride=strides[1]) 55 | self.conv2_BN = batch_norm_fn( 56 | num_features=out_channels[1]) 57 | 58 | self.conv3 = torch.nn.Conv2d( 59 | in_channels=in_channels[2], 60 | out_channels=out_channels[2], 61 | kernel_size=kernel_sizes[2], 62 | stride=strides[2]) 63 | self.conv3_BN = batch_norm_fn( 64 | num_features=out_channels[2]) 65 | 66 | self.conv4 = torch.nn.Conv2d( 67 | in_channels=in_channels[3], 68 | out_channels=out_channels[3], 69 | kernel_size=kernel_sizes[3], 70 | stride=strides[3]) 71 | self.conv4_BN = batch_norm_fn( 72 | num_features=out_channels[3]) 73 | 74 | self.conv5 = torch.nn.Conv2d( 75 | in_channels=in_channels[4], 76 | out_channels=out_channels[4], 77 | kernel_size=kernel_sizes[4], 78 | stride=strides[4]) 79 | self.conv5_BN = batch_norm_fn( 80 | num_features=out_channels[4]) 81 | 82 | self.conv6 = torch.nn.Conv2d( 83 | in_channels=in_channels[5], 84 | out_channels=out_channels[5], 85 | kernel_size=kernel_sizes[5], 86 | stride=strides[5]) 87 | self.conv6_BN = batch_norm_fn( 88 | num_features=out_channels[5]) 89 | 90 | self.classifier = torch.nn.Linear( 91 | in_features=self.in_features, 92 | out_features=crepe.PITCH_BINS) 93 | 94 | def forward(self, x, embed=False): 95 | # Forward pass through first five layers 96 | x = self.embed(x) 97 | 98 | if embed: 99 | return x 100 | 101 | # Forward pass through layer six 102 | x = self.layer(x, self.conv6, self.conv6_BN) 103 | 104 | # shape=(batch, self.in_features) 105 | x = x.permute(0, 2, 1, 3).reshape(-1, self.in_features) 106 | 107 | # Compute logits 108 | return torch.sigmoid(self.classifier(x)) 109 | 110 | 
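# A minimal usage sketch (assumes one 1024-sample frame of 16 kHz audio, the window
# size this network expects; pretrained weights are normally loaded via crepe.load.model()):
#
#   model = Crepe('tiny')
#   frame = torch.randn(1, 1024)        # a single analysis frame
#   probs = model(frame)                # (1, crepe.PITCH_BINS) sigmoid activations
#   hz = crepe.convert.bins_to_frequency(probs.argmax(dim=1))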
########################################################################### 111 | # Forward pass utilities 112 | ########################################################################### 113 | 114 | def embed(self, x): 115 | """Map input audio to pitch embedding""" 116 | # shape=(batch, 1, 1024, 1) 117 | x = x[:, None, :, None] 118 | 119 | # Forward pass through first five layers 120 | x = self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)) 121 | x = self.layer(x, self.conv2, self.conv2_BN) 122 | x = self.layer(x, self.conv3, self.conv3_BN) 123 | x = self.layer(x, self.conv4, self.conv4_BN) 124 | x = self.layer(x, self.conv5, self.conv5_BN) 125 | 126 | return x 127 | 128 | def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): 129 | """Forward pass through one layer""" 130 | x = F.pad(x, padding) 131 | x = conv(x) 132 | x = F.relu(x) 133 | x = batch_norm(x) 134 | return F.max_pool2d(x, (2, 1), (2, 1)) 135 | -------------------------------------------------------------------------------- /crepe/threshold.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import crepe 5 | 6 | 7 | ############################################################################### 8 | # Pitch thresholding methods 9 | ############################################################################### 10 | 11 | 12 | class At: 13 | """Simple thresholding at a specified probability value""" 14 | 15 | def __init__(self, value): 16 | self.value = value 17 | 18 | def __call__(self, pitch, periodicity): 19 | # Make a copy to prevent in-place modification 20 | pitch = torch.clone(pitch) 21 | 22 | # Threshold 23 | pitch[periodicity < self.value] = crepe.UNVOICED 24 | return pitch 25 | 26 | 27 | class Hysteresis: 28 | """Hysteresis thresholding""" 29 | 30 | def __init__(self, 31 | lower_bound=.19, 32 | upper_bound=.31, 33 | width=.2, 34 | stds=1.7, 35 | return_threshold=False): 36 | self.lower_bound = lower_bound 37 | self.upper_bound = upper_bound 38 | self.width = width 39 | self.stds = stds 40 | self.return_threshold = return_threshold 41 | 42 | def __call__(self, pitch, periodicity): 43 | # Save output device 44 | device = pitch.device 45 | 46 | # Perform hysteresis in log-2 space 47 | pitch = torch.log2(pitch).detach().flatten().cpu().numpy() 48 | 49 | # Flatten periodicity 50 | periodicity = periodicity.flatten().cpu().numpy() 51 | 52 | # Ignore confidently unvoiced pitch 53 | pitch[periodicity < self.lower_bound] = crepe.UNVOICED 54 | 55 | # Whiten pitch 56 | mean, std = np.nanmean(pitch), np.nanstd(pitch) 57 | pitch = (pitch - mean) / std 58 | 59 | # Require high confidence to make predictions far from the mean 60 | parabola = self.width * pitch ** 2 - self.width * self.stds ** 2 61 | threshold = \ 62 | self.lower_bound + np.clip(parabola, 0, 1 - self.lower_bound) 63 | threshold[np.isnan(threshold)] = self.lower_bound 64 | 65 | # Apply hysteresis to prevent short, unconfident voiced regions 66 | i = 0 67 | while i < len(periodicity) - 1: 68 | 69 | # Detect unvoiced to voiced transition 70 | if periodicity[i] < threshold[i] and \ 71 | periodicity[i + 1] > threshold[i + 1]: 72 | 73 | # Grow region until next unvoiced or end of array 74 | start, end, keep = i + 1, i + 1, False 75 | while end < len(periodicity) and \ 76 | periodicity[end] > threshold[end]: 77 | if periodicity[end] > self.upper_bound: 78 | keep = True 79 | end += 1 80 | 81 | # Force unvoiced if we didn't pass the confidence required by 82 | # the hysteresis 83 | 
if not keep: 84 | threshold[start:end] = 1 85 | 86 | i = end 87 | 88 | else: 89 | i += 1 90 | 91 | # Remove pitch with low periodicity 92 | pitch[periodicity < threshold] = crepe.UNVOICED 93 | 94 | # Unwhiten 95 | pitch = pitch * std + mean 96 | 97 | # Convert to Hz 98 | pitch = torch.tensor(2 ** pitch, device=device)[None, :] 99 | 100 | # Optionally return threshold 101 | if self.return_threshold: 102 | return pitch, torch.tensor(threshold, device=device) 103 | 104 | return pitch 105 | 106 | 107 | ############################################################################### 108 | # Periodicity thresholding methods 109 | ############################################################################### 110 | 111 | 112 | class Silence: 113 | """Set periodicity to zero in silent regions""" 114 | 115 | def __init__(self, value=-60): 116 | self.value = value 117 | 118 | def __call__(self, 119 | periodicity, 120 | audio, 121 | sample_rate=crepe.SAMPLE_RATE, 122 | hop_length=None, 123 | pad=True): 124 | # Don't modify in-place 125 | periodicity = torch.clone(periodicity) 126 | 127 | # Compute loudness 128 | loudness = crepe.loudness.a_weighted( 129 | audio, sample_rate, hop_length, pad) 130 | 131 | # Threshold silence 132 | periodicity[loudness < self.value] = 0. 133 | 134 | return periodicity 135 | -------------------------------------------------------------------------------- /feature_retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import * 2 | from .train import * 3 | from .transform import * 4 | from .retrieval import * 5 | -------------------------------------------------------------------------------- /feature_retrieval/retrieval.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | 4 | import torch 5 | 6 | from feature_retrieval import FaissRetrievableFeatureIndex 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class IRetrieval(abc.ABC): 12 | @abc.abstractmethod 13 | def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: 14 | raise NotImplementedError 15 | 16 | @abc.abstractmethod 17 | def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: 18 | raise NotImplementedError 19 | 20 | 21 | class DummyRetrieval(IRetrieval): 22 | def retriv_whisper(self, vec: torch.FloatTensor) -> torch.FloatTensor: 23 | logger.debug("start dummy retriv whisper") 24 | return vec.clone().to(torch.device("cpu")) 25 | 26 | def retriv_hubert(self, vec: torch.FloatTensor) -> torch.FloatTensor: 27 | logger.debug("start dummy retriv hubert") 28 | return vec.clone().to(torch.device("cpu")) 29 | 30 | 31 | class FaissIndexRetrieval(IRetrieval): 32 | def __init__(self, hubert_index: FaissRetrievableFeatureIndex, whisper_index: FaissRetrievableFeatureIndex) -> None: 33 | self._hubert_index = hubert_index 34 | self._whisper_index = whisper_index 35 | 36 | def retriv_whisper(self, vec: torch.Tensor) -> torch.Tensor: 37 | logger.debug("start retriv whisper") 38 | np_vec = self._whisper_index.retriv(vec.numpy()) 39 | return torch.from_numpy(np_vec) 40 | 41 | def retriv_hubert(self, vec: torch.Tensor) -> torch.Tensor: 42 | logger.debug("start retriv hubert") 43 | np_vec = self._hubert_index.retriv(vec.numpy()) 44 | return torch.from_numpy(np_vec) 45 | -------------------------------------------------------------------------------- /feature_retrieval/train.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 
| from typing import cast 3 | 4 | import numpy as np 5 | 6 | from feature_retrieval import NumpyArray 7 | from feature_retrieval.index import FaissIVFFlatTrainableFeatureIndexBuilder, logger 8 | from feature_retrieval.transform import IFeatureMatrixTransform 9 | 10 | 11 | def train_index( 12 | features_path: Path, 13 | index_save_filepath: Path, 14 | index_builder: FaissIVFFlatTrainableFeatureIndexBuilder, 15 | feature_transform: IFeatureMatrixTransform, 16 | ) -> None: 17 | logger.info("start getting feature vectors from %s", features_path.absolute()) 18 | feature_matrix = get_feature_matrix(features_path) 19 | logger.debug("fetched %s features", feature_matrix.shape[0]) 20 | 21 | logger.info("apply transform to feature matrix") 22 | feature_matrix = feature_transform.transform(feature_matrix) 23 | num_vectors, vector_dim = feature_matrix.shape 24 | logger.debug("features transformed. Current features %s", num_vectors) 25 | 26 | feature_index = index_builder.build(num_vectors=num_vectors, vector_dim=vector_dim) 27 | logger.info("adding features to index with training") 28 | 29 | feature_index.add_with_train(feature_matrix) 30 | feature_index.save(index_save_filepath) 31 | logger.info("index saved to %s", index_save_filepath.absolute()) 32 | 33 | 34 | def get_feature_matrix(features_dir_path: Path) -> NumpyArray: 35 | matrices = [np.load(str(features_path)) for features_path in features_dir_path.rglob("*.npy")] 36 | feature_matrix = np.concatenate(matrices, axis=0) 37 | return cast(NumpyArray, feature_matrix) 38 | -------------------------------------------------------------------------------- /feature_retrieval/transform.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | from typing import cast, Callable 4 | 5 | from sklearn.cluster import MiniBatchKMeans 6 | 7 | from feature_retrieval.index import NumpyArray 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class IFeatureMatrixTransform: 14 | """Interface for transform encoded voice feature from (n_features,vector_dim) to (m_features,vector_dim)""" 15 | 16 | @abc.abstractmethod 17 | def transform(self, matrix: NumpyArray) -> NumpyArray: 18 | """transform given feature matrix from (n_features,vector_dim) to (m_features,vector_dim)""" 19 | raise NotImplementedError 20 | 21 | 22 | class DummyFeatureTransform(IFeatureMatrixTransform): 23 | """do nothing""" 24 | 25 | def transform(self, matrix: NumpyArray) -> NumpyArray: 26 | return matrix 27 | 28 | 29 | class MinibatchKmeansFeatureTransform(IFeatureMatrixTransform): 30 | """replaces number of examples with k-means centroids using minibatch algorythm""" 31 | 32 | def __init__(self, n_clusters: int, n_parallel: int) -> None: 33 | self._n_clusters = n_clusters 34 | self._n_parallel = n_parallel 35 | 36 | @property 37 | def _batch_size(self) -> int: 38 | return self._n_parallel * 256 39 | 40 | def transform(self, matrix: NumpyArray) -> NumpyArray: 41 | """transform given feature matrix from (n_features,vector_dim) to (n_clusters,vector_dim)""" 42 | cluster = MiniBatchKMeans( 43 | n_clusters=self._n_clusters, 44 | verbose=True, 45 | batch_size=self._batch_size, 46 | compute_labels=False, 47 | init="k-means++", 48 | ) 49 | return cast(NumpyArray, cluster.fit(matrix).cluster_centers_) 50 | 51 | 52 | class OnConditionFeatureTransform(IFeatureMatrixTransform): 53 | """call given transform if condition is True else call otherwise transform""" 54 | 55 | def __init__( 56 | self, 57 | condition: Callable[[NumpyArray], 
bool], 58 | on_condition: IFeatureMatrixTransform, 59 | otherwise: IFeatureMatrixTransform, 60 | ) -> None: 61 | self._condition = condition 62 | self._on_condition = on_condition 63 | self._otherwise = otherwise 64 | 65 | def transform(self, matrix: NumpyArray) -> NumpyArray: 66 | if self._condition(matrix): 67 | transform_name = self._on_condition.__class__.__name__ 68 | logger.info(f"pass condition. Transform by rule {transform_name}") 69 | return self._on_condition.transform(matrix) 70 | transform_name = self._otherwise.__class__.__name__ 71 | logger.info(f"condition is not passed. Transform by rule {transform_name}") 72 | return self._otherwise.transform(matrix) 73 | -------------------------------------------------------------------------------- /hubert/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Benjamin van Niekerk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
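As a usage note for the feature-transform classes defined above in `feature_retrieval/transform.py`: `OnConditionFeatureTransform` lets the index-training step cluster only when there are enough vectors, and otherwise fall through to the no-op transform. A minimal sketch; the 20,000-vector threshold and the toy matrix are illustrative assumptions, not values taken from this repository:

```python
import numpy as np

from feature_retrieval.transform import (
    DummyFeatureTransform,
    MinibatchKmeansFeatureTransform,
    OnConditionFeatureTransform,
)

# Cluster down to 10,000 centroids only when the speaker has enough vectors;
# otherwise keep the feature matrix untouched.
transform = OnConditionFeatureTransform(
    condition=lambda matrix: matrix.shape[0] > 20_000,   # assumed threshold
    on_condition=MinibatchKmeansFeatureTransform(n_clusters=10_000, n_parallel=4),
    otherwise=DummyFeatureTransform(),
)

features = np.random.rand(5_000, 256).astype(np.float32)  # toy (n_features, dim) matrix
reduced = transform.transform(features)                    # too small, dummy branch runs
print(reduced.shape)                                       # (5000, 256)
```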
22 | -------------------------------------------------------------------------------- /hubert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/hubert/__init__.py -------------------------------------------------------------------------------- /hubert/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import librosa 7 | 8 | from hubert import hubert_model 9 | 10 | 11 | def load_audio(file: str, sr: int = 16000): 12 | x, sr = librosa.load(file, sr=sr) 13 | return x 14 | 15 | 16 | def load_model(path, device): 17 | model = hubert_model.hubert_soft(path) 18 | model.eval() 19 | if not (device == "cpu"): 20 | model.half() 21 | model.to(device) 22 | return model 23 | 24 | 25 | def pred_vec(model, wavPath, vecPath, device): 26 | audio = load_audio(wavPath) 27 | audln = audio.shape[0] 28 | vec_a = [] 29 | idx_s = 0 30 | while (idx_s + 20 * 16000 < audln): 31 | feats = audio[idx_s:idx_s + 20 * 16000] 32 | feats = torch.from_numpy(feats).to(device) 33 | feats = feats[None, None, :] 34 | if not (device == "cpu"): 35 | feats = feats.half() 36 | with torch.no_grad(): 37 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 38 | vec_a.extend(vec) 39 | idx_s = idx_s + 20 * 16000 40 | if (idx_s < audln): 41 | feats = audio[idx_s:audln] 42 | feats = torch.from_numpy(feats).to(device) 43 | feats = feats[None, None, :] 44 | if not (device == "cpu"): 45 | feats = feats.half() 46 | with torch.no_grad(): 47 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 48 | # print(vec.shape) # [length, dim=256] hop=320 49 | vec_a.extend(vec) 50 | np.save(vecPath, vec_a, allow_pickle=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 56 | parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) 57 | args = parser.parse_args() 58 | print(args.wav) 59 | print(args.vec) 60 | 61 | wavPath = args.wav 62 | vecPath = args.vec 63 | 64 | device = "cuda" if torch.cuda.is_available() else "cpu" 65 | hubert = load_model(os.path.join( 66 | "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device) 67 | pred_vec(hubert, wavPath, vecPath, device) 68 | -------------------------------------------------------------------------------- /hubert_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | hubert-soft-0d54a1f4.pt -------------------------------------------------------------------------------- /pitch/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import load_csv_pitch -------------------------------------------------------------------------------- /pitch/core/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Sebastian Rosenzweig, Simon Schwär, Meinard Müller, International Audio Laboratories Erlangen, Germany. 4 | We thank the German Research Foundation (DFG) for various research grants that 5 | allow us for conducting fundamental research in music processing. 
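Stepping back to `hubert/inference.py` above: `pred_vec()` walks the audio in 20-second windows so long files fit in GPU memory, then encodes whatever remainder is left, so nothing is dropped. A small sketch of the window arithmetic that loop implements; the 45.5-second duration is an arbitrary illustration:

```python
SR = 16000
CHUNK = 20 * SR   # 20-second window used by pred_vec()


def chunk_bounds(num_samples: int):
    """Yield (start, end) sample indices exactly as the while/if logic above does."""
    idx = 0
    while idx + CHUNK < num_samples:
        yield idx, idx + CHUNK
        idx += CHUNK
    if idx < num_samples:
        yield idx, num_samples


print(list(chunk_bounds(int(45.5 * SR))))
# [(0, 320000), (320000, 640000), (640000, 728000)]
```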
6 | The International Audio Laboratories Erlangen are a joint institution of the 7 | Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer 8 | Institute for Integrated Circuits IIS. 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy of 11 | this software and associated documentation files (the "Software"), to deal in 12 | the Software without restriction, including without limitation the rights to 13 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 14 | the Software, and to permit persons to whom the Software is furnished to do so, 15 | subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /pitch/core/README.md: -------------------------------------------------------------------------------- 1 | This repository contains a Python package called libf0 which provides open-source implementations for four popular model-based F0-estimation approaches, YIN (Cheveigné & Kawahara, 2002), pYIN (Mauch & Dixon, 2014), an approach inspired by Melodia (Salamon & Gómez, 2012), and SWIPE (Camacho & Harris, 2008). 2 | 3 | If you use the libf0 in your research, please consider the following references. 4 | 5 | ## References 6 | 7 | Sebastian Rosenzweig, Simon Schwär, and Meinard Müller. 8 | [A Python Library for Fundamental Frequency Estimation.](https://archives.ismir.net/ismir2022/latebreaking/000003.pdf) 9 | In Late Breaking Demos of the International Society for Music Information Retrieval Conference (ISMIR), Bengaluru, India, 2022. 10 | 11 | Alain de Cheveigné and Hideki Kawahara. 12 | YIN, a fundamental frequency estimator for speech and music. Journal of the Acoustical Society of America (JASA), 111(4):1917–1930, 2002. 13 | 14 | Matthias Mauch and Simon Dixon. 15 | pYIN: A fundamental frequency estimator using probabilistic threshold distributions. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 659–663, Florence, Italy, 2014. 16 | 17 | Justin Salamon and Emilia Gómez. 18 | Melody extraction from polyphonic music signals using pitch contour characteristics. IEEE Transactions on Audio, Speech, and Language Processing, 20(6): 19 | 1759–1770, 2012. 20 | 21 | Arturo Camacho and John G. Harris. 22 | A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America, 124(3):1638–1652, 2008. 23 | 24 | Meinard Müller. Fundamentals of Music Processing – Using Python and Jupyter Notebooks. Springer Verlag, 2nd edition, 2021. ISBN 978-3-030-69807-2. doi: 10.1007/978-3-030-69808-9. 25 | 26 | ## Documentation 27 | There is also an API documentation for libf0: 28 | 29 | https://groupmm.github.io/libf0 30 | 31 | ## Contributing 32 | 33 | We are happy for suggestions and contributions. 
We would be grateful for either directly contacting us via email (meinard.mueller@audiolabs-erlangen.de) or for creating an issue in our Github repository. Please do not submit a pull request without prior consultation with us. 34 | 35 | ## Licence 36 | 37 | The code for this toolbox is published under an MIT licence. 38 | 39 | ## Acknowledgements 40 | 41 | This work was supported by the German Research Foundation (MU 2686/13-1, SCHE 280/20-1). We thank Edgar Suárez and Vojtěch Pešek for helping with the implementations. Furthermore, we thank Fatemeh Eftekhar and Maryam Pirmoradi for testing the toolbox. The International Audio Laboratories Erlangen are a joint institution of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer Institute for Integrated Circuits IIS. 42 | -------------------------------------------------------------------------------- /pitch/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/pitch/core/__init__.py -------------------------------------------------------------------------------- /pitch/core/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | | Description: libf0 utility functions 3 | | Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller 4 | | License: The MIT license, https://opensource.org/licenses/MIT 5 | | This file is part of libf0. 6 | """ 7 | import numpy as np 8 | 9 | 10 | def sonify_trajectory_with_sinusoid(f0, t, audio_len, confidence=None, Fs=22050, smooth_len=11): 11 | """ 12 | Sonification of trajectory with sinusoidal. Adapted from FMP notebook: C8/C8S2_FundFreqTracking.ipynb 13 | 14 | Parameters 15 | ---------- 16 | f0 : ndarray 17 | F0-trajectory 18 | t : ndarray 19 | Time axis 20 | audio_len : int 21 | Desired audio length in samples 22 | confidence : None or ndarray 23 | Confidence values for amplitude control 24 | Fs : int 25 | Sampling rate 26 | smooth_len : int 27 | Smoothing filter length to avoid clicks in the sonification 28 | 29 | Returns 30 | ------- 31 | x_soni : ndarray 32 | Sonified F0-trajectory 33 | """ 34 | if confidence is None: 35 | confidence = np.ones_like(f0) 36 | 37 | # initialize 38 | x_soni = np.zeros(audio_len) 39 | amplitude_mod = np.zeros(audio_len) 40 | 41 | # Computation of hop size 42 | sine_len = int(t[1] * Fs) 43 | 44 | t = np.arange(0, sine_len) / Fs 45 | phase = 0 46 | 47 | # loop over all F0 values, ensure continuous phase 48 | for idx in np.arange(0, len(f0)): 49 | cur_f = f0[idx] 50 | cur_amp = confidence[idx] 51 | 52 | if cur_f == 0: 53 | phase = 0 54 | continue 55 | 56 | cur_soni = np.sin(2*np.pi*(cur_f*t+phase)) 57 | diff = np.maximum(0, (idx+1)*sine_len - len(x_soni)) 58 | if diff > 0: 59 | x_soni[idx * sine_len:(idx + 1) * sine_len - diff] = cur_soni[:-diff] 60 | amplitude_mod[idx * sine_len:(idx + 1) * sine_len - diff] = cur_amp 61 | else: 62 | x_soni[idx*sine_len:(idx+1)*sine_len-diff] = cur_soni 63 | amplitude_mod[idx*sine_len:(idx+1)*sine_len-diff] = cur_amp 64 | 65 | phase += cur_f * sine_len / Fs 66 | phase -= 2 * np.round(phase/2) 67 | 68 | # filter amplitudes to avoid transients 69 | amplitude_mod = np.convolve(amplitude_mod, np.hanning(smooth_len)/np.sum(np.hanning(smooth_len)), 'same') 70 | x_soni = x_soni * amplitude_mod 71 | return x_soni 72 | 73 | 74 | def hz_to_cents(F, F_ref=55.0): 75 | """ 76 | Converts frequency in Hz to cents. 
77 | 78 | Parameters 79 | ---------- 80 | F : float or ndarray 81 | Frequency value in Hz 82 | F_ref : float 83 | Reference frequency in Hz (Default value = 55.0) 84 | Returns 85 | ------- 86 | F_cents : float or ndarray 87 | Frequency in cents 88 | """ 89 | 90 | # Avoid division by 0 91 | F_temp = np.array(F).astype(float) 92 | F_temp[F_temp == 0] = np.nan 93 | 94 | F_cents = 1200 * np.log2(F_temp / F_ref) 95 | 96 | return F_cents 97 | 98 | 99 | def cents_to_hz(F_cents, F_ref=55.0): 100 | """ 101 | Converts frequency in cents to Hz. 102 | 103 | Parameters 104 | ---------- 105 | F_cents : float or ndarray 106 | Frequency in cents 107 | F_ref : float 108 | Reference frequency in Hz (Default value = 55.0) 109 | Returns 110 | ------- 111 | F : float or ndarray 112 | Frequency in Hz 113 | """ 114 | F = F_ref * 2 ** (F_cents / 1200) 115 | 116 | # Avoid NaN output 117 | F = np.nan_to_num(F, copy=False, nan=0) 118 | 119 | return F 120 | -------------------------------------------------------------------------------- /pitch/debug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | 5 | def save_csv_pitch(pitch, path): 6 | with open(path, "w", encoding='utf-8') as pitch_file: 7 | for i in range(len(pitch)): 8 | t = i * 10 9 | minute = t // 60000 10 | seconds = (t - minute * 60000) // 1000 11 | millisecond = t % 1000 12 | print( 13 | f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # pit for train 19 | args = parser.parse_args() 20 | print(args.pit) 21 | 22 | pitch = np.load(args.pit) 23 | save_csv_pitch(pitch, 'pitch_debug.csv') 24 | -------------------------------------------------------------------------------- /pitch/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | import crepe 8 | 9 | 10 | def move_average(a, n, mode="same"): 11 | return (np.convolve(a, np.ones((n,))/n, mode=mode)) 12 | 13 | 14 | def compute_f0_mouth(path, device): 15 | # pip install praat-parselmouth 16 | import parselmouth 17 | 18 | x, sr = librosa.load(path, sr=16000) 19 | assert sr == 16000 20 | lpad = 1024 // 160 21 | rpad = lpad 22 | f0 = parselmouth.Sound(x, sr).to_pitch_ac( 23 | time_step=160 / sr, 24 | voicing_threshold=0.5, 25 | pitch_floor=30, 26 | pitch_ceiling=1000).selected_array['frequency'] 27 | f0 = np.pad(f0, [[lpad, rpad]], mode='constant') 28 | return f0 29 | 30 | 31 | def compute_f0_salience(filename, device): 32 | from pitch.core.salience import salience 33 | audio, sr = librosa.load(filename, sr=16000) 34 | assert sr == 16000 35 | f0, t, s = salience( 36 | audio, 37 | Fs=sr, 38 | H=320, 39 | N=2048, 40 | F_min=45.0, 41 | F_max=1760.0) 42 | f0 = np.repeat(f0, 2, -1) # 320 -> 160 * 2 43 | f0 = move_average(f0, 3) 44 | return f0 45 | 46 | 47 | def compute_f0_voice(filename, device): 48 | audio, sr = librosa.load(filename, sr=16000) 49 | assert sr == 16000 50 | audio = torch.tensor(np.copy(audio))[None] 51 | audio = audio + torch.randn_like(audio) * 0.001 52 | # Here we'll use a 10 millisecond hop length 53 | hop_length = 160 54 | fmin = 50 55 | fmax = 1000 56 | model = "full" 57 | batch_size = 512 58 | pitch = 
crepe.predict( 59 | audio, 60 | sr, 61 | hop_length, 62 | fmin, 63 | fmax, 64 | model, 65 | batch_size=batch_size, 66 | device=device, 67 | return_periodicity=False, 68 | ) 69 | pitch = crepe.filter.mean(pitch, 3) 70 | pitch = pitch.squeeze(0) 71 | return pitch 72 | 73 | 74 | def compute_f0_sing(filename, device): 75 | audio, sr = librosa.load(filename, sr=16000) 76 | assert sr == 16000 77 | audio = torch.tensor(np.copy(audio))[None] 78 | audio = audio + torch.randn_like(audio) * 0.001 79 | # Here we'll use a 20 millisecond hop length 80 | hop_length = 320 81 | fmin = 50 82 | fmax = 1000 83 | model = "full" 84 | batch_size = 512 85 | pitch = crepe.predict( 86 | audio, 87 | sr, 88 | hop_length, 89 | fmin, 90 | fmax, 91 | model, 92 | batch_size=batch_size, 93 | device=device, 94 | return_periodicity=False, 95 | ) 96 | pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2 97 | pitch = crepe.filter.mean(pitch, 5) 98 | pitch = pitch.squeeze(0) 99 | return pitch 100 | 101 | 102 | def save_csv_pitch(pitch, path): 103 | with open(path, "w", encoding='utf-8') as pitch_file: 104 | for i in range(len(pitch)): 105 | t = i * 10 106 | minute = t // 60000 107 | seconds = (t - minute * 60000) // 1000 108 | millisecond = t % 1000 109 | print( 110 | f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file) 111 | 112 | 113 | def load_csv_pitch(path): 114 | pitch = [] 115 | with open(path, "r", encoding='utf-8') as pitch_file: 116 | for line in pitch_file.readlines(): 117 | pit = line.strip().split(",")[-1] 118 | pitch.append(int(pit)) 119 | return pitch 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 125 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) # csv for excel 126 | args = parser.parse_args() 127 | print(args.wav) 128 | print(args.pit) 129 | 130 | device = "cuda" if torch.cuda.is_available() else "cpu" 131 | pitch = compute_f0_sing(args.wav, device) 132 | save_csv_pitch(pitch, args.pit) 133 | # tmp = load_csv_pitch(args.pit) 134 | # save_csv_pitch(tmp, "tmp.csv") 135 | -------------------------------------------------------------------------------- /prepare/preprocess_a.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | from scipy.io import wavfile 8 | 9 | 10 | def resample_wave(wav_in, wav_out, sample_rate): 11 | wav, _ = librosa.load(wav_in, sr=sample_rate) 12 | wav = wav / np.abs(wav).max() * 0.6 13 | wav = wav / max(0.01, np.max(np.abs(wav))) * 32767 * 0.6 14 | wavfile.write(wav_out, sample_rate, wav.astype(np.int16)) 15 | 16 | 17 | def process_file(file, wavPath, spks, outPath, sr): 18 | if file.endswith(".wav"): 19 | file = file[:-4] 20 | resample_wave(f"{wavPath}/{spks}/{file}.wav", f"{outPath}/{spks}/{file}.wav", sr) 21 | 22 | 23 | def process_files_with_thread_pool(wavPath, spks, outPath, sr, thread_num=None): 24 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 25 | 26 | with ThreadPoolExecutor(max_workers=thread_num) as executor: 27 | futures = {executor.submit(process_file, file, wavPath, spks, outPath, sr): file for file in files} 28 | 29 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing {sr} {spks}'): 30 | future.result() 31 | 32 | 33 | if __name__ == "__main__": 34 | 
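A worked example of the two scaling steps in `resample_wave()` above: any non-silent input ends up peak-normalised to 60 % of int16 full scale (about 19660), regardless of its original level. The toy waveform below is illustrative only:

```python
import numpy as np

wav = np.array([0.05, -0.9, 0.3], dtype=np.float32)   # toy waveform, peak 0.9

wav = wav / np.abs(wav).max() * 0.6                   # first step: peak -> 0.6
wav = wav / max(0.01, np.max(np.abs(wav))) * 32767 * 0.6  # second step: peak -> 32767 * 0.6

print(np.abs(wav).max())   # ~19660.2
```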
parser = argparse.ArgumentParser() 35 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 36 | parser.add_argument("-o", "--out", help="out", dest="out", required=True) 37 | parser.add_argument("-s", "--sr", help="sample rate", dest="sr", type=int, required=True) 38 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 39 | 40 | args = parser.parse_args() 41 | print(args.wav) 42 | print(args.out) 43 | print(args.sr) 44 | 45 | os.makedirs(args.out, exist_ok=True) 46 | wavPath = args.wav 47 | outPath = args.out 48 | 49 | assert args.sr == 16000 or args.sr == 32000 50 | 51 | for spks in os.listdir(wavPath): 52 | if os.path.isdir(f"./{wavPath}/{spks}"): 53 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 54 | if args.thread_count == 0: 55 | process_num = os.cpu_count() // 2 + 1 56 | else: 57 | process_num = args.thread_count 58 | process_files_with_thread_pool(wavPath, spks, outPath, args.sr, process_num) 59 | -------------------------------------------------------------------------------- /prepare/preprocess_cdc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torchaudio 5 | 6 | from tqdm import tqdm 7 | from scipy.io.wavfile import read 8 | from scipy.io.wavfile import write 9 | # torch=1.9.0 -> pip install torchaudio==0.9.0 -i https://mirrors.aliyun.com/pypi/simple/ 10 | # this file is for VCTK 11 | 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def cut_direct_content(iWave, oWave): 17 | source, sr = torchaudio.load(iWave) 18 | stft = torch.stft(source, 1024, 256, 1024, torch.hann_window(1024), return_complex=True) 19 | stft[:, 0, :] = 0 20 | stft[:, 1, :] = 0 21 | istft = torch.istft(stft, 1024, 256, 1024, torch.hann_window(1024)) 22 | audio = istft.squeeze() 23 | audio = MAX_WAV_VALUE * audio 24 | audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1) 25 | audio = audio.short() 26 | audio = audio.data.cpu().detach().numpy() 27 | write(oWave, sr, audio) 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-i", help="input path", dest="inPath", required=True) 33 | parser.add_argument("-o", help="output path", dest="outPath", required=True) 34 | 35 | args = parser.parse_args() 36 | print(args.inPath) 37 | print(args.outPath) 38 | 39 | os.makedirs(args.outPath, exist_ok=True) 40 | rootPath = args.inPath 41 | outPath = args.outPath 42 | 43 | for spks in os.listdir(rootPath): 44 | if (os.path.isdir(f"./{rootPath}/{spks}")): 45 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 46 | 47 | files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] 48 | for file in tqdm(files, desc=f'Processing cdc {spks}'): 49 | iWave = f"./{rootPath}/{spks}/{file}" 50 | oWave = f"./{outPath}/{spks}/{file}" 51 | cut_direct_content(iWave, oWave) 52 | -------------------------------------------------------------------------------- /prepare/preprocess_crepe.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import librosa 5 | import torch 6 | import crepe 7 | import argparse 8 | from tqdm import tqdm 9 | 10 | 11 | def compute_f0(filename, save, device): 12 | audio, sr = librosa.load(filename, sr=16000) 13 | assert sr == 16000 14 | # Load audio 15 | audio = 
torch.tensor(np.copy(audio))[None] 16 | audio = audio + torch.randn_like(audio) * 0.001 17 | # Here we'll use a 10 millisecond hop length 18 | hop_length = 160 19 | # Provide a sensible frequency range for your domain (upper limit is 2006 Hz) 20 | # This would be a reasonable range for speech 21 | fmin = 50 22 | fmax = 1000 23 | # Select a model capacity--one of "tiny" or "full" 24 | model = "full" 25 | # Pick a batch size that doesn't cause memory errors on your gpu 26 | batch_size = 512 27 | # Compute pitch using first gpu 28 | pitch, periodicity = crepe.predict( 29 | audio, 30 | sr, 31 | hop_length, 32 | fmin, 33 | fmax, 34 | model, 35 | batch_size=batch_size, 36 | device=device, 37 | return_periodicity=True, 38 | ) 39 | # CREPE was not trained on silent audio. some error on silent need filter.pitPath 40 | periodicity = crepe.filter.median(periodicity, 7) 41 | pitch = crepe.filter.mean(pitch, 5) 42 | pitch[periodicity < 0.5] = 0 43 | pitch = pitch.squeeze(0) 44 | np.save(save, pitch, allow_pickle=False) 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 50 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 51 | 52 | args = parser.parse_args() 53 | print(args.wav) 54 | print(args.pit) 55 | 56 | os.makedirs(args.pit, exist_ok=True) 57 | wavPath = args.wav 58 | pitPath = args.pit 59 | 60 | device = "cuda" if torch.cuda.is_available() else "cpu" 61 | 62 | for spks in os.listdir(wavPath): 63 | if os.path.isdir(f"./{wavPath}/{spks}"): 64 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 65 | 66 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 67 | for file in tqdm(files, desc=f'Processing crepe {spks}'): 68 | file = file[:-4] 69 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit", device) 70 | -------------------------------------------------------------------------------- /prepare/preprocess_f0.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import pyworld 5 | import argparse 6 | from tqdm import tqdm 7 | from concurrent.futures import ProcessPoolExecutor, as_completed 8 | 9 | 10 | def compute_f0(path, save): 11 | x, sr = librosa.load(path, sr=16000) 12 | assert sr == 16000 13 | f0, t = pyworld.dio( 14 | x.astype(np.double), 15 | fs=sr, 16 | f0_ceil=900, 17 | frame_period=1000 * 160 / sr, 18 | ) 19 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs=16000) 20 | for index, pitch in enumerate(f0): 21 | f0[index] = round(pitch, 1) 22 | np.save(save, f0, allow_pickle=False) 23 | 24 | 25 | def process_file(file, wavPath, spks, pitPath): 26 | if file.endswith(".wav"): 27 | file = file[:-4] 28 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") 29 | 30 | 31 | def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): 32 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 33 | 34 | with ProcessPoolExecutor(max_workers=process_num) as executor: 35 | futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} 36 | 37 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): 38 | future.result() 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 44 | 
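Both F0 extractors above (crepe in `preprocess_crepe.py`, pyworld in `preprocess_f0.py`) are configured for the same frame grid: one pitch value every 160 samples of 16 kHz audio, i.e. a 10 ms hop. A quick check of the arithmetic behind `frame_period=1000 * 160 / sr`:

```python
SR = 16000
HOP = 160

frame_period_ms = 1000 * HOP / SR   # value passed to pyworld.dio -> 10.0 ms
frames_per_second = SR / HOP        # 100 F0 values per second of audio

print(frame_period_ms, frames_per_second)  # 10.0 100.0
```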
parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 45 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 46 | 47 | args = parser.parse_args() 48 | print(args.wav) 49 | print(args.pit) 50 | 51 | os.makedirs(args.pit, exist_ok=True) 52 | wavPath = args.wav 53 | pitPath = args.pit 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_process_pool(wavPath, spks, pitPath, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_f0_mouth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import argparse 5 | import parselmouth 6 | # pip install praat-parselmouth 7 | from tqdm import tqdm 8 | from concurrent.futures import ProcessPoolExecutor, as_completed 9 | 10 | 11 | def compute_f0(path, save): 12 | x, sr = librosa.load(path, sr=16000) 13 | assert sr == 16000 14 | lpad = 1024 // 160 15 | rpad = lpad 16 | f0 = parselmouth.Sound(x, sr).to_pitch_ac( 17 | time_step=160 / sr, 18 | voicing_threshold=0.5, 19 | pitch_floor=30, 20 | pitch_ceiling=1000).selected_array['frequency'] 21 | f0 = np.pad(f0, [[lpad, rpad]], mode='constant') 22 | np.save(save, f0, allow_pickle=False) 23 | 24 | 25 | def process_file(file, wavPath, spks, pitPath): 26 | if file.endswith(".wav"): 27 | file = file[:-4] 28 | compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit") 29 | 30 | 31 | def process_files_with_process_pool(wavPath, spks, pitPath, process_num=None): 32 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 33 | 34 | with ProcessPoolExecutor(max_workers=process_num) as executor: 35 | futures = {executor.submit(process_file, file, wavPath, spks, pitPath): file for file in files} 36 | 37 | for future in tqdm(as_completed(futures), total=len(futures), desc=f'Processing f0 {spks}'): 38 | future.result() 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 44 | parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True) 45 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 46 | 47 | args = parser.parse_args() 48 | print(args.wav) 49 | print(args.pit) 50 | 51 | os.makedirs(args.pit, exist_ok=True) 52 | wavPath = args.wav 53 | pitPath = args.pit 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{pitPath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_process_pool(wavPath, spks, pitPath, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_hubert.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import librosa 7 | 8 | 
from tqdm import tqdm 9 | from hubert import hubert_model 10 | 11 | 12 | def load_audio(file: str, sr: int = 16000): 13 | x, sr = librosa.load(file, sr=sr) 14 | return x 15 | 16 | 17 | def load_model(path, device): 18 | model = hubert_model.hubert_soft(path) 19 | model.eval() 20 | model.half() 21 | model.to(device) 22 | return model 23 | 24 | 25 | def pred_vec(model, wavPath, vecPath, device): 26 | feats = load_audio(wavPath) 27 | feats = torch.from_numpy(feats).to(device) 28 | feats = feats[None, None, :].half() 29 | with torch.no_grad(): 30 | vec = model.units(feats).squeeze().data.cpu().float().numpy() 31 | # print(vec.shape) # [length, dim=256] hop=320 32 | np.save(vecPath, vec, allow_pickle=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 38 | parser.add_argument("-v", "--vec", help="vec", dest="vec", required=True) 39 | 40 | args = parser.parse_args() 41 | print(args.wav) 42 | print(args.vec) 43 | os.makedirs(args.vec, exist_ok=True) 44 | 45 | wavPath = args.wav 46 | vecPath = args.vec 47 | 48 | device = "cuda" if torch.cuda.is_available() else "cpu" 49 | hubert = load_model(os.path.join("hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device) 50 | 51 | for spks in os.listdir(wavPath): 52 | if os.path.isdir(f"./{wavPath}/{spks}"): 53 | os.makedirs(f"./{vecPath}/{spks}", exist_ok=True) 54 | 55 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 56 | for file in tqdm(files, desc=f'Processing vec {spks}'): 57 | file = file[:-4] 58 | pred_vec(hubert, f"{wavPath}/{spks}/{file}.wav", f"{vecPath}/{spks}/{file}.vec", device) 59 | -------------------------------------------------------------------------------- /prepare/preprocess_ppg.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | import random 7 | from tqdm import tqdm 8 | from whisper.model import Whisper, ModelDimensions 9 | from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram 10 | 11 | 12 | def load_model(path) -> Whisper: 13 | device = "cuda" if torch.cuda.is_available() else "cpu" 14 | checkpoint = torch.load(path, map_location="cpu") 15 | dims = ModelDimensions(**checkpoint["dims"]) 16 | print(dims) 17 | model = Whisper(dims) 18 | del model.decoder 19 | cut = len(model.encoder.blocks) // 4 20 | cut = -1 * cut 21 | del model.encoder.blocks[cut:] 22 | model.load_state_dict(checkpoint["model_state_dict"], strict=False) 23 | model.eval() 24 | model.half() 25 | model.to(device) 26 | return model 27 | 28 | 29 | def pred_ppg(whisper: Whisper, wavPath, ppgPath): 30 | audio = load_audio(wavPath) 31 | audln = audio.shape[0] 32 | ppgln = audln // 320 33 | audio = pad_or_trim(audio) 34 | mel = log_mel_spectrogram(audio).half().to(whisper.device) 35 | with torch.no_grad(): 36 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 37 | ppg = ppg[:ppgln,] # [length, dim=1280] 38 | # print(ppg.shape) 39 | np.save(ppgPath, ppg, allow_pickle=False) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 45 | parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) 46 | args = parser.parse_args() 47 | print(args.wav) 48 | print(args.ppg) 49 | 50 | 
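Two details of the whisper feature extraction above are worth spelling out: `load_model()` drops the decoder and the top quarter of the encoder blocks before inference, and `pred_ppg()` pads or trims the input to whisper's fixed analysis window (30 seconds by default, stated here as an assumption) but then keeps only one 1280-dim frame per 320 samples of the original clip. A minimal sketch of the length bookkeeping:

```python
SR = 16000
PPG_HOP = 320   # samples of original audio per saved PPG frame


def expected_ppg_frames(clip_seconds: float) -> int:
    """How many frames pred_ppg() keeps for a clip of the given duration."""
    audln = int(clip_seconds * SR)
    return audln // PPG_HOP


print(expected_ppg_frames(7.3))   # 365 frames for a 7.3-second clip
```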
os.makedirs(args.ppg, exist_ok=True) 51 | wavPath = args.wav 52 | ppgPath = args.ppg 53 | 54 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt")) 55 | spkPaths = os.listdir(wavPath) 56 | random.shuffle(spkPaths) 57 | 58 | for spks in spkPaths: 59 | if os.path.isdir(f"./{wavPath}/{spks}"): 60 | os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True) 61 | 62 | files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] 63 | for file in tqdm(files, desc=f'Processing ppg {spks}'): 64 | if file.endswith(".wav"): 65 | # print(file) 66 | file = file[:-4] 67 | path_wav = f"{wavPath}/{spks}/{file}.wav" 68 | path_ppg = f"{ppgPath}/{spks}/{file}.ppg" 69 | if os.path.isfile(f"{path_ppg}.npy"): 70 | continue 71 | pred_ppg(whisper, path_wav, path_ppg) 72 | -------------------------------------------------------------------------------- /prepare/preprocess_random.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | if __name__ == "__main__": 5 | all_items = [] 6 | fo = open("./files/train_all.txt", "r+", encoding='utf-8') 7 | while (True): 8 | try: 9 | item = fo.readline().strip() 10 | except Exception as e: 11 | print('nothing of except:', e) 12 | break 13 | if (item == None or item == ""): 14 | break 15 | all_items.append(item) 16 | fo.close() 17 | 18 | random.shuffle(all_items) 19 | 20 | fw = open("./files/train_all.txt", "w", encoding="utf-8") 21 | for strs in all_items: 22 | print(strs, file=fw) 23 | fw.close() 24 | -------------------------------------------------------------------------------- /prepare/preprocess_speaker.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import numpy as np 5 | import argparse 6 | 7 | from tqdm import tqdm 8 | from functools import partial 9 | from argparse import RawTextHelpFormatter 10 | from multiprocessing.pool import ThreadPool 11 | 12 | from speaker.models.lstm import LSTMSpeakerEncoder 13 | from speaker.config import SpeakerEncoderConfig 14 | from speaker.utils.audio import AudioProcessor 15 | from speaker.infer import read_json 16 | 17 | 18 | def get_spk_wavs(dataset_path, output_path): 19 | wav_files = [] 20 | os.makedirs(f"./{output_path}", exist_ok=True) 21 | for spks in os.listdir(dataset_path): 22 | if os.path.isdir(f"./{dataset_path}/{spks}"): 23 | os.makedirs(f"./{output_path}/{spks}", exist_ok=True) 24 | for file in os.listdir(f"./{dataset_path}/{spks}"): 25 | if file.endswith(".wav"): 26 | wav_files.append(f"./{dataset_path}/{spks}/{file}") 27 | elif spks.endswith(".wav"): 28 | wav_files.append(f"./{dataset_path}/{spks}") 29 | return wav_files 30 | 31 | 32 | def process_wav(wav_file, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder): 33 | waveform = speaker_encoder_ap.load_wav( 34 | wav_file, sr=speaker_encoder_ap.sample_rate 35 | ) 36 | spec = speaker_encoder_ap.melspectrogram(waveform) 37 | spec = torch.from_numpy(spec.T) 38 | if args.use_cuda: 39 | spec = spec.cuda() 40 | spec = spec.unsqueeze(0) 41 | embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() 42 | embed = embed.squeeze() 43 | embed_path = wav_file.replace(dataset_path, output_path) 44 | embed_path = embed_path.replace(".wav", ".spk") 45 | np.save(embed_path, embed, allow_pickle=False) 46 | 47 | 48 | def extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, 
speaker_encoder, concurrency): 49 | bound_process_wav = partial(process_wav, dataset_path=dataset_path, output_path=output_path, args=args, speaker_encoder_ap=speaker_encoder_ap, speaker_encoder=speaker_encoder) 50 | 51 | with ThreadPool(concurrency) as pool: 52 | list(tqdm(pool.imap(bound_process_wav, wav_files), total=len(wav_files))) 53 | 54 | 55 | if __name__ == "__main__": 56 | 57 | parser = argparse.ArgumentParser( 58 | description="""Compute embedding vectors for each wav file in a dataset.""", 59 | formatter_class=RawTextHelpFormatter, 60 | ) 61 | parser.add_argument("dataset_path", type=str, help="Path to dataset waves.") 62 | parser.add_argument( 63 | "output_path", type=str, help="path for output speaker/speaker_wavs.npy." 64 | ) 65 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) 66 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 67 | args = parser.parse_args() 68 | dataset_path = args.dataset_path 69 | output_path = args.output_path 70 | thread_count = args.thread_count 71 | # model 72 | args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar") 73 | args.config_path = os.path.join("speaker_pretrain", "config.json") 74 | # config 75 | config_dict = read_json(args.config_path) 76 | 77 | # model 78 | config = SpeakerEncoderConfig(config_dict) 79 | config.from_dict(config_dict) 80 | 81 | speaker_encoder = LSTMSpeakerEncoder( 82 | config.model_params["input_dim"], 83 | config.model_params["proj_dim"], 84 | config.model_params["lstm_dim"], 85 | config.model_params["num_lstm_layers"], 86 | ) 87 | 88 | speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) 89 | 90 | # preprocess 91 | speaker_encoder_ap = AudioProcessor(**config.audio) 92 | # normalize the input audio level and trim silences 93 | speaker_encoder_ap.do_sound_norm = True 94 | speaker_encoder_ap.do_trim_silence = True 95 | 96 | wav_files = get_spk_wavs(dataset_path, output_path) 97 | 98 | if thread_count == 0: 99 | process_num = os.cpu_count() 100 | else: 101 | process_num = thread_count 102 | 103 | extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, process_num) -------------------------------------------------------------------------------- /prepare/preprocess_speaker_ave.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("dataset_speaker", type=str) 11 | parser.add_argument("dataset_singer", type=str) 12 | 13 | data_speaker = parser.parse_args().dataset_speaker 14 | data_singer = parser.parse_args().dataset_singer 15 | 16 | os.makedirs(data_singer, exist_ok=True) 17 | 18 | for speaker in os.listdir(data_speaker): 19 | subfile_num = 0 20 | speaker_ave = 0 21 | 22 | for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"average {speaker}"): 23 | if not file.endswith(".npy"): 24 | continue 25 | source_embed = np.load(os.path.join(data_speaker, speaker, file)) 26 | source_embed = source_embed.astype(np.float32) 27 | speaker_ave = speaker_ave + source_embed 28 | subfile_num = subfile_num + 1 29 | if subfile_num == 0: 30 | continue 31 | speaker_ave = speaker_ave / subfile_num 32 | 33 | np.save(os.path.join(data_singer, f"{speaker}.spk.npy"), 
34 | speaker_ave, allow_pickle=False) 35 | 36 | # rewrite timbre code by average, if similarity is larger than cmp_val 37 | rewrite_timbre_code = False 38 | if not rewrite_timbre_code: 39 | continue 40 | cmp_src = torch.FloatTensor(speaker_ave) 41 | cmp_num = 0 42 | cmp_val = 0.85 43 | for file in tqdm(os.listdir(os.path.join(data_speaker, speaker)), desc=f"rewrite {speaker}"): 44 | if not file.endswith(".npy"): 45 | continue 46 | cmp_tmp = np.load(os.path.join(data_speaker, speaker, file)) 47 | cmp_tmp = cmp_tmp.astype(np.float32) 48 | cmp_tmp = torch.FloatTensor(cmp_tmp) 49 | cmp_cos = torch.cosine_similarity(cmp_src, cmp_tmp, dim=0) 50 | if (cmp_cos > cmp_val): 51 | cmp_num += 1 52 | np.save(os.path.join(data_speaker, speaker, file), 53 | speaker_ave, allow_pickle=False) 54 | print(f"rewrite timbre for {speaker} with :", cmp_num) 55 | -------------------------------------------------------------------------------- /prepare/preprocess_spec.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import torch 4 | import argparse 5 | import multiprocessing 6 | from concurrent.futures import ThreadPoolExecutor 7 | from tqdm import tqdm 8 | from vits import spectrogram 9 | from vits import utils 10 | from omegaconf import OmegaConf 11 | 12 | 13 | def compute_spec(hps, filename, specname): 14 | audio, sampling_rate = utils.load_wav_to_torch(filename) 15 | assert sampling_rate == hps.sampling_rate, f"{sampling_rate} is not {hps.sampling_rate}" 16 | audio_norm = audio / hps.max_wav_value 17 | audio_norm = audio_norm.unsqueeze(0) 18 | n_fft = hps.filter_length 19 | sampling_rate = hps.sampling_rate 20 | hop_size = hps.hop_length 21 | win_size = hps.win_length 22 | spec = spectrogram.spectrogram_torch( 23 | audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False) 24 | spec = torch.squeeze(spec, 0) 25 | torch.save(spec, specname) 26 | 27 | 28 | def process_file(file): 29 | if file.endswith(".wav"): 30 | file = file[:-4] 31 | compute_spec(hps.data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt") 32 | 33 | 34 | def process_files_with_thread_pool(wavPath, spks, thread_num): 35 | files = os.listdir(f"./{wavPath}/{spks}") 36 | with ThreadPoolExecutor(max_workers=thread_num) as executor: 37 | list(tqdm(executor.map(process_file, files), total=len(files), desc=f'Processing spec {spks}')) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 43 | parser.add_argument("-s", "--spe", help="spe", dest="spe", required=True) 44 | parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) 45 | 46 | args = parser.parse_args() 47 | print(args.wav) 48 | print(args.spe) 49 | 50 | os.makedirs(args.spe, exist_ok=True) 51 | wavPath = args.wav 52 | spePath = args.spe 53 | hps = OmegaConf.load("./configs/base.yaml") 54 | 55 | for spks in os.listdir(wavPath): 56 | if os.path.isdir(f"./{wavPath}/{spks}"): 57 | os.makedirs(f"./{spePath}/{spks}", exist_ok=True) 58 | if args.thread_count == 0: 59 | process_num = os.cpu_count() // 2 + 1 60 | else: 61 | process_num = args.thread_count 62 | process_files_with_thread_pool(wavPath, spks, process_num) 63 | -------------------------------------------------------------------------------- /prepare/preprocess_train.py: 
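The script that follows assembles the training filelists: each record it writes is a single pipe-delimited line pointing at the per-utterance artifacts produced by the preprocessing steps above, in the order wave, spec, pitch, hubert, whisper, speaker embedding. A sketch of one such record and how it can be split back apart; the speaker and file names are illustrative placeholders:

```python
# One line of files/train.txt as emitted by preprocess_train.py below:
record = (
    "./data_svc/waves-32k/speaker0/000001.wav"
    "|./data_svc/specs/speaker0/000001.pt"
    "|./data_svc/pitch/speaker0/000001.pit.npy"
    "|./data_svc/hubert/speaker0/000001.vec.npy"
    "|./data_svc/whisper/speaker0/000001.ppg.npy"
    "|./data_svc/speaker/speaker0/000001.spk.npy"
)

wave, spec, pitch, hubert, whisper, spk = record.split("|")
print(wave, spk)
```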
-------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | 5 | def print_error(info): 6 | print(f"\033[31m File isn't existed: {info}\033[0m") 7 | 8 | 9 | IndexBySinger = False 10 | if __name__ == "__main__": 11 | os.makedirs("./files/", exist_ok=True) 12 | 13 | rootPath = "./data_svc/waves-32k/" 14 | all_items = [] 15 | for spks in os.listdir(f"./{rootPath}"): 16 | if not os.path.isdir(f"./{rootPath}/{spks}"): 17 | continue 18 | print(f"./{rootPath}/{spks}") 19 | for file in os.listdir(f"./{rootPath}/{spks}"): 20 | if file.endswith(".wav"): 21 | file = file[:-4] 22 | 23 | if (IndexBySinger == False): 24 | path_spk = f"./data_svc/speaker/{spks}/{file}.spk.npy" 25 | else: 26 | path_spk = f"./data_svc/singer/{spks}.spk.npy" 27 | 28 | path_wave = f"./data_svc/waves-32k/{spks}/{file}.wav" 29 | path_spec = f"./data_svc/specs/{spks}/{file}.pt" 30 | path_pitch = f"./data_svc/pitch/{spks}/{file}.pit.npy" 31 | path_hubert = f"./data_svc/hubert/{spks}/{file}.vec.npy" 32 | path_whisper = f"./data_svc/whisper/{spks}/{file}.ppg.npy" 33 | has_error = 0 34 | if not os.path.isfile(path_spk): 35 | print_error(path_spk) 36 | has_error = 1 37 | if not os.path.isfile(path_wave): 38 | print_error(path_wave) 39 | has_error = 1 40 | if not os.path.isfile(path_spec): 41 | print_error(path_spec) 42 | has_error = 1 43 | if not os.path.isfile(path_pitch): 44 | print_error(path_pitch) 45 | has_error = 1 46 | if not os.path.isfile(path_hubert): 47 | print_error(path_hubert) 48 | has_error = 1 49 | if not os.path.isfile(path_whisper): 50 | print_error(path_whisper) 51 | has_error = 1 52 | if has_error == 0: 53 | all_items.append( 54 | f"{path_wave}|{path_spec}|{path_pitch}|{path_hubert}|{path_whisper}|{path_spk}") 55 | 56 | random.shuffle(all_items) 57 | valids = all_items[:10] 58 | valids.sort() 59 | trains = all_items[10:] 60 | # trains.sort() 61 | fw = open("./files/valid.txt", "w", encoding="utf-8") 62 | for strs in valids: 63 | print(strs, file=fw) 64 | fw.close() 65 | fw = open("./files/train.txt", "w", encoding="utf-8") 66 | for strs in trains: 67 | print(strs, file=fw) 68 | fw.close() 69 | -------------------------------------------------------------------------------- /prepare/preprocess_trim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | from tqdm import tqdm 5 | from pydub import AudioSegment 6 | from pydub.silence import split_on_silence 7 | from pydub import effects 8 | # this file is for VCTK, use after CDC 9 | 10 | 11 | def trim_silence(iWave, oWave): 12 | try: 13 | audio = AudioSegment.from_wav(iWave) 14 | # audio = effects.normalize(audio, 6)# max - 6dB 15 | audio_chunks = split_on_silence( 16 | audio, 17 | min_silence_len=200, 18 | silence_thresh=-45, 19 | keep_silence=200, 20 | ) 21 | for chunk in audio_chunks[1:]: 22 | audio_chunks[0] += chunk 23 | audio_chunks[0].export(oWave, format="wav") 24 | except Exception as e: 25 | print(str(e)) 26 | print(iWave) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("-i", help="input path", dest="inPath", required=True) 32 | parser.add_argument("-o", help="output path", dest="outPath", required=True) 33 | 34 | args = parser.parse_args() 35 | print(args.inPath) 36 | print(args.outPath) 37 | 38 | os.makedirs(args.outPath, exist_ok=True) 39 | rootPath = args.inPath 40 | outPath = args.outPath 41 | 42 | for spks in os.listdir(rootPath): 43 | if 
(os.path.isdir(f"./{rootPath}/{spks}")): 44 | os.makedirs(f"./{outPath}/{spks}", exist_ok=True) 45 | 46 | files = [f for f in os.listdir(f"./{rootPath}/{spks}") if f.endswith(".wav")] 47 | for file in tqdm(files, desc=f'Processing sil {spks}'): 48 | iWave = f"./{rootPath}/{spks}/{file}" 49 | oWave = f"./{outPath}/{spks}/{file}" 50 | trim_silence(iWave, oWave) 51 | -------------------------------------------------------------------------------- /prepare/preprocess_zzz.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | from tqdm import tqdm 4 | from torch.utils.data import DataLoader 5 | from omegaconf import OmegaConf 6 | from vits.data_utils import TextAudioSpeakerSet 7 | from vits.data_utils import TextAudioSpeakerCollate 8 | from vits.data_utils import DistributedBucketSampler 9 | 10 | 11 | hps = OmegaConf.load("./configs/base.yaml") 12 | dataset = TextAudioSpeakerSet("files/valid.txt", hps.data) 13 | 14 | for _ in tqdm(dataset): 15 | pass 16 | 17 | 18 | sampler = DistributedBucketSampler( 19 | dataset, 20 | 4, 21 | [150, 300, 450], 22 | num_replicas=1, 23 | rank=0, 24 | shuffle=True) 25 | collate_fn = TextAudioSpeakerCollate() 26 | loader = DataLoader(dataset, num_workers=0, shuffle=False, pin_memory=True, 27 | collate_fn=collate_fn, batch_sampler=sampler) 28 | 29 | 30 | for _ in tqdm(loader): 31 | pass 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fsspec 2 | pyworld 3 | matplotlib 4 | soundfile 5 | scikit-learn 6 | scipy 7 | tensorboard 8 | transformers 9 | tqdm 10 | librosa 11 | omegaconf 12 | gradio==3.36.1 13 | ruamel.yaml 14 | resampy 15 | numpy==1.24 16 | chardet 17 | faiss-cpu==1.7.4 18 | -------------------------------------------------------------------------------- /speaker/README.md: -------------------------------------------------------------------------------- 1 | ### Speaker Encoder 2 | 3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. 4 | 5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. 6 | 7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 8 | 9 | ![](umap.png) 10 | 11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. 12 | 13 | To run the code, you need to follow the same flow as in TTS. 14 | 15 | - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. 16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` 17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. 
18 | - Watch training on Tensorboard as in TTS 19 | -------------------------------------------------------------------------------- /speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/__init__.py -------------------------------------------------------------------------------- /speaker/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Dict, List 3 | 4 | from .utils.coqpit import MISSING 5 | from .utils.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig 6 | 7 | 8 | @dataclass 9 | class SpeakerEncoderConfig(BaseTrainingConfig): 10 | """Defines parameters for Speaker Encoder model.""" 11 | 12 | model: str = "speaker_encoder" 13 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) 14 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) 15 | # model params 16 | model_params: Dict = field( 17 | default_factory=lambda: { 18 | "model_name": "lstm", 19 | "input_dim": 80, 20 | "proj_dim": 256, 21 | "lstm_dim": 768, 22 | "num_lstm_layers": 3, 23 | "use_lstm_with_projection": True, 24 | } 25 | ) 26 | 27 | audio_augmentation: Dict = field(default_factory=lambda: {}) 28 | 29 | storage: Dict = field( 30 | default_factory=lambda: { 31 | "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage 32 | "storage_size": 15, # the size of the in-memory storage with respect to a single batch 33 | } 34 | ) 35 | 36 | # training params 37 | max_train_step: int = 1000000 # end training when number of training steps reaches this value. 38 | loss: str = "angleproto" 39 | grad_clip: float = 3.0 40 | lr: float = 0.0001 41 | lr_decay: bool = False 42 | warmup_steps: int = 4000 43 | wd: float = 1e-6 44 | 45 | # logging params 46 | tb_model_param_stats: bool = False 47 | steps_plot_stats: int = 10 48 | checkpoint: bool = True 49 | save_step: int = 1000 50 | print_step: int = 20 51 | 52 | # data loader 53 | num_speakers_in_batch: int = MISSING 54 | num_utters_per_speaker: int = MISSING 55 | num_loader_workers: int = MISSING 56 | skip_speakers: bool = False 57 | voice_len: float = 1.6 58 | 59 | def check_values(self): 60 | super().check_values() 61 | c = asdict(self) 62 | assert ( 63 | c["model_params"]["input_dim"] == self.audio.num_mels 64 | ), " [!] model input dimendion must be equal to melspectrogram dimension." 65 | -------------------------------------------------------------------------------- /speaker/infer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import fsspec 4 | import torch 5 | import numpy as np 6 | import argparse 7 | 8 | from argparse import RawTextHelpFormatter 9 | from .models.lstm import LSTMSpeakerEncoder 10 | from .config import SpeakerEncoderConfig 11 | from .utils.audio import AudioProcessor 12 | 13 | 14 | def read_json(json_path): 15 | config_dict = {} 16 | try: 17 | with fsspec.open(json_path, "r", encoding="utf-8") as f: 18 | data = json.load(f) 19 | except json.decoder.JSONDecodeError: 20 | # backwards compat. 
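With the default `model_params` shown above, the speaker encoder maps an 80-bin mel spectrogram to a 256-dimensional embedding through a 3-layer, 768-unit LSTM stack, which is the vector each `.spk.npy` file stores. A minimal sketch using the `LSTMSpeakerEncoder` defined below in `models/lstm.py`; the input shape is a toy value, not taken from the repository:

```python
import torch

from speaker.models.lstm import LSTMSpeakerEncoder

# Encoder built with the default model_params from SpeakerEncoderConfig above.
encoder = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)

mel = torch.randn(1, 250, 80)            # [batch, frames, mel_bins], toy input
embed = encoder.compute_embedding(mel)   # mean over num_eval=10 crops
print(embed.shape)                       # torch.Size([1, 256])
```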
21 | data = read_json_with_comments(json_path) 22 | config_dict.update(data) 23 | return config_dict 24 | 25 | 26 | def read_json_with_comments(json_path): 27 | """for backward compat.""" 28 | # fallback to json 29 | with fsspec.open(json_path, "r", encoding="utf-8") as f: 30 | input_str = f.read() 31 | # handle comments 32 | input_str = re.sub(r"\\\n", "", input_str) 33 | input_str = re.sub(r"//.*\n", "\n", input_str) 34 | data = json.loads(input_str) 35 | return data 36 | 37 | 38 | if __name__ == "__main__": 39 | 40 | parser = argparse.ArgumentParser( 41 | description="""Compute embedding vectors for each wav file in a dataset.""", 42 | formatter_class=RawTextHelpFormatter, 43 | ) 44 | parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") 45 | parser.add_argument( 46 | "config_path", 47 | type=str, 48 | help="Path to model config file.", 49 | ) 50 | 51 | parser.add_argument("-s", "--source", help="input wave", dest="source") 52 | parser.add_argument( 53 | "-t", "--target", help="output 256d speaker embeddimg", dest="target" 54 | ) 55 | 56 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) 57 | parser.add_argument("--eval", type=bool, help="compute eval.", default=True) 58 | 59 | args = parser.parse_args() 60 | source_file = args.source 61 | target_file = args.target 62 | 63 | # config 64 | config_dict = read_json(args.config_path) 65 | # print(config_dict) 66 | 67 | # model 68 | config = SpeakerEncoderConfig(config_dict) 69 | config.from_dict(config_dict) 70 | 71 | speaker_encoder = LSTMSpeakerEncoder( 72 | config.model_params["input_dim"], 73 | config.model_params["proj_dim"], 74 | config.model_params["lstm_dim"], 75 | config.model_params["num_lstm_layers"], 76 | ) 77 | 78 | speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) 79 | 80 | # preprocess 81 | speaker_encoder_ap = AudioProcessor(**config.audio) 82 | # normalize the input audio level and trim silences 83 | speaker_encoder_ap.do_sound_norm = True 84 | speaker_encoder_ap.do_trim_silence = True 85 | 86 | # compute speaker embeddings 87 | 88 | # extract the embedding 89 | waveform = speaker_encoder_ap.load_wav( 90 | source_file, sr=speaker_encoder_ap.sample_rate 91 | ) 92 | spec = speaker_encoder_ap.melspectrogram(waveform) 93 | spec = torch.from_numpy(spec.T) 94 | if args.use_cuda: 95 | spec = spec.cuda() 96 | spec = spec.unsqueeze(0) 97 | embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() 98 | embed = embed.squeeze() 99 | # print(embed) 100 | # print(embed.size) 101 | np.save(target_file, embed, allow_pickle=False) 102 | 103 | 104 | if hasattr(speaker_encoder, 'module'): 105 | state_dict = speaker_encoder.module.state_dict() 106 | else: 107 | state_dict = speaker_encoder.state_dict() 108 | torch.save({'model': state_dict}, "model_small.pth") 109 | -------------------------------------------------------------------------------- /speaker/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/models/__init__.py -------------------------------------------------------------------------------- /speaker/models/lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | from ..utils.io import load_fsspec 6 | 7 | 8 | class LSTMWithProjection(nn.Module): 9 | def 
__init__(self, input_size, hidden_size, proj_size): 10 | super().__init__() 11 | self.input_size = input_size 12 | self.hidden_size = hidden_size 13 | self.proj_size = proj_size 14 | self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) 15 | self.linear = nn.Linear(hidden_size, proj_size, bias=False) 16 | 17 | def forward(self, x): 18 | self.lstm.flatten_parameters() 19 | o, (_, _) = self.lstm(x) 20 | return self.linear(o) 21 | 22 | 23 | class LSTMWithoutProjection(nn.Module): 24 | def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): 25 | super().__init__() 26 | self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) 27 | self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) 28 | self.relu = nn.ReLU() 29 | 30 | def forward(self, x): 31 | _, (hidden, _) = self.lstm(x) 32 | return self.relu(self.linear(hidden[-1])) 33 | 34 | 35 | class LSTMSpeakerEncoder(nn.Module): 36 | def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): 37 | super().__init__() 38 | self.use_lstm_with_projection = use_lstm_with_projection 39 | layers = [] 40 | # choise LSTM layer 41 | if use_lstm_with_projection: 42 | layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) 43 | for _ in range(num_lstm_layers - 1): 44 | layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) 45 | self.layers = nn.Sequential(*layers) 46 | else: 47 | self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) 48 | 49 | self._init_layers() 50 | 51 | def _init_layers(self): 52 | for name, param in self.layers.named_parameters(): 53 | if "bias" in name: 54 | nn.init.constant_(param, 0.0) 55 | elif "weight" in name: 56 | nn.init.xavier_normal_(param) 57 | 58 | def forward(self, x): 59 | # TODO: implement state passing for lstms 60 | d = self.layers(x) 61 | if self.use_lstm_with_projection: 62 | d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) 63 | else: 64 | d = torch.nn.functional.normalize(d, p=2, dim=1) 65 | return d 66 | 67 | @torch.no_grad() 68 | def inference(self, x): 69 | d = self.layers.forward(x) 70 | if self.use_lstm_with_projection: 71 | d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) 72 | else: 73 | d = torch.nn.functional.normalize(d, p=2, dim=1) 74 | return d 75 | 76 | def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): 77 | """ 78 | Generate embeddings for a batch of utterances 79 | x: 1xTxD 80 | """ 81 | max_len = x.shape[1] 82 | 83 | if max_len < num_frames: 84 | num_frames = max_len 85 | 86 | offsets = np.linspace(0, max_len - num_frames, num=num_eval) 87 | 88 | frames_batch = [] 89 | for offset in offsets: 90 | offset = int(offset) 91 | end_offset = int(offset + num_frames) 92 | frames = x[:, offset:end_offset] 93 | frames_batch.append(frames) 94 | 95 | frames_batch = torch.cat(frames_batch, dim=0) 96 | embeddings = self.inference(frames_batch) 97 | 98 | if return_mean: 99 | embeddings = torch.mean(embeddings, dim=0, keepdim=True) 100 | 101 | return embeddings 102 | 103 | def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): 104 | """ 105 | Generate embeddings for a batch of utterances 106 | x: BxTxD 107 | """ 108 | num_overlap = num_frames * overlap 109 | max_len = x.shape[1] 110 | embed = None 111 | num_iters = seq_lens / (num_frames - num_overlap) 112 | cur_iter = 0 113 | for offset in range(0, max_len, num_frames - num_overlap): 114 | cur_iter += 1 115 | end_offset = min(x.shape[1], 
offset + num_frames) 116 | frames = x[:, offset:end_offset] 117 | if embed is None: 118 | embed = self.inference(frames) 119 | else: 120 | embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) 121 | return embed / num_iters 122 | 123 | # pylint: disable=unused-argument, redefined-builtin 124 | def load_checkpoint(self, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): 125 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) 126 | self.load_state_dict(state["model"]) 127 | if use_cuda: 128 | self.cuda() 129 | if eval: 130 | self.eval() 131 | assert not self.training 132 | -------------------------------------------------------------------------------- /speaker/umap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/umap.png -------------------------------------------------------------------------------- /speaker/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/speaker/utils/__init__.py -------------------------------------------------------------------------------- /speaker_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | best_model.pth.tar 4 | 5 | config.json 6 | -------------------------------------------------------------------------------- /speaker_pretrain/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "lstm", 3 | "run_name": "mueller91", 4 | "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", 5 | "audio":{ 6 | // Audio processing parameters 7 | "num_mels": 80, // size of the mel spec frame. 8 | "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. 9 | "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. 10 | "win_length": 1024, // stft window length in ms. 11 | "hop_length": 256, // stft window hop-lengh in ms. 12 | "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. 13 | "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. 14 | "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. 15 | "min_level_db": -100, // normalization range 16 | "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 17 | "power": 1.5, // value to sharpen wav signals after GL algorithm. 18 | "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 19 | // Normalization parameters 20 | "signal_norm": true, // normalize the spec values in range [0, 1] 21 | "symmetric_norm": true, // move normalization to range [-1, 1] 22 | "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] 23 | "clip_norm": true, // clip normalized values into the range. 24 | "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! 25 | "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! 26 | "do_trim_silence": true, // enable trimming of slience of audio as you load it. 
LJspeech (false), TWEB (false), Nancy (true) 27 | "trim_db": 60 // threshold for timming silence. Set this according to your dataset. 28 | }, 29 | "reinit_layers": [], 30 | "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) 31 | "grad_clip": 3.0, // upper limit for gradients for clipping. 32 | "epochs": 1000, // total number of epochs to train. 33 | "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. 34 | "lr_decay": false, // if true, Noam learning rate decaying is applied through training. 35 | "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" 36 | "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 37 | "steps_plot_stats": 10, // number of steps to plot embeddings. 38 | "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 39 | "voice_len": 2.0, // size of the voice 40 | "num_utters_per_speaker": 10, // 41 | "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. 42 | "wd": 0.000001, // Weight decay weight. 43 | "checkpoint": true, // If true, it saves checkpoints per "save_step" 44 | "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. 45 | "print_step": 20, // Number of steps to log traning on console. 46 | "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 47 | "model": { 48 | "input_dim": 80, 49 | "proj_dim": 256, 50 | "lstm_dim": 768, 51 | "num_lstm_layers": 3, 52 | "use_lstm_with_projection": true 53 | }, 54 | "storage": { 55 | "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage 56 | "storage_size": 25, // the size of the in-memory storage with respect to a single batch 57 | "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness 58 | }, 59 | "datasets": 60 | [ 61 | { 62 | "name": "vctk_slim", 63 | "path": "../../../audio-datasets/en/VCTK-Corpus/", 64 | "meta_file_train": null, 65 | "meta_file_val": null 66 | }, 67 | { 68 | "name": "libri_tts", 69 | "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", 70 | "meta_file_train": null, 71 | "meta_file_val": null 72 | }, 73 | { 74 | "name": "libri_tts", 75 | "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", 76 | "meta_file_train": null, 77 | "meta_file_val": null 78 | }, 79 | { 80 | "name": "libri_tts", 81 | "path": "../../../audio-datasets/en/LibriTTS/train-other-500", 82 | "meta_file_train": null, 83 | "meta_file_val": null 84 | }, 85 | { 86 | "name": "voxceleb1", 87 | "path": "../../../audio-datasets/en/voxceleb1/", 88 | "meta_file_train": null, 89 | "meta_file_val": null 90 | }, 91 | { 92 | "name": "voxceleb2", 93 | "path": "../../../audio-datasets/en/voxceleb2/", 94 | "meta_file_train": null, 95 | "meta_file_val": null 96 | }, 97 | { 98 | "name": "common_voice", 99 | "path": "../../../audio-datasets/en/MozillaCommonVoice", 100 | "meta_file_train": "train.tsv", 101 | "meta_file_val": "test.tsv" 102 | } 103 | ] 104 | } -------------------------------------------------------------------------------- /svc_eva.py: -------------------------------------------------------------------------------- 1 | import os 2 | 
import numpy as np 3 | 4 | # average -> ave -> eva :haha 5 | 6 | eva_conf = { 7 | './configs/singers/singer0022.npy': 0, 8 | './configs/singers/singer0030.npy': 0, 9 | './configs/singers/singer0047.npy': 0.5, 10 | './configs/singers/singer0051.npy': 0.5, 11 | } 12 | 13 | if __name__ == "__main__": 14 | 15 | eva = np.zeros(256) 16 | for k, v in eva_conf.items(): 17 | assert os.path.isfile(k), k 18 | spk = np.load(k) 19 | eva = eva + spk * v 20 | np.save("eva.spk.npy", eva, allow_pickle=False) 21 | -------------------------------------------------------------------------------- /svc_export.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import argparse 5 | from omegaconf import OmegaConf 6 | 7 | from vits.models import SynthesizerInfer 8 | 9 | 10 | def load_model(checkpoint_path, model): 11 | assert os.path.isfile(checkpoint_path) 12 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 13 | saved_state_dict = checkpoint_dict["model_g"] 14 | if hasattr(model, "module"): 15 | state_dict = model.module.state_dict() 16 | else: 17 | state_dict = model.state_dict() 18 | new_state_dict = {} 19 | for k, v in state_dict.items(): 20 | try: 21 | new_state_dict[k] = saved_state_dict[k] 22 | except: 23 | new_state_dict[k] = v 24 | if hasattr(model, "module"): 25 | model.module.load_state_dict(new_state_dict) 26 | else: 27 | model.load_state_dict(new_state_dict) 28 | return model 29 | 30 | 31 | def save_pretrain(checkpoint_path, save_path): 32 | assert os.path.isfile(checkpoint_path) 33 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 34 | torch.save({ 35 | 'model_g': checkpoint_dict['model_g'], 36 | 'model_d': checkpoint_dict['model_d'], 37 | }, save_path) 38 | 39 | 40 | def save_model(model, checkpoint_path): 41 | if hasattr(model, 'module'): 42 | state_dict = model.module.state_dict() 43 | else: 44 | state_dict = model.state_dict() 45 | torch.save({'model_g': state_dict}, checkpoint_path) 46 | 47 | 48 | def main(args): 49 | hp = OmegaConf.load(args.config) 50 | model = SynthesizerInfer( 51 | hp.data.filter_length // 2 + 1, 52 | hp.data.segment_size // hp.data.hop_length, 53 | hp) 54 | 55 | # save_pretrain(args.checkpoint_path, "sovits5.0.pretrain.pth") 56 | load_model(args.checkpoint_path, model) 57 | save_model(model, "sovits5.0.pth") 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('-c', '--config', type=str, required=True, 63 | help="yaml file for config. 
will use hp_str from checkpoint if not given.") 64 | parser.add_argument('-p', '--checkpoint_path', type=str, required=True, 65 | help="path of checkpoint pt file for evaluation") 66 | args = parser.parse_args() 67 | 68 | main(args) 69 | -------------------------------------------------------------------------------- /svc_inference_batch.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import tqdm 4 | import torch 5 | import argparse 6 | 7 | from whisper.inference import load_model, pred_ppg 8 | 9 | # How to use 10 | # python svc_inference_batch.py --config configs/base.yaml --model vits_pretrain/sovits5.0.pth --wave test_waves/ --spk configs/singers/singer0047.npy 11 | 12 | out_path = "./_svc_out" 13 | os.makedirs(out_path, exist_ok=True) 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--config', type=str, required=True, 18 | help="yaml file for config.") 19 | parser.add_argument('--model', type=str, required=True, 20 | help="path of model for evaluation") 21 | parser.add_argument('--wave', type=str, required=True, 22 | help="Path of raw audio.") 23 | parser.add_argument('--spk', type=str, required=True, 24 | help="Path of speaker.") 25 | parser.add_argument('--shift', type=int, default=0, 26 | help="Pitch shift key.") 27 | args = parser.parse_args() 28 | wave_path = args.wave 29 | assert os.path.isdir(wave_path), f"{wave_path} is not folder" 30 | waves = [file for file in os.listdir(wave_path) if file.endswith(".wav")] 31 | for file in waves: 32 | print(file) 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device=device) 35 | for file in tqdm.tqdm(waves, desc="whisper"): 36 | pred_ppg(whisper, f"{wave_path}/{file}", f"{out_path}/{file}.ppg.npy", device=device) 37 | del whisper 38 | 39 | for file in tqdm.tqdm(waves, desc="svc"): 40 | os.system( 41 | f"python svc_inference.py --config {args.config} --model {args.model} --wave {wave_path}/{file} --ppg {out_path}/{file}.ppg.npy --spk {args.spk} --shift {args.shift}") 42 | os.system(f"mv svc_out.wav {out_path}/{file}") 43 | os.system(f"rm {out_path}/{file}.ppg.npy") 44 | -------------------------------------------------------------------------------- /svc_inference_post.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | from scipy.io.wavfile import write 8 | from vad.utils import init_jit_model, get_speech_timestamps 9 | 10 | 11 | def load_audio(file: str, sr: int = 16000): 12 | x, sr = librosa.load(file, sr=sr) 13 | return x 14 | 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--ref', type=str, required=True, 20 | help="Path of ref audio.") 21 | parser.add_argument('--svc', type=str, required=True, 22 | help="Path of svc audio.") 23 | parser.add_argument('--out', type=str, required=True, 24 | help="Path of out audio.") 25 | 26 | args = parser.parse_args() 27 | print("svc in wave :", args.ref) 28 | print("svc out wave :", args.svc) 29 | print("svc post wave :", args.out) 30 | 31 | model = init_jit_model(os.path.join('vad/assets', 'silero_vad.jit')) 32 | model.eval() 33 | 34 | ref_wave = load_audio(args.ref, 
sr=16000) 35 | tmp_wave = torch.from_numpy(ref_wave).squeeze(0) 36 | tag_wave = get_speech_timestamps( 37 | tmp_wave, model, threshold=0.2, sampling_rate=16000) 38 | 39 | ref_wave[:] = 0 40 | for tag in tag_wave: 41 | ref_wave[tag["start"]:tag["end"]] = 1 42 | 43 | ref_wave = np.repeat(ref_wave, 2, -1) 44 | svc_wave = load_audio(args.svc, sr=32000) 45 | 46 | min_len = min(len(ref_wave), len(svc_wave)) 47 | ref_wave = ref_wave[:min_len] 48 | svc_wave = svc_wave[:min_len] 49 | svc_wave[ref_wave == 0] = 0 50 | 51 | write(args.out, 32000, svc_wave) 52 | -------------------------------------------------------------------------------- /svc_inference_shift.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import torch 4 | import argparse 5 | import numpy as np 6 | 7 | from omegaconf import OmegaConf 8 | from scipy.io.wavfile import write 9 | from pitch import load_csv_pitch 10 | from vits.models import SynthesizerInfer 11 | from svc_inference import load_svc_model, svc_infer 12 | 13 | 14 | def main(args): 15 | if (args.ppg == None): 16 | args.ppg = "svc_tmp.ppg.npy" 17 | print( 18 | f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}") 19 | os.system(f"python whisper/inference.py -w {args.wave} -p {args.ppg}") 20 | 21 | if (args.vec == None): 22 | args.vec = "svc_tmp.vec.npy" 23 | print( 24 | f"Auto run : python hubert/inference.py -w {args.wave} -v {args.vec}") 25 | os.system(f"python hubert/inference.py -w {args.wave} -v {args.vec}") 26 | 27 | if (args.pit == None): 28 | args.pit = "svc_tmp.pit.csv" 29 | print( 30 | f"Auto run : python pitch/inference.py -w {args.wave} -p {args.pit}") 31 | os.system(f"python pitch/inference.py -w {args.wave} -p {args.pit}") 32 | 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | hp = OmegaConf.load(args.config) 35 | model = SynthesizerInfer( 36 | hp.data.filter_length // 2 + 1, 37 | hp.data.segment_size // hp.data.hop_length, 38 | hp) 39 | load_svc_model(args.model, model) 40 | model.eval() 41 | model.to(device) 42 | 43 | spk = np.load(args.spk) 44 | spk = torch.FloatTensor(spk) 45 | 46 | ppg = np.load(args.ppg) 47 | ppg = np.repeat(ppg, 2, 0) 48 | ppg = torch.FloatTensor(ppg) 49 | 50 | vec = np.load(args.vec) 51 | vec = np.repeat(vec, 2, 0) 52 | vec = torch.FloatTensor(vec) 53 | 54 | pit = load_csv_pitch(args.pit) 55 | 56 | shift_l = args.shift_l 57 | shift_r = args.shift_r 58 | 59 | print(f"pitch shift: [{shift_l}, {shift_r}]") 60 | 61 | for shift in range(shift_l, shift_r + 1): 62 | print(shift) 63 | tmp = np.array(pit) 64 | tmp = tmp * (2 ** (shift / 12)) 65 | tmp = torch.FloatTensor(tmp) 66 | 67 | out_audio = svc_infer(model, spk, tmp, ppg, vec, hp, device) 68 | write(os.path.join("./_svc_out", f"svc_out_{shift}.wav"), 69 | hp.data.sampling_rate, out_audio) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--config', type=str, required=True, 75 | help="yaml file for config.") 76 | parser.add_argument('--model', type=str, required=True, 77 | help="path of model for evaluation") 78 | parser.add_argument('--wave', type=str, required=True, 79 | help="Path of raw audio.") 80 | parser.add_argument('--spk', type=str, required=True, 81 | help="Path of speaker.") 82 | parser.add_argument('--ppg', type=str, 83 | help="Path of content vector.") 84 | parser.add_argument('--vec', type=str, 85 | help="Path of hubert vector.") 86 | 
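# Note: --shift_l/--shift_r below are semitone offsets; main() turns each step
# into a frequency ratio of 2 ** (shift / 12) before scaling the F0 curve, so
# +12 doubles the pitch and -12 halves it. Quick check of the ratios:
#   >>> [round(2 ** (s / 12), 3) for s in (-12, 0, 7, 12)]
#   [0.5, 1.0, 1.498, 2.0]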
parser.add_argument('--pit', type=str, 87 | help="Path of pitch csv file.") 88 | parser.add_argument('--shift_l', type=int, default=0, 89 | help="Pitch shift key for [shift_l, shift_r]") 90 | parser.add_argument('--shift_r', type=int, default=0, 91 | help="Pitch shift key for [shift_l, shift_r]") 92 | args = parser.parse_args() 93 | 94 | assert args.shift_l >= -12 95 | assert args.shift_r >= -12 96 | assert args.shift_l <= 12 97 | assert args.shift_r <= 12 98 | assert args.shift_l <= args.shift_r 99 | 100 | os.makedirs("./_svc_out", exist_ok=True) 101 | 102 | main(args) 103 | -------------------------------------------------------------------------------- /svc_merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import collections 5 | 6 | 7 | def load_model(checkpoint_path): 8 | assert os.path.isfile(checkpoint_path) 9 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 10 | saved_state_dict = checkpoint_dict["model_g"] 11 | return saved_state_dict 12 | 13 | 14 | def save_model(state_dict, checkpoint_path): 15 | torch.save({'model_g': state_dict}, checkpoint_path) 16 | 17 | 18 | def average_model(model_list): 19 | model_keys = list(model_list[0].keys()) 20 | model_average = collections.OrderedDict() 21 | for key in model_keys: 22 | key_sum = 0 23 | for i in range(len(model_list)): 24 | key_sum = (key_sum + model_list[i][key]) 25 | model_average[key] = torch.div(key_sum, float(len(model_list))) 26 | return model_average 27 | # ss_list = [] 28 | # ss_list.append(s1) 29 | # ss_list.append(s2) 30 | # ss_merge = average_model(ss_list) 31 | 32 | 33 | def merge_model(model1, model2, rate): 34 | model_keys = model1.keys() 35 | model_merge = collections.OrderedDict() 36 | for key in model_keys: 37 | key_merge = rate * model1[key] + (1 - rate) * model2[key] 38 | model_merge[key] = key_merge 39 | return model_merge 40 | 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('-m1', '--model1', type=str, required=True) 45 | parser.add_argument('-m2', '--model2', type=str, required=True) 46 | parser.add_argument('-r1', '--rate', type=float, required=True) 47 | args = parser.parse_args() 48 | 49 | print(args.model1) 50 | print(args.model2) 51 | print(args.rate) 52 | 53 | assert args.rate > 0 and args.rate < 1, f"{args.rate} should be in range (0, 1)" 54 | s1 = load_model(args.model1) 55 | s2 = load_model(args.model2) 56 | 57 | merge = merge_model(s1, s2, args.rate) 58 | save_model(merge, "sovits5.0_merge.pth") 59 | -------------------------------------------------------------------------------- /svc_preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | import subprocess 5 | 6 | assert torch.cuda.is_available(), "\033[31m You need GPU to Train! 
\033[0m" 7 | print("CPU Count is :", os.cpu_count()) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-t", type=int, default=0, help="thread count") 11 | args = parser.parse_args() 12 | 13 | 14 | commands = [ 15 | "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0", 16 | "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0", 17 | "python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch", 18 | "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper", 19 | "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert", 20 | "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0", 21 | "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer", 22 | "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0", 23 | "python prepare/preprocess_train.py", 24 | "python prepare/preprocess_zzz.py", 25 | ] 26 | 27 | 28 | for command in commands: 29 | print(f"Command: {command}") 30 | 31 | process = subprocess.Popen(command, shell=True) 32 | outcode = process.wait() 33 | if (outcode): 34 | break 35 | -------------------------------------------------------------------------------- /svc_train_retrieval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import multiprocessing 4 | from functools import partial 5 | from pathlib import Path 6 | 7 | import faiss 8 | 9 | from feature_retrieval import ( 10 | train_index, 11 | FaissIVFFlatTrainableFeatureIndexBuilder, 12 | OnConditionFeatureTransform, 13 | MinibatchKmeansFeatureTransform, 14 | DummyFeatureTransform, 15 | ) 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def get_speaker_list(base_path: Path): 21 | speakers_path = base_path / "waves-16k" 22 | if not speakers_path.exists(): 23 | raise FileNotFoundError(f"path {speakers_path} does not exists") 24 | return [speaker_dir.name for speaker_dir in speakers_path.iterdir() if speaker_dir.is_dir()] 25 | 26 | 27 | def create_indexes_path(base_path: Path) -> Path: 28 | indexes_path = base_path / "indexes" 29 | logger.info("create indexes folder %s", indexes_path) 30 | indexes_path.mkdir(exist_ok=True) 31 | return indexes_path 32 | 33 | 34 | def create_index( 35 | feature_name: str, 36 | prefix: str, 37 | speaker: str, 38 | base_path: Path, 39 | indexes_path: Path, 40 | compress_features_after: int, 41 | n_clusters: int, 42 | n_parallel: int, 43 | train_batch_size: int = 8192, 44 | ) -> None: 45 | features_path = base_path / feature_name / speaker 46 | if not features_path.exists(): 47 | raise ValueError(f'features not found by path {features_path}') 48 | index_path = indexes_path / speaker 49 | index_path.mkdir(exist_ok=True) 50 | index_filename = f"{prefix}{feature_name}.index" 51 | index_filepath = index_path / index_filename 52 | logger.debug('index will be save to %s', index_filepath) 53 | 54 | builder = FaissIVFFlatTrainableFeatureIndexBuilder(train_batch_size, distance=faiss.METRIC_L2) 55 | transform = OnConditionFeatureTransform( 56 | condition=lambda matrix: matrix.shape[0] > compress_features_after, 57 | on_condition=MinibatchKmeansFeatureTransform(n_clusters, n_parallel), 58 | otherwise=DummyFeatureTransform() 59 | ) 60 | train_index(features_path, index_filepath, builder, transform) 61 | 62 | 63 | def main() -> None: 64 | arg_parser = argparse.ArgumentParser("crate faiss indexes for feature 
retrieval") 65 | arg_parser.add_argument("--debug", action="store_true") 66 | arg_parser.add_argument("--prefix", default='', help="add prefix to index filename") 67 | arg_parser.add_argument('--speakers', nargs="+", 68 | help="speaker names to create an index. By default all speakers are from data_svc") 69 | arg_parser.add_argument("--compress-features-after", type=int, default=200_000, 70 | help="If the number of features is greater than the value compress " 71 | "feature vectors using MiniBatchKMeans.") 72 | arg_parser.add_argument("--n-clusters", type=int, default=10_000, 73 | help="Number of centroids to which features will be compressed") 74 | 75 | arg_parser.add_argument("--n-parallel", type=int, default=multiprocessing.cpu_count()-1, 76 | help="Nuber of parallel job of MinibatchKmeans. Default is cpus-1") 77 | args = arg_parser.parse_args() 78 | 79 | if args.debug: 80 | logging.basicConfig(level=logging.DEBUG) 81 | else: 82 | logging.basicConfig(level=logging.INFO) 83 | 84 | base_path = Path(".").absolute() / "data_svc" 85 | if args.speakers: 86 | speakers = args.speakers 87 | else: 88 | speakers = get_speaker_list(base_path) 89 | 90 | logger.info("got %s speakers: %s", len(speakers), speakers) 91 | indexes_path = create_indexes_path(base_path) 92 | 93 | create_index_func = partial( 94 | create_index, 95 | prefix=args.prefix, 96 | base_path=base_path, 97 | indexes_path=indexes_path, 98 | compress_features_after=args.compress_features_after, 99 | n_clusters=args.n_clusters, 100 | n_parallel=args.n_parallel, 101 | ) 102 | 103 | for speaker in speakers: 104 | logger.info("create hubert index for speaker %s", speaker) 105 | create_index_func(feature_name="hubert", speaker=speaker) 106 | 107 | logger.info("create whisper index for speaker %s", speaker) 108 | create_index_func(feature_name="whisper", speaker=speaker) 109 | 110 | logger.info("done!") 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /svc_trainer.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 3 | import argparse 4 | import torch 5 | import torch.multiprocessing as mp 6 | from omegaconf import OmegaConf 7 | 8 | from vits_extend.train import train 9 | 10 | torch.backends.cudnn.benchmark = True 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-c', '--config', type=str, required=True, 16 | help="yaml file for configuration") 17 | parser.add_argument('-p', '--checkpoint_path', type=str, default=None, 18 | help="path of checkpoint pt file to resume training") 19 | parser.add_argument('-n', '--name', type=str, required=True, 20 | help="name of the model for logging, saving checkpoint") 21 | args = parser.parse_args() 22 | 23 | hp = OmegaConf.load(args.config) 24 | with open(args.config, 'r') as f: 25 | hp_str = ''.join(f.readlines()) 26 | 27 | assert hp.data.hop_length == 320, \ 28 | 'hp.data.hop_length must be equal to 320, got %d' % hp.data.hop_length 29 | 30 | args.num_gpus = 0 31 | torch.manual_seed(hp.train.seed) 32 | if torch.cuda.is_available(): 33 | torch.cuda.manual_seed(hp.train.seed) 34 | args.num_gpus = torch.cuda.device_count() 35 | print('Batch size per GPU :', hp.train.batch_size) 36 | 37 | if args.num_gpus > 1: 38 | mp.spawn(train, nprocs=args.num_gpus, 39 | args=(args, args.checkpoint_path, hp, hp_str,)) 40 | else: 41 | train(0, args, 
args.checkpoint_path, hp, hp_str) 42 | else: 43 | print('No GPU find!') 44 | -------------------------------------------------------------------------------- /test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/test.wav -------------------------------------------------------------------------------- /vad/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-present Silero Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /vad/assets/silero_vad.jit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vad/assets/silero_vad.jit -------------------------------------------------------------------------------- /vits/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jaehyeon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vits/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vits/__init__.py -------------------------------------------------------------------------------- /vits/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, total_logdet, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | total_logdet: [b] - total_logdet summed over each batch 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | # add total_logdet (Negative LL) 59 | kl -= torch.sum(total_logdet) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def kl_loss_back(z_p, logs_q, m_p, logs_p, z_mask): 65 | """ 66 | z_p, logs_q: [b, h, t_t] 67 | m_p, logs_p: [b, h, t_t] 68 | """ 69 | z_p = z_p.float() 70 | logs_q = logs_q.float() 71 | m_p = m_p.float() 72 | logs_p = logs_p.float() 73 | z_mask = z_mask.float() 74 | 75 | kl = logs_p - logs_q - 0.5 76 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 77 | kl = torch.sum(kl * z_mask) 78 | l = kl / torch.sum(z_mask) 79 | return l 80 | -------------------------------------------------------------------------------- /vits/modules_grl.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0 2 | # Unsupervised Domain Adaptation by Backpropagation 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from torch.autograd import Function 8 | from torch.nn.utils import weight_norm 9 | 10 | 11 | class GradientReversalFunction(Function): 12 | @staticmethod 13 | def forward(ctx, x, lambda_): 14 | ctx.lambda_ = lambda_ 15 | return x.clone() 16 | 17 | @staticmethod 18 | def backward(ctx, grads): 19 | lambda_ = ctx.lambda_ 20 | lambda_ = grads.new_tensor(lambda_) 21 | dx = -lambda_ * grads 22 | return dx, None 23 | 24 | 25 | class GradientReversal(torch.nn.Module): 26 | ''' Gradient Reversal Layer 27 | Y. Ganin, V. Lempitsky, 28 | "Unsupervised Domain Adaptation by Backpropagation", 29 | in ICML, 2015. 
30 | Forward pass is the identity function 31 | In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradient are reversed) 32 | ''' 33 | 34 | def __init__(self, lambda_reversal=1): 35 | super(GradientReversal, self).__init__() 36 | self.lambda_ = lambda_reversal 37 | 38 | def forward(self, x): 39 | return GradientReversalFunction.apply(x, self.lambda_) 40 | 41 | 42 | class SpeakerClassifier(nn.Module): 43 | 44 | def __init__(self, embed_dim, spk_dim): 45 | super(SpeakerClassifier, self).__init__() 46 | self.classifier = nn.Sequential( 47 | GradientReversal(lambda_reversal=1), 48 | weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), 49 | nn.ReLU(), 50 | weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)), 51 | nn.ReLU(), 52 | weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2)) 53 | ) 54 | 55 | def forward(self, x): 56 | ''' Forward function of Speaker Classifier: 57 | x = (B, embed_dim, len) 58 | ''' 59 | # pass through classifier 60 | outputs = self.classifier(x) # (B, nb_speakers) 61 | outputs = torch.mean(outputs, dim=-1) 62 | return outputs 63 | -------------------------------------------------------------------------------- /vits/spectrogram.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | 4 | from librosa.filters import mel as librosa_mel_fn 5 | 6 | MAX_WAV_VALUE = 32768.0 7 | 8 | 9 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 10 | """ 11 | PARAMS 12 | ------ 13 | C: compression factor 14 | """ 15 | return torch.log(torch.clamp(x, min=clip_val) * C) 16 | 17 | 18 | def dynamic_range_decompression_torch(x, C=1): 19 | """ 20 | PARAMS 21 | ------ 22 | C: compression factor used to compress 23 | """ 24 | return torch.exp(x) / C 25 | 26 | 27 | def spectral_normalize_torch(magnitudes): 28 | output = dynamic_range_compression_torch(magnitudes) 29 | return output 30 | 31 | 32 | def spectral_de_normalize_torch(magnitudes): 33 | output = dynamic_range_decompression_torch(magnitudes) 34 | return output 35 | 36 | 37 | mel_basis = {} 38 | hann_window = {} 39 | 40 | 41 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 42 | if torch.min(y) < -1.0: 43 | print("min value is ", torch.min(y)) 44 | if torch.max(y) > 1.0: 45 | print("max value is ", torch.max(y)) 46 | 47 | global hann_window 48 | dtype_device = str(y.dtype) + "_" + str(y.device) 49 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 50 | if wnsize_dtype_device not in hann_window: 51 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 52 | dtype=y.dtype, device=y.device 53 | ) 54 | 55 | y = torch.nn.functional.pad( 56 | y.unsqueeze(1), 57 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 58 | mode="reflect", 59 | ) 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft( 63 | y, 64 | n_fft, 65 | hop_length=hop_size, 66 | win_length=win_size, 67 | window=hann_window[wnsize_dtype_device], 68 | center=center, 69 | pad_mode="reflect", 70 | normalized=False, 71 | onesided=True, 72 | return_complex=False, 73 | ) 74 | 75 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 76 | return spec 77 | 78 | 79 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 80 | global mel_basis 81 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 82 | fmax_dtype_device = str(fmax) + "_" + dtype_device 83 | if fmax_dtype_device not in mel_basis: 84 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 
n_mels=num_mels, fmin=fmin, fmax=fmax) 85 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 86 | dtype=spec.dtype, device=spec.device 87 | ) 88 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 89 | spec = spectral_normalize_torch(spec) 90 | return spec 91 | 92 | 93 | def mel_spectrogram_torch( 94 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 95 | ): 96 | if torch.min(y) < -1.0: 97 | print("min value is ", torch.min(y)) 98 | if torch.max(y) > 1.0: 99 | print("max value is ", torch.max(y)) 100 | 101 | global mel_basis, hann_window 102 | dtype_device = str(y.dtype) + "_" + str(y.device) 103 | fmax_dtype_device = str(fmax) + "_" + dtype_device 104 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 105 | if fmax_dtype_device not in mel_basis: 106 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 107 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 108 | dtype=y.dtype, device=y.device 109 | ) 110 | if wnsize_dtype_device not in hann_window: 111 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 112 | dtype=y.dtype, device=y.device 113 | ) 114 | 115 | y = torch.nn.functional.pad( 116 | y.unsqueeze(1), 117 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 118 | mode="reflect", 119 | ) 120 | y = y.squeeze(1) 121 | 122 | spec = torch.stft( 123 | y, 124 | n_fft, 125 | hop_length=hop_size, 126 | win_length=win_size, 127 | window=hann_window[wnsize_dtype_device], 128 | center=center, 129 | pad_mode="reflect", 130 | normalized=False, 131 | onesided=True, 132 | return_complex=False, 133 | ) 134 | 135 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 136 | 137 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 138 | spec = spectral_normalize_torch(spec) 139 | 140 | return spec 141 | -------------------------------------------------------------------------------- /vits/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile import read 4 | 5 | MATPLOTLIB_FLAG = False 6 | 7 | 8 | def load_wav_to_torch(full_path): 9 | sampling_rate, data = read(full_path) 10 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 11 | 12 | 13 | f0_bin = 256 14 | f0_max = 1100.0 15 | f0_min = 50.0 16 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 17 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 18 | 19 | 20 | def f0_to_coarse(f0): 21 | is_torch = isinstance(f0, torch.Tensor) 22 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * \ 23 | np.log(1 + f0 / 700) 24 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \ 25 | (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = ( 30 | f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 31 | assert f0_coarse.max() <= 255 and f0_coarse.min( 32 | ) >= 1, (f0_coarse.max(), f0_coarse.min()) 33 | return f0_coarse 34 | -------------------------------------------------------------------------------- /vits_decoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /vits_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .alias.act import SnakeAlias -------------------------------------------------------------------------------- /vits_decoder/alias/LICENSE-snake.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vits_decoder/alias/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /vits_decoder/alias/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
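# Overview: the activations below follow the Snake family. SnakeBeta computes
# x + (1/beta) * sin(alpha * x)**2 per channel, and SnakeAlias wraps it in 2x
# up/downsampling to reduce aliasing of the periodic nonlinearity. A scalar
# sanity check of the formula with illustrative values alpha = beta = 1:
#   >>> import math
#   >>> round(0.5 + (1.0 / 1.0) * math.sin(1.0 * 0.5) ** 2, 4)
#   0.7298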
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from torch import sin, pow 9 | from torch.nn import Parameter 10 | from .resample import UpSample1d, DownSample1d 11 | 12 | 13 | class Activation1d(nn.Module): 14 | def __init__(self, 15 | activation, 16 | up_ratio: int = 2, 17 | down_ratio: int = 2, 18 | up_kernel_size: int = 12, 19 | down_kernel_size: int = 12): 20 | super().__init__() 21 | self.up_ratio = up_ratio 22 | self.down_ratio = down_ratio 23 | self.act = activation 24 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 25 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 26 | 27 | # x: [B,C,T] 28 | def forward(self, x): 29 | x = self.upsample(x) 30 | x = self.act(x) 31 | x = self.downsample(x) 32 | 33 | return x 34 | 35 | 36 | class SnakeBeta(nn.Module): 37 | ''' 38 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 39 | Shape: 40 | - Input: (B, C, T) 41 | - Output: (B, C, T), same shape as the input 42 | Parameters: 43 | - alpha - trainable parameter that controls frequency 44 | - beta - trainable parameter that controls magnitude 45 | References: 46 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 47 | https://arxiv.org/abs/2006.08195 48 | Examples: 49 | >>> a1 = snakebeta(256) 50 | >>> x = torch.randn(256) 51 | >>> x = a1(x) 52 | ''' 53 | 54 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 55 | ''' 56 | Initialization. 57 | INPUT: 58 | - in_features: shape of the input 59 | - alpha - trainable parameter that controls frequency 60 | - beta - trainable parameter that controls magnitude 61 | alpha is initialized to 1 by default, higher values = higher-frequency. 62 | beta is initialized to 1 by default, higher values = higher-magnitude. 63 | alpha will be trained along with the rest of your model. 64 | ''' 65 | super(SnakeBeta, self).__init__() 66 | self.in_features = in_features 67 | # initialize alpha 68 | self.alpha_logscale = alpha_logscale 69 | if self.alpha_logscale: # log scale alphas initialized to zeros 70 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 71 | self.beta = Parameter(torch.zeros(in_features) * alpha) 72 | else: # linear scale alphas initialized to ones 73 | self.alpha = Parameter(torch.ones(in_features) * alpha) 74 | self.beta = Parameter(torch.ones(in_features) * alpha) 75 | self.alpha.requires_grad = alpha_trainable 76 | self.beta.requires_grad = alpha_trainable 77 | self.no_div_by_zero = 0.000000001 78 | 79 | def forward(self, x): 80 | ''' 81 | Forward pass of the function. 82 | Applies the function to the input elementwise. 83 | SnakeBeta = x + 1/b * sin^2 (xa) 84 | ''' 85 | alpha = self.alpha.unsqueeze( 86 | 0).unsqueeze(-1) # line up with x to [B, C, T] 87 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 88 | if self.alpha_logscale: 89 | alpha = torch.exp(alpha) 90 | beta = torch.exp(beta) 91 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 92 | return x 93 | 94 | 95 | class Mish(nn.Module): 96 | """ 97 | Mish activation function is proposed in "Mish: A Self 98 | Regularized Non-Monotonic Neural Activation Function" 99 | paper, https://arxiv.org/abs/1908.08681. 
100 | """ 101 | 102 | def __init__(self): 103 | super().__init__() 104 | 105 | def forward(self, x): 106 | return x * torch.tanh(F.softplus(x)) 107 | 108 | 109 | class SnakeAlias(nn.Module): 110 | def __init__(self, 111 | channels, 112 | up_ratio: int = 2, 113 | down_ratio: int = 2, 114 | up_kernel_size: int = 12, 115 | down_kernel_size: int = 12): 116 | super().__init__() 117 | self.up_ratio = up_ratio 118 | self.down_ratio = down_ratio 119 | self.act = SnakeBeta(channels, alpha_logscale=True) 120 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 121 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 122 | 123 | # x: [B,C,T] 124 | def forward(self, x): 125 | x = self.upsample(x) 126 | x = self.act(x) 127 | x = self.downsample(x) 128 | 129 | return x -------------------------------------------------------------------------------- /vits_decoder/alias/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if 'sinc' in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where(x == 0, 21 | torch.tensor(1., device=x.device, dtype=x.dtype), 22 | torch.sin(math.pi * x) / math.pi / x) 23 | 24 | 25 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 26 | # https://adefossez.github.io/julius/julius/lowpass.html 27 | # LICENSE is in incl_licenses directory. 28 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 29 | even = (kernel_size % 2 == 0) 30 | half_size = kernel_size // 2 31 | 32 | #For kaiser window 33 | delta_f = 4 * half_width 34 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 35 | if A > 50.: 36 | beta = 0.1102 * (A - 8.7) 37 | elif A >= 21.: 38 | beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) 39 | else: 40 | beta = 0. 41 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 42 | 43 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 44 | if even: 45 | time = (torch.arange(-half_size, half_size) + 0.5) 46 | else: 47 | time = torch.arange(kernel_size) - half_size 48 | if cutoff == 0: 49 | filter_ = torch.zeros_like(time) 50 | else: 51 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 52 | # Normalize filter to have sum = 1, otherwise we will have a small leakage 53 | # of the constant component in the input signal. 54 | filter_ /= filter_.sum() 55 | filter = filter_.view(1, 1, kernel_size) 56 | 57 | return filter 58 | 59 | 60 | class LowPassFilter1d(nn.Module): 61 | def __init__(self, 62 | cutoff=0.5, 63 | half_width=0.6, 64 | stride: int = 1, 65 | padding: bool = True, 66 | padding_mode: str = 'replicate', 67 | kernel_size: int = 12): 68 | # kernel_size should be even number for stylegan3 setup, 69 | # in this implementation, odd number is also possible. 
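# Cutoff and half_width are normalized to the sampling rate (0.5 == Nyquist);
# the resamplers in resample.py use cutoff = 0.5 / ratio and half_width = 0.6 / ratio,
# so a ratio-2 stage gets cutoff 0.25. Illustrative shape check of the FIR taps:
#   >>> kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12).shape
#   torch.Size([1, 1, 12])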
70 | super().__init__() 71 | if cutoff < -0.: 72 | raise ValueError("Minimum cutoff must be larger than zero.") 73 | if cutoff > 0.5: 74 | raise ValueError("A cutoff above 0.5 does not make sense.") 75 | self.kernel_size = kernel_size 76 | self.even = (kernel_size % 2 == 0) 77 | self.pad_left = kernel_size // 2 - int(self.even) 78 | self.pad_right = kernel_size // 2 79 | self.stride = stride 80 | self.padding = padding 81 | self.padding_mode = padding_mode 82 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 83 | self.register_buffer("filter", filter) 84 | 85 | #input [B, C, T] 86 | def forward(self, x): 87 | _, C, _ = x.shape 88 | 89 | if self.padding: 90 | x = F.pad(x, (self.pad_left, self.pad_right), 91 | mode=self.padding_mode) 92 | out = F.conv1d(x, self.filter.expand(C, -1, -1), 93 | stride=self.stride, groups=C) 94 | 95 | return out -------------------------------------------------------------------------------- /vits_decoder/alias/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /vits_decoder/bigv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torch.nn import Conv1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | from .alias.act import SnakeAlias 7 | 8 | 9 | def init_weights(m, mean=0.0, std=0.01): 10 | classname = m.__class__.__name__ 11 | if classname.find("Conv") != -1: 12 | m.weight.data.normal_(mean, std) 13 | 14 | 15 | def get_padding(kernel_size, dilation=1): 16 | return int((kernel_size*dilation - dilation)/2) 17 | 18 | 19 | class AMPBlock(torch.nn.Module): 20 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 21 | super(AMPBlock, 
self).__init__() 22 | self.convs1 = nn.ModuleList([ 23 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 24 | padding=get_padding(kernel_size, dilation[0]))), 25 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 26 | padding=get_padding(kernel_size, dilation[1]))), 27 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 28 | padding=get_padding(kernel_size, dilation[2]))) 29 | ]) 30 | self.convs1.apply(init_weights) 31 | 32 | self.convs2 = nn.ModuleList([ 33 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 34 | padding=get_padding(kernel_size, 1))), 35 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 36 | padding=get_padding(kernel_size, 1))), 37 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 38 | padding=get_padding(kernel_size, 1))) 39 | ]) 40 | self.convs2.apply(init_weights) 41 | 42 | # total number of conv layers 43 | self.num_layers = len(self.convs1) + len(self.convs2) 44 | 45 | # periodic nonlinearity with snakebeta function and anti-aliasing 46 | self.activations = nn.ModuleList([ 47 | SnakeAlias(channels) for _ in range(self.num_layers) 48 | ]) 49 | 50 | def forward(self, x): 51 | acts1, acts2 = self.activations[::2], self.activations[1::2] 52 | for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): 53 | xt = a1(x) 54 | xt = c1(xt) 55 | xt = a2(xt) 56 | xt = c2(xt) 57 | x = xt + x 58 | return x 59 | 60 | def remove_weight_norm(self): 61 | for l in self.convs1: 62 | remove_weight_norm(l) 63 | for l in self.convs2: 64 | remove_weight_norm(l) -------------------------------------------------------------------------------- /vits_decoder/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from omegaconf import OmegaConf 5 | from .msd import ScaleDiscriminator 6 | from .mpd import MultiPeriodDiscriminator 7 | from .mrd import MultiResolutionDiscriminator 8 | 9 | 10 | class Discriminator(nn.Module): 11 | def __init__(self, hp): 12 | super(Discriminator, self).__init__() 13 | self.MRD = MultiResolutionDiscriminator(hp) 14 | self.MPD = MultiPeriodDiscriminator(hp) 15 | self.MSD = ScaleDiscriminator() 16 | 17 | def forward(self, x): 18 | r = self.MRD(x) 19 | p = self.MPD(x) 20 | s = self.MSD(x) 21 | return r + p + s 22 | 23 | 24 | if __name__ == '__main__': 25 | hp = OmegaConf.load('../config/base.yaml') 26 | model = Discriminator(hp) 27 | 28 | x = torch.randn(3, 1, 16384) 29 | print(x.shape) 30 | 31 | output = model(x) 32 | for features, score in output: 33 | for feat in features: 34 | print(feat.shape) 35 | print(score.shape) 36 | 37 | pytorch_total_params = sum(p.numel() 38 | for p in model.parameters() if p.requires_grad) 39 | print(pytorch_total_params) 40 | -------------------------------------------------------------------------------- /vits_decoder/med.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import typing as T 4 | 5 | 6 | class MelspecDiscriminator(torch.nn.Module): 7 | """mel spectrogram (frequency domain) discriminator""" 8 | 9 | def __init__(self) -> None: 10 | super().__init__() 11 | self.SAMPLE_RATE = 48000 12 | # mel filterbank transform 13 | self._melspec = torchaudio.transforms.MelSpectrogram( 14 | sample_rate=self.SAMPLE_RATE, 15 | n_fft=2048, 16 | win_length=int(0.025 * self.SAMPLE_RATE), 17 | hop_length=int(0.010 * self.SAMPLE_RATE), 
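            # 25 ms window / 10 ms hop -> 1200 / 480 samples at the 48 kHz rate above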
18 | n_mels=128, 19 | power=1, 20 | ) 21 | 22 | # time-frequency 2D convolutions 23 | kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)] 24 | strides = [(1, 2), (1, 2), (1, 2), (1, 2)] 25 | self._convs = torch.nn.ModuleList( 26 | [ 27 | torch.nn.Sequential( 28 | torch.nn.Conv2d( 29 | in_channels=1 if i == 0 else 32, 30 | out_channels=64, 31 | kernel_size=k, 32 | stride=s, 33 | padding=(1, 2), 34 | bias=False, 35 | ), 36 | torch.nn.BatchNorm2d(num_features=64), 37 | torch.nn.GLU(dim=1), 38 | ) 39 | for i, (k, s) in enumerate(zip(kernel_sizes, strides)) 40 | ] 41 | ) 42 | 43 | # output adversarial projection 44 | self._postnet = torch.nn.Conv2d( 45 | in_channels=32, 46 | out_channels=1, 47 | kernel_size=(15, 3), 48 | stride=(1, 2), 49 | ) 50 | 51 | def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]: 52 | # apply the log-scale mel spectrogram transform 53 | x = torch.log(self._melspec(x) + 1e-5) 54 | 55 | # compute hidden layers and feature maps 56 | f = [] 57 | for c in self._convs: 58 | x = c(x) 59 | f.append(x) 60 | 61 | # apply the output projection and global average pooling 62 | x = self._postnet(x) 63 | x = x.mean(dim=[-2, -1]) 64 | 65 | return [(f, x)] 66 | -------------------------------------------------------------------------------- /vits_decoder/mpd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm, spectral_norm 5 | 6 | class DiscriminatorP(nn.Module): 7 | def __init__(self, hp, period): 8 | super(DiscriminatorP, self).__init__() 9 | 10 | self.LRELU_SLOPE = hp.mpd.lReLU_slope 11 | self.period = period 12 | 13 | kernel_size = hp.mpd.kernel_size 14 | stride = hp.mpd.stride 15 | norm_f = weight_norm if hp.mpd.use_spectral_norm == False else spectral_norm 16 | 17 | self.convs = nn.ModuleList([ 18 | norm_f(nn.Conv2d(1, 64, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 19 | norm_f(nn.Conv2d(64, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 20 | norm_f(nn.Conv2d(128, 256, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 21 | norm_f(nn.Conv2d(256, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))), 22 | norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), 1, padding=(kernel_size // 2, 0))), 23 | ]) 24 | self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 25 | 26 | def forward(self, x): 27 | fmap = [] 28 | 29 | # 1d to 2d 30 | b, c, t = x.shape 31 | if t % self.period != 0: # pad first 32 | n_pad = self.period - (t % self.period) 33 | x = F.pad(x, (0, n_pad), "reflect") 34 | t = t + n_pad 35 | x = x.view(b, c, t // self.period, self.period) 36 | 37 | for l in self.convs: 38 | x = l(x) 39 | x = F.leaky_relu(x, self.LRELU_SLOPE) 40 | fmap.append(x) 41 | x = self.conv_post(x) 42 | fmap.append(x) 43 | x = torch.flatten(x, 1, -1) 44 | 45 | return fmap, x 46 | 47 | 48 | class MultiPeriodDiscriminator(nn.Module): 49 | def __init__(self, hp): 50 | super(MultiPeriodDiscriminator, self).__init__() 51 | 52 | self.discriminators = nn.ModuleList( 53 | [DiscriminatorP(hp, period) for period in hp.mpd.periods] 54 | ) 55 | 56 | def forward(self, x): 57 | ret = list() 58 | for disc in self.discriminators: 59 | ret.append(disc(x)) 60 | 61 | return ret # [(feat, score), (feat, score), (feat, score), (feat, score), (feat, score)] 62 | -------------------------------------------------------------------------------- /vits_decoder/mrd.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm, spectral_norm 5 | 6 | class DiscriminatorR(torch.nn.Module): 7 | def __init__(self, hp, resolution): 8 | super(DiscriminatorR, self).__init__() 9 | 10 | self.resolution = resolution 11 | self.LRELU_SLOPE = hp.mpd.lReLU_slope 12 | 13 | norm_f = weight_norm if hp.mrd.use_spectral_norm == False else spectral_norm 14 | 15 | self.convs = nn.ModuleList([ 16 | norm_f(nn.Conv2d(1, 32, (3, 9), padding=(1, 4))), 17 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 18 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 19 | norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))), 20 | norm_f(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))), 21 | ]) 22 | self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1))) 23 | 24 | def forward(self, x): 25 | fmap = [] 26 | 27 | x = self.spectrogram(x) 28 | x = x.unsqueeze(1) 29 | for l in self.convs: 30 | x = l(x) 31 | x = F.leaky_relu(x, self.LRELU_SLOPE) 32 | fmap.append(x) 33 | x = self.conv_post(x) 34 | fmap.append(x) 35 | x = torch.flatten(x, 1, -1) 36 | 37 | return fmap, x 38 | 39 | def spectrogram(self, x): 40 | n_fft, hop_length, win_length = self.resolution 41 | x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') 42 | x = x.squeeze(1) 43 | x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=False) #[B, F, TT, 2] 44 | mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] 45 | 46 | return mag 47 | 48 | 49 | class MultiResolutionDiscriminator(torch.nn.Module): 50 | def __init__(self, hp): 51 | super(MultiResolutionDiscriminator, self).__init__() 52 | self.resolutions = eval(hp.mrd.resolutions) 53 | self.discriminators = nn.ModuleList( 54 | [DiscriminatorR(hp, resolution) for resolution in self.resolutions] 55 | ) 56 | 57 | def forward(self, x): 58 | ret = list() 59 | for disc in self.discriminators: 60 | ret.append(disc(x)) 61 | 62 | return ret # [(feat, score), (feat, score), (feat, score)] 63 | -------------------------------------------------------------------------------- /vits_decoder/msd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm 5 | 6 | 7 | class ScaleDiscriminator(torch.nn.Module): 8 | def __init__(self): 9 | super(ScaleDiscriminator, self).__init__() 10 | self.convs = nn.ModuleList([ 11 | weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)), 12 | weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), 13 | weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), 14 | weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 15 | weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 16 | weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)), 17 | ]) 18 | self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1)) 19 | 20 | def forward(self, x): 21 | fmap = [] 22 | for l in self.convs: 23 | x = l(x) 24 | x = F.leaky_relu(x, 0.1) 25 | fmap.append(x) 26 | x = self.conv_post(x) 27 | fmap.append(x) 28 | x = torch.flatten(x, 1, -1) 29 | return [(fmap, x)] 30 | -------------------------------------------------------------------------------- /vits_extend/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/vits_extend/__init__.py -------------------------------------------------------------------------------- /vits_extend/dataloader.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from vits.data_utils import DistributedBucketSampler 3 | from vits.data_utils import TextAudioSpeakerCollate 4 | from vits.data_utils import TextAudioSpeakerSet 5 | 6 | 7 | def create_dataloader_train(hps, n_gpus, rank): 8 | collate_fn = TextAudioSpeakerCollate() 9 | train_dataset = TextAudioSpeakerSet(hps.data.training_files, hps.data) 10 | train_sampler = DistributedBucketSampler( 11 | train_dataset, 12 | hps.train.batch_size, 13 | [150, 300, 450], 14 | num_replicas=n_gpus, 15 | rank=rank, 16 | shuffle=True) 17 | train_loader = DataLoader( 18 | train_dataset, 19 | num_workers=4, 20 | shuffle=False, 21 | pin_memory=True, 22 | collate_fn=collate_fn, 23 | batch_sampler=train_sampler) 24 | return train_loader 25 | 26 | 27 | def create_dataloader_eval(hps): 28 | collate_fn = TextAudioSpeakerCollate() 29 | eval_dataset = TextAudioSpeakerSet(hps.data.validation_files, hps.data) 30 | eval_loader = DataLoader( 31 | eval_dataset, 32 | num_workers=2, 33 | shuffle=False, 34 | batch_size=hps.train.batch_size, 35 | pin_memory=True, 36 | drop_last=False, 37 | collate_fn=collate_fn) 38 | return eval_loader 39 | -------------------------------------------------------------------------------- /vits_extend/plotting.py: -------------------------------------------------------------------------------- 1 | import logging 2 | mpl_logger = logging.getLogger('matplotlib') # must before import matplotlib 3 | mpl_logger.setLevel(logging.WARNING) 4 | import matplotlib 5 | matplotlib.use("Agg") 6 | 7 | import numpy as np 8 | import matplotlib.pylab as plt 9 | 10 | 11 | def save_figure_to_numpy(fig): 12 | # save it to a numpy array. 
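    # tostring_rgb() yields the rendered canvas as raw RGB bytes; the lines below
    # reshape them to (H, W, 3) and transpose to (3, H, W) for TensorBoard's add_image.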
13 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 14 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 15 | data = np.transpose(data, (2, 0, 1)) 16 | return data 17 | 18 | 19 | def plot_waveform_to_numpy(waveform): 20 | fig, ax = plt.subplots(figsize=(12, 4)) 21 | ax.plot() 22 | ax.plot(range(len(waveform)), waveform, 23 | linewidth=0.1, alpha=0.7, color='blue') 24 | 25 | plt.xlabel("Samples") 26 | plt.ylabel("Amplitude") 27 | plt.ylim(-1, 1) 28 | plt.tight_layout() 29 | 30 | fig.canvas.draw() 31 | data = save_figure_to_numpy(fig) 32 | plt.close() 33 | 34 | return data 35 | 36 | 37 | def plot_spectrogram_to_numpy(spectrogram): 38 | fig, ax = plt.subplots(figsize=(12, 4)) 39 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 40 | interpolation='none') 41 | plt.colorbar(im, ax=ax) 42 | plt.xlabel("Frames") 43 | plt.ylabel("Channels") 44 | plt.tight_layout() 45 | 46 | fig.canvas.draw() 47 | data = save_figure_to_numpy(fig) 48 | plt.close() 49 | return data 50 | -------------------------------------------------------------------------------- /vits_extend/stft.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Jungil Kong 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import math 24 | import os 25 | import random 26 | import torch 27 | import torch.utils.data 28 | import numpy as np 29 | from librosa.util import normalize 30 | from scipy.io.wavfile import read 31 | from librosa.filters import mel as librosa_mel_fn 32 | 33 | 34 | class TacotronSTFT(torch.nn.Module): 35 | def __init__(self, filter_length=512, hop_length=160, win_length=512, 36 | n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0, 37 | mel_fmax=None, center=False, device='cpu'): 38 | super(TacotronSTFT, self).__init__() 39 | self.n_mel_channels = n_mel_channels 40 | self.sampling_rate = sampling_rate 41 | self.n_fft = filter_length 42 | self.hop_size = hop_length 43 | self.win_size = win_length 44 | self.fmin = mel_fmin 45 | self.fmax = mel_fmax 46 | self.center = center 47 | 48 | mel = librosa_mel_fn( 49 | sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax) 50 | 51 | mel_basis = torch.from_numpy(mel).float().to(device) 52 | hann_window = torch.hann_window(win_length).to(device) 53 | 54 | self.register_buffer('mel_basis', mel_basis) 55 | self.register_buffer('hann_window', hann_window) 56 | 57 | def linear_spectrogram(self, y): 58 | assert (torch.min(y.data) >= -1) 59 | assert (torch.max(y.data) <= 1) 60 | 61 | y = torch.nn.functional.pad(y.unsqueeze(1), 62 | (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), 63 | mode='reflect') 64 | y = y.squeeze(1) 65 | spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, 66 | center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 67 | spec = torch.norm(spec, p=2, dim=-1) 68 | 69 | return spec 70 | 71 | def mel_spectrogram(self, y): 72 | """Computes mel-spectrograms from a batch of waves 73 | PARAMS 74 | ------ 75 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 76 | 77 | RETURNS 78 | ------- 79 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 80 | """ 81 | assert(torch.min(y.data) >= -1) 82 | assert(torch.max(y.data) <= 1) 83 | 84 | y = torch.nn.functional.pad(y.unsqueeze(1), 85 | (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)), 86 | mode='reflect') 87 | y = y.squeeze(1) 88 | 89 | spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window, 90 | center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 91 | 92 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 93 | 94 | spec = torch.matmul(self.mel_basis, spec) 95 | spec = self.spectral_normalize_torch(spec) 96 | 97 | return spec 98 | 99 | def spectral_normalize_torch(self, magnitudes): 100 | output = self.dynamic_range_compression_torch(magnitudes) 101 | return output 102 | 103 | def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5): 104 | return torch.log(torch.clamp(x, min=clip_val) * C) 105 | -------------------------------------------------------------------------------- /vits_extend/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 
14 | Args: 15 | x (Tensor): Input signal tensor (B, T). 16 | fft_size (int): FFT size. 17 | hop_size (int): Hop size. 18 | win_length (int): Window length. 19 | window (str): Window function type. 20 | Returns: 21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 22 | """ 23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=False) 24 | real = x_stft[..., 0] 25 | imag = x_stft[..., 1] 26 | 27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 29 | 30 | 31 | class SpectralConvergengeLoss(torch.nn.Module): 32 | """Spectral convergence loss module.""" 33 | 34 | def __init__(self): 35 | """Initilize spectral convergence loss module.""" 36 | super(SpectralConvergengeLoss, self).__init__() 37 | 38 | def forward(self, x_mag, y_mag): 39 | """Calculate forward propagation. 40 | Args: 41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 43 | Returns: 44 | Tensor: Spectral convergence loss value. 45 | """ 46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 47 | 48 | 49 | class LogSTFTMagnitudeLoss(torch.nn.Module): 50 | """Log STFT magnitude loss module.""" 51 | 52 | def __init__(self): 53 | """Initilize los STFT magnitude loss module.""" 54 | super(LogSTFTMagnitudeLoss, self).__init__() 55 | 56 | def forward(self, x_mag, y_mag): 57 | """Calculate forward propagation. 58 | Args: 59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 61 | Returns: 62 | Tensor: Log STFT magnitude loss value. 63 | """ 64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 65 | 66 | 67 | class STFTLoss(torch.nn.Module): 68 | """STFT loss module.""" 69 | 70 | def __init__(self, device, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 71 | """Initialize STFT loss module.""" 72 | super(STFTLoss, self).__init__() 73 | self.fft_size = fft_size 74 | self.shift_size = shift_size 75 | self.win_length = win_length 76 | self.window = getattr(torch, window)(win_length).to(device) 77 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 79 | 80 | def forward(self, x, y): 81 | """Calculate forward propagation. 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | Returns: 86 | Tensor: Spectral convergence loss value. 87 | Tensor: Log STFT magnitude loss value. 88 | """ 89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 93 | 94 | return sc_loss, mag_loss 95 | 96 | 97 | class MultiResolutionSTFTLoss(torch.nn.Module): 98 | """Multi resolution STFT loss module.""" 99 | 100 | def __init__(self, 101 | device, 102 | resolutions, 103 | window="hann_window"): 104 | """Initialize Multi resolution STFT loss module. 105 | Args: 106 | resolutions (list): List of (FFT size, hop size, window length). 107 | window (str): Window function type. 
108 | """ 109 | super(MultiResolutionSTFTLoss, self).__init__() 110 | self.stft_losses = torch.nn.ModuleList() 111 | for fs, ss, wl in resolutions: 112 | self.stft_losses += [STFTLoss(device, fs, ss, wl, window)] 113 | 114 | def forward(self, x, y): 115 | """Calculate forward propagation. 116 | Args: 117 | x (Tensor): Predicted signal (B, T). 118 | y (Tensor): Groundtruth signal (B, T). 119 | Returns: 120 | Tensor: Multi resolution spectral convergence loss value. 121 | Tensor: Multi resolution log STFT magnitude loss value. 122 | """ 123 | sc_loss = 0.0 124 | mag_loss = 0.0 125 | for f in self.stft_losses: 126 | sc_l, mag_l = f(x, y) 127 | sc_loss += sc_l 128 | mag_loss += mag_l 129 | 130 | sc_loss /= len(self.stft_losses) 131 | mag_loss /= len(self.stft_losses) 132 | 133 | return sc_loss, mag_loss 134 | -------------------------------------------------------------------------------- /vits_extend/validation.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def validate(hp, args, generator, discriminator, valloader, stft, writer, step, device): 7 | generator.eval() 8 | discriminator.eval() 9 | torch.backends.cudnn.benchmark = False 10 | 11 | loader = tqdm.tqdm(valloader, desc='Validation loop') 12 | mel_loss = 0.0 13 | for idx, (ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l) in enumerate(loader): 14 | ppg = ppg.to(device) 15 | vec = vec.to(device) 16 | pit = pit.to(device) 17 | spk = spk.to(device) 18 | ppg_l = ppg_l.to(device) 19 | audio = audio.to(device) 20 | 21 | if hasattr(generator, 'module'): 22 | fake_audio = generator.module.infer(ppg, vec, pit, spk, ppg_l)[ 23 | :, :, :audio.size(2)] 24 | else: 25 | fake_audio = generator.infer(ppg, vec, pit, spk, ppg_l)[ 26 | :, :, :audio.size(2)] 27 | 28 | mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1)) 29 | mel_real = stft.mel_spectrogram(audio.squeeze(1)) 30 | 31 | mel_loss += F.l1_loss(mel_fake, mel_real).item() 32 | 33 | if idx < hp.log.num_audio: 34 | spec_fake = stft.linear_spectrogram(fake_audio.squeeze(1)) 35 | spec_real = stft.linear_spectrogram(audio.squeeze(1)) 36 | 37 | audio = audio[0][0].cpu().detach().numpy() 38 | fake_audio = fake_audio[0][0].cpu().detach().numpy() 39 | spec_fake = spec_fake[0].cpu().detach().numpy() 40 | spec_real = spec_real[0].cpu().detach().numpy() 41 | writer.log_fig_audio( 42 | audio, fake_audio, spec_fake, spec_real, idx, step) 43 | 44 | mel_loss = mel_loss / len(valloader.dataset) 45 | 46 | writer.log_validation(mel_loss, generator, discriminator, step) 47 | 48 | torch.backends.cudnn.benchmark = True 49 | -------------------------------------------------------------------------------- /vits_extend/writer.py: -------------------------------------------------------------------------------- 1 | from torch.utils.tensorboard import SummaryWriter 2 | import numpy as np 3 | import librosa 4 | 5 | from .plotting import plot_waveform_to_numpy, plot_spectrogram_to_numpy 6 | 7 | class MyWriter(SummaryWriter): 8 | def __init__(self, hp, logdir): 9 | super(MyWriter, self).__init__(logdir) 10 | self.sample_rate = hp.data.sampling_rate 11 | 12 | def log_training(self, g_loss, d_loss, mel_loss, stft_loss, k_loss, r_loss, score_loss, step): 13 | self.add_scalar('train/g_loss', g_loss, step) 14 | self.add_scalar('train/d_loss', d_loss, step) 15 | 16 | self.add_scalar('train/score_loss', score_loss, step) 17 | self.add_scalar('train/stft_loss', stft_loss, step) 18 | 
self.add_scalar('train/mel_loss', mel_loss, step) 19 | self.add_scalar('train/kl_f_loss', k_loss, step) 20 | self.add_scalar('train/kl_r_loss', r_loss, step) 21 | 22 | def log_validation(self, mel_loss, generator, discriminator, step): 23 | self.add_scalar('validation/mel_loss', mel_loss, step) 24 | 25 | def log_fig_audio(self, real, fake, spec_fake, spec_real, idx, step): 26 | if idx == 0: 27 | spec_fake = librosa.amplitude_to_db(spec_fake, ref=np.max,top_db=80.) 28 | spec_real = librosa.amplitude_to_db(spec_real, ref=np.max,top_db=80.) 29 | self.add_image(f'spec_fake/{step}', plot_spectrogram_to_numpy(spec_fake), step) 30 | self.add_image(f'wave_fake/{step}', plot_waveform_to_numpy(fake), step) 31 | self.add_image(f'spec_real/{step}', plot_spectrogram_to_numpy(spec_real), step) 32 | self.add_image(f'wave_real/{step}', plot_waveform_to_numpy(real), step) 33 | 34 | self.add_audio(f'fake/{step}', fake, step, self.sample_rate) 35 | self.add_audio(f'real/{step}', real, step, self.sample_rate) 36 | 37 | def log_histogram(self, model, step): 38 | for tag, value in model.named_parameters(): 39 | self.add_histogram(tag.replace('.', '/'), value.cpu().detach().numpy(), step) 40 | -------------------------------------------------------------------------------- /vits_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | sovits5.0_bigvgan_mix_v2.pth 4 | -------------------------------------------------------------------------------- /whisper/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayVoice/whisper-vits-svc/b95a84954d74f651982c0ecea5f7eb67f9c02d46/whisper/__init__.py -------------------------------------------------------------------------------- /whisper/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import lru_cache 3 | from typing import Union 4 | 5 | import librosa 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | from .utils import exact_div 11 | 12 | from librosa.filters import mel as librosa_mel_fn 13 | 14 | # hard-coded audio hyperparameters 15 | SAMPLE_RATE = 16000 16 | N_FFT = 400 17 | N_MELS = 80 18 | HOP_LENGTH = 160 19 | CHUNK_LENGTH = 30 20 | N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk 21 | N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input 22 | 23 | 24 | def load_audio(file: str, sr: int = SAMPLE_RATE): 25 | x, sr = librosa.load(file, sr=sr) 26 | return x 27 | 28 | 29 | def pad_or_trim(array, length_max: int = N_SAMPLES, length_min: int = N_SAMPLES // 2, *, axis: int = -1): 30 | """ 31 | Pad or trim the audio array to N_SAMPLES, as expected by the encoder. 32 | """ 33 | if torch.is_tensor(array): 34 | if array.shape[axis] > length_max: 35 | array = array.index_select(dim=axis, index=torch.arange(length_max, device=array.device)) 36 | 37 | if array.shape[axis] < length_min: 38 | pad_widths = [(0, 0)] * array.ndim 39 | pad_widths[axis] = (0, length_min - array.shape[axis]) 40 | array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes]) 41 | else: 42 | if array.shape[axis] > length_max: 43 | array = array.take(indices=range(length_max), axis=axis) 44 | 45 | if array.shape[axis] < length_min: 46 | pad_widths = [(0, 0)] * array.ndim 47 | pad_widths[axis] = (0, length_min - array.shape[axis]) 48 | array = np.pad(array, pad_widths) 49 | 50 | return array 51 | 52 | 53 | @lru_cache(maxsize=None) 54 | def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: 55 | """ 56 | load the mel filterbank matrix for projecting STFT into a Mel spectrogram. 
57 | Allows decoupling librosa dependency; saved using: 58 | 59 | np.savez_compressed( 60 | "mel_filters.npz", 61 | mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), 62 | ) 63 | """ 64 | assert n_mels == 80, f"Unsupported n_mels: {n_mels}" 65 | return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device) 66 | 67 | 68 | def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS): 69 | """ 70 | Compute the log-Mel spectrogram of 71 | 72 | Parameters 73 | ---------- 74 | audio: Union[str, np.ndarray, torch.Tensor], shape = (*) 75 | The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz 76 | 77 | n_mels: int 78 | The number of Mel-frequency filters, only 80 is supported 79 | 80 | Returns 81 | ------- 82 | torch.Tensor, shape = (80, n_frames) 83 | A Tensor that contains the Mel spectrogram 84 | """ 85 | if not torch.is_tensor(audio): 86 | if isinstance(audio, str): 87 | audio = load_audio(audio) 88 | audio = torch.from_numpy(audio) 89 | 90 | window = torch.hann_window(N_FFT).to(audio.device) 91 | stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True) 92 | magnitudes = stft[..., :-1].abs() ** 2 93 | 94 | filters = mel_filters(audio.device, n_mels) 95 | mel_spec = filters @ magnitudes 96 | 97 | log_spec = torch.clamp(mel_spec, min=1e-10).log10() 98 | log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) 99 | log_spec = (log_spec + 4.0) / 4.0 100 | return log_spec 101 | -------------------------------------------------------------------------------- /whisper/inference.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import numpy as np 4 | import argparse 5 | import torch 6 | 7 | from whisper.model import Whisper, ModelDimensions 8 | from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram 9 | 10 | 11 | def load_model(path, device) -> Whisper: 12 | checkpoint = torch.load(path, map_location="cpu") 13 | dims = ModelDimensions(**checkpoint["dims"]) 14 | # print(dims) 15 | model = Whisper(dims) 16 | del model.decoder 17 | cut = len(model.encoder.blocks) // 4 18 | cut = -1 * cut 19 | del model.encoder.blocks[cut:] 20 | model.load_state_dict(checkpoint["model_state_dict"], strict=False) 21 | model.eval() 22 | if not (device == "cpu"): 23 | model.half() 24 | model.to(device) 25 | # torch.save({ 26 | # 'dims': checkpoint["dims"], 27 | # 'model_state_dict': model.state_dict(), 28 | # }, "large-v2.pt") 29 | return model 30 | 31 | 32 | def pred_ppg(whisper: Whisper, wavPath, ppgPath, device): 33 | audio = load_audio(wavPath) 34 | audln = audio.shape[0] 35 | ppg_a = [] 36 | idx_s = 0 37 | while (idx_s + 15 * 16000 < audln): 38 | short = audio[idx_s:idx_s + 15 * 16000] 39 | idx_s = idx_s + 15 * 16000 40 | ppgln = 15 * 16000 // 320 41 | # short = pad_or_trim(short) 42 | mel = log_mel_spectrogram(short).to(device) 43 | if not (device == "cpu"): 44 | mel = mel.half() 45 | with torch.no_grad(): 46 | mel = mel + torch.randn_like(mel) * 0.1 47 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 48 | ppg = ppg[:ppgln,] # [length, dim=1024] 49 | ppg_a.extend(ppg) 50 | if (idx_s < audln): 51 | short = audio[idx_s:audln] 52 | ppgln = (audln - idx_s) // 320 53 | # short = pad_or_trim(short) 54 | mel = log_mel_spectrogram(short).to(device) 55 | if not (device == "cpu"): 56 | mel = mel.half() 57 | with 
torch.no_grad(): 58 | mel = mel + torch.randn_like(mel) * 0.1 59 | ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() 60 | ppg = ppg[:ppgln,] # [length, dim=1024] 61 | ppg_a.extend(ppg) 62 | np.save(ppgPath, ppg_a, allow_pickle=False) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True) 68 | parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True) 69 | args = parser.parse_args() 70 | print(args.wav) 71 | print(args.ppg) 72 | 73 | wavPath = args.wav 74 | ppgPath = args.ppg 75 | 76 | device = "cuda" if torch.cuda.is_available() else "cpu" 77 | whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device) 78 | pred_ppg(whisper, wavPath, ppgPath, device) 79 | -------------------------------------------------------------------------------- /whisper/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import zlib 5 | from typing import Callable, TextIO 6 | 7 | system_encoding = sys.getdefaultencoding() 8 | 9 | if system_encoding != "utf-8": 10 | def make_safe(string): 11 | # replaces any character not representable using the system default encoding with an '?', 12 | # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729). 13 | return string.encode(system_encoding, errors="replace").decode(system_encoding) 14 | else: 15 | def make_safe(string): 16 | # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding 17 | return string 18 | 19 | 20 | def exact_div(x, y): 21 | assert x % y == 0 22 | return x // y 23 | 24 | 25 | def str2bool(string): 26 | str2val = {"True": True, "False": False} 27 | if string in str2val: 28 | return str2val[string] 29 | else: 30 | raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") 31 | 32 | 33 | def optional_int(string): 34 | return None if string == "None" else int(string) 35 | 36 | 37 | def optional_float(string): 38 | return None if string == "None" else float(string) 39 | 40 | 41 | def compression_ratio(text) -> float: 42 | text_bytes = text.encode("utf-8") 43 | return len(text_bytes) / len(zlib.compress(text_bytes)) 44 | 45 | 46 | def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): 47 | assert seconds >= 0, "non-negative timestamp expected" 48 | milliseconds = round(seconds * 1000.0) 49 | 50 | hours = milliseconds // 3_600_000 51 | milliseconds -= hours * 3_600_000 52 | 53 | minutes = milliseconds // 60_000 54 | milliseconds -= minutes * 60_000 55 | 56 | seconds = milliseconds // 1_000 57 | milliseconds -= seconds * 1_000 58 | 59 | hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" 60 | return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" 61 | 62 | 63 | class ResultWriter: 64 | extension: str 65 | 66 | def __init__(self, output_dir: str): 67 | self.output_dir = output_dir 68 | 69 | def __call__(self, result: dict, audio_path: str): 70 | audio_basename = os.path.basename(audio_path) 71 | output_path = os.path.join(self.output_dir, audio_basename + "." 
+ self.extension) 72 | 73 | with open(output_path, "w", encoding="utf-8") as f: 74 | self.write_result(result, file=f) 75 | 76 | def write_result(self, result: dict, file: TextIO): 77 | raise NotImplementedError 78 | 79 | 80 | class WriteTXT(ResultWriter): 81 | extension: str = "txt" 82 | 83 | def write_result(self, result: dict, file: TextIO): 84 | for segment in result["segments"]: 85 | print(segment['text'].strip(), file=file, flush=True) 86 | 87 | 88 | class WriteVTT(ResultWriter): 89 | extension: str = "vtt" 90 | 91 | def write_result(self, result: dict, file: TextIO): 92 | print("WEBVTT\n", file=file) 93 | for segment in result["segments"]: 94 | print( 95 | f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" 96 | f"{segment['text'].strip().replace('-->', '->')}\n", 97 | file=file, 98 | flush=True, 99 | ) 100 | 101 | 102 | class WriteSRT(ResultWriter): 103 | extension: str = "srt" 104 | 105 | def write_result(self, result: dict, file: TextIO): 106 | for i, segment in enumerate(result["segments"], start=1): 107 | # write srt lines 108 | print( 109 | f"{i}\n" 110 | f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " 111 | f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" 112 | f"{segment['text'].strip().replace('-->', '->')}\n", 113 | file=file, 114 | flush=True, 115 | ) 116 | 117 | 118 | class WriteTSV(ResultWriter): 119 | """ 120 | Write a transcript to a file in TSV (tab-separated values) format containing lines like: 121 | \t\t 122 | 123 | Using integer milliseconds as start and end times means there's no chance of interference from 124 | an environment setting a language encoding that causes the decimal in a floating point number 125 | to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. 126 | """ 127 | extension: str = "tsv" 128 | 129 | def write_result(self, result: dict, file: TextIO): 130 | print("start", "end", "text", sep="\t", file=file) 131 | for segment in result["segments"]: 132 | print(round(1000 * segment['start']), file=file, end="\t") 133 | print(round(1000 * segment['end']), file=file, end="\t") 134 | print(segment['text'].strip().replace("\t", " "), file=file, flush=True) 135 | 136 | 137 | class WriteJSON(ResultWriter): 138 | extension: str = "json" 139 | 140 | def write_result(self, result: dict, file: TextIO): 141 | json.dump(result, file) 142 | 143 | 144 | def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]: 145 | writers = { 146 | "txt": WriteTXT, 147 | "vtt": WriteVTT, 148 | "srt": WriteSRT, 149 | "tsv": WriteTSV, 150 | "json": WriteJSON, 151 | } 152 | 153 | if output_format == "all": 154 | all_writers = [writer(output_dir) for writer in writers.values()] 155 | 156 | def write_all(result: dict, file: TextIO): 157 | for writer in all_writers: 158 | writer(result, file) 159 | 160 | return write_all 161 | 162 | return writers[output_format](output_dir) 163 | 164 | -------------------------------------------------------------------------------- /whisper_pretrain/README.md: -------------------------------------------------------------------------------- 1 | Path for: 2 | 3 | large-v2.pt 4 | --------------------------------------------------------------------------------
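A minimal usage sketch for the whisper/inference.py helpers shown above, assuming large-v2.pt has been placed under whisper_pretrain/ as this README notes; the wav and output paths are illustrative:

import os
import torch

from whisper.inference import load_model, pred_ppg

device = "cuda" if torch.cuda.is_available() else "cpu"
# checkpoint location matches the default used in whisper/inference.py's __main__ block
whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"), device)
# extracts a PPG for a single wav file and saves it as a .npy array
pred_ppg(whisper, "some_input.wav", "some_input.ppg.npy", device)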