├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── configs ├── common.yaml ├── config-online-language-emb.yaml ├── config-online-spin-language-emb.yaml ├── config-online-spin.yaml ├── config-online.yaml └── config.yaml ├── data_utils.py ├── features.py ├── losses.py ├── mel_processing.py ├── models ├── __init__.py ├── clova │ ├── SpeakerNet.py │ └── models │ │ ├── RawNet3.py │ │ ├── RawNetBasicBlock.py │ │ ├── ResNetBlocks.py │ │ ├── ResNetSE34L.py │ │ ├── ResNetSE34V2.py │ │ ├── VGGVox.py │ │ ├── byol.py │ │ ├── ssl_singer_identity │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── convert_checkpoint.py │ │ ├── environment.yml │ │ ├── eval.py │ │ ├── metadata │ │ │ ├── img │ │ │ │ ├── byol.png │ │ │ │ ├── full_diagram.png │ │ │ │ ├── isolated.png │ │ │ │ ├── pipeline.png │ │ │ │ └── techniques_.png │ │ │ ├── m4singer_renamed_split_4s │ │ │ │ └── speaker_pairs.txt │ │ │ └── vocalset_renamed_split_4s │ │ │ │ └── speaker_pairs.txt │ │ ├── preprocess │ │ │ ├── create_speaker_pairs.py │ │ │ └── preprocess_dataset.py │ │ ├── singer_identity │ │ │ ├── __init__.py │ │ │ ├── callbacks │ │ │ │ ├── evaluation.py │ │ │ │ └── ma_updates.py │ │ │ ├── losses.py │ │ │ ├── model.py │ │ │ ├── models │ │ │ │ ├── byol.py │ │ │ │ └── network_components.py │ │ │ ├── train_configs │ │ │ │ ├── README.md │ │ │ │ ├── byol.yaml │ │ │ │ ├── common.yaml │ │ │ │ ├── contrastive-vc.yaml │ │ │ │ ├── contrastive.yaml │ │ │ │ ├── contrastive_test.yaml │ │ │ │ ├── uniformity-alignment.yaml │ │ │ │ └── vicreg.yaml │ │ │ ├── trainer.py │ │ │ ├── trainer_byol.py │ │ │ └── utils │ │ │ │ ├── core.py │ │ │ │ ├── data_utils.py │ │ │ │ └── fetch_pretrained.py │ │ └── train.py │ │ └── weights │ │ └── RawNet3 │ │ ├── .gitattributes │ │ └── README.md ├── commons.py ├── content_extractors.py ├── f0_predictor │ ├── CrepeF0Predictor.py │ ├── DioF0Predictor.py │ ├── F0Predictor.py │ ├── FCPEF0Predictor.py │ ├── HarvestF0Predictor.py │ ├── PMF0Predictor.py │ ├── RMVPEF0Predictor.py │ ├── __init__.py │ ├── crepe.py │ ├── fcpe │ │ ├── __init__.py │ │ ├── model.py │ │ ├── nvSTFT.py │ │ └── pcmer.py │ └── rmvpe │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── deepunet.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── seq.py │ │ ├── spec.py │ │ └── utils.py ├── hifigan │ ├── __init__.py │ ├── config.json │ ├── generator_v1.txt │ └── models.py ├── models.py ├── modules.py ├── so_vits_svc.py ├── speaker_encoder │ ├── __init__.py │ ├── audio.py │ ├── compute_embed.py │ ├── config.py │ ├── hparams.py │ ├── inference.py │ ├── model.py │ ├── params_data.py │ ├── params_model.py │ ├── preprocess.py │ ├── train.py │ ├── visualizations.py │ └── voice_encoder.py ├── speaker_encoders.py ├── spin │ ├── __init__.py │ ├── spin.yaml │ └── src │ │ ├── data │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── dataset.py │ │ ├── librispeech.py │ │ └── sampler.py │ │ ├── model │ │ ├── __init__.py │ │ ├── base.py │ │ └── spin.py │ │ ├── nn │ │ ├── __init__.py │ │ ├── dnn.py │ │ ├── hubert.py │ │ ├── swav_vq_dis.py │ │ └── wavlm.py │ │ ├── task │ │ ├── __init__.py │ │ └── train_spin.py │ │ └── util │ │ ├── __init__.py │ │ ├── log.py │ │ ├── model_utils.py │ │ ├── padding.py │ │ ├── pnmi.py │ │ └── scheduler.py ├── ssl_singer_identity │ ├── LICENSE │ ├── README.md │ ├── environment.yml │ ├── eval.py │ ├── metadata │ │ ├── img │ │ │ ├── byol.png │ │ │ └── pipeline.png │ │ ├── m4singer_renamed_split_4s │ │ │ └── speaker_pairs.txt │ │ └── vocalset_renamed_split_4s │ │ │ └── speaker_pairs.txt │ ├── preprocess │ │ ├── create_speaker_pairs.py │ │ └── 
preprocess_dataset.py │ ├── singer_identity │ │ ├── __init__.py │ │ ├── callbacks │ │ │ ├── evaluation.py │ │ │ └── ma_updates.py │ │ ├── losses.py │ │ ├── model.py │ │ ├── models │ │ │ ├── byol.py │ │ │ └── network_components.py │ │ ├── train_configs │ │ │ ├── README.md │ │ │ ├── byol.yaml │ │ │ ├── common.yaml │ │ │ ├── contrastive-vc.yaml │ │ │ ├── contrastive.yaml │ │ │ ├── contrastive_test.yaml │ │ │ ├── uniformity-alignment.yaml │ │ │ └── vicreg.yaml │ │ ├── trainer.py │ │ ├── trainer_byol.py │ │ └── utils │ │ │ ├── core.py │ │ │ ├── data_utils.py │ │ │ └── fetch_pretrained.py │ └── train.py └── wavlm │ ├── WavLM-Large.pt.txt │ ├── WavLM.py │ ├── __init__.py │ └── modules.py ├── requirements.txt ├── resources └── freesvc.png ├── scripts ├── convert.py ├── convert.txt ├── convert_dir_vad.py ├── downsample.py ├── inference.py ├── inference_parallel.py ├── inference_sample.py ├── prepare_nus_dataset.sh ├── prepare_nus_dataset_vad.sh ├── prepare_pop_dataset.sh ├── prepare_vctk_dataset.sh ├── preprocess_content.py ├── preprocess_flist.py ├── preprocess_pitch.py ├── preprocess_spk.py ├── preprocess_sr.py ├── run_inference.sh ├── run_inference_parallel.sh └── segment_vad.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Checkpoints 2 | **/ckpt/ 3 | 4 | # Dataset 5 | dataset 6 | 7 | # Model weights 8 | WavLM-Large.pt 9 | generator_v1 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 
105 | #Pipfile.lock 106 | 107 | # poetry 108 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 109 | # This is especially recommended for binary packages to ensure reproducibility, and is more 110 | # commonly ignored for libraries. 111 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 112 | #poetry.lock 113 | 114 | # pdm 115 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 116 | #pdm.lock 117 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 118 | # in version control. 119 | # https://pdm.fming.dev/#use-with-ide 120 | .pdm.toml 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | 172 | /data/ 173 | logs/ 174 | spin.ckpt -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime 2 | 3 | FROM nvcr.io/nvidia/pytorch:23.12-py3 4 | RUN apt update && \ 5 | apt -y install git libsndfile1-dev ffmpeg 6 | 7 | # RUN python3 -m pip install --upgrade pip 8 | 9 | # RUN python3 -m pip install torchaudio==0.13.1 -f https://download.pytorch.org/whl/cu116 10 | 11 | COPY requirements.txt . 12 | RUN python3 -m pip install -r requirements.txt 13 | 14 | # Install fairseq (not necessary now) 15 | # RUN git clone https://github.com/facebookresearch/fairseq.git && \ 16 | # cd fairseq && \ 17 | # git checkout 05255f9 && \ 18 | # python3 setup.py build_ext --inplace && \ 19 | # python3 -m pip install -e . && \ 20 | # python3 setup.py build develop 21 | 22 | # RUN python3 -m pip install numpy --upgrade && python3 -m pip install numba 23 | 24 | # Setup working directory 25 | ARG WORKSPACE=/workspace 26 | RUN mkdir -p /${WORKSPACE} 27 | WORKDIR ${WORKSPACE} 28 | COPY . 
${WORKSPACE}/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jingyi Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/common.yaml: -------------------------------------------------------------------------------- 1 | path: ./logs/${hydra.job.config_name}/${now:%Y-%m-%d}/${now:%H-%M-%S} 2 | 3 | log_level: INFO 4 | seed: 1 5 | tb_log_dir: tensorboard 6 | tqdm: true 7 | 8 | hydra: 9 | run: 10 | dir: ${path} 11 | job_logging: 12 | formatters: 13 | colorlog: 14 | format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s:%(lineno)s:%(funcName)s()%(reset)s][%(log_color)s%(levelname)s%(reset)s] 15 | - %(message)s' 16 | handlers: 17 | file: 18 | filename: ${hydra.run.dir}/${hydra.job.name}_${now:%Y-%m-%d}_${now:%H-%M-%S}.log 19 | 20 | defaults: 21 | - override hydra/job_logging: colorlog 22 | - override hydra/hydra_logging: colorlog -------------------------------------------------------------------------------- /configs/config-online-language-emb.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_lang_emb: true 7 | num_langs: 11 8 | lang_dim: 192 # same size as hidden_channels to facilitate the addition of the conditioning 9 | lang2id: 10 | chinese: 0 11 | dutch: 1 12 | english: 2 13 | french: 3 14 | german: 4 15 | italian: 5 16 | japanese: 6 17 | other: 7 18 | polish: 8 19 | portuguese: 9 20 | spanish: 10 21 | use_spk_emb: false 22 | spk_embeddings_dir: null # compute on forward (model) 23 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 24 | content_encoder_type: null # compute on forward (model) | hubert 25 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 26 | 27 | model: 28 | use_spk_emb: true 29 | spk_encoder_type: ECAPA2SpeakerEncoder16k 30 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 31 | content_encoder_type: hubert # or wavlm 32 | content_encoder_ckpt: lengyue233/content-vec-best # or models/wavlm/WavLM-Large.pt 33 | -------------------------------------------------------------------------------- /configs/config-online-spin-language-emb.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_lang_emb: true 7 | num_langs: 11 8 | lang_dim: 192 # same size as hidden_channels to facilitate the addition 9 | lang2id: 10 | chinese: 0 11 | dutch: 1 12 | english: 2 13 | french: 3 14 | german: 4 15 | italian: 5 16 | japanese: 6 17 | other: 7 18 | polish: 8 19 | portuguese: 9 20 | spanish: 10 21 | use_spk_emb: false 22 | spk_embeddings_dir: null # compute on forward (model) 23 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 24 | content_encoder_type: null # compute on forward (model) | hubert 25 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 26 | 27 | model: 28 | use_spk_emb: true 29 | spk_encoder_type: ECAPA2SpeakerEncoder16k 30 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 31 | content_encoder_type: spin # hubert | wavlm | spin 32 | content_encoder_config: models/spin/spin.yaml # path to the config file for the content encoder 33 | content_encoder_ckpt: models/spin/spin.ckpt # or models/wavlm/WavLM-Large.pt 34 | -------------------------------------------------------------------------------- /configs/config-online-spin.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_spk_emb: false 7 | spk_embeddings_dir: null # compute on forward (model) 8 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 9 | content_encoder_type: null # compute on forward (model) | hubert 10 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 11 | 12 | model: 13 | use_spk_emb: true 14 | spk_encoder_type: ECAPA2SpeakerEncoder16k 15 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 16 | content_encoder_type: spin # hubert | wavlm | spin 17 | content_encoder_config: models/spin/spin.yaml # path to the config file for the content encoder 18 | content_encoder_ckpt: models/spin/spin.ckpt # or models/wavlm/WavLM-Large.pt 19 | -------------------------------------------------------------------------------- /configs/config-online.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_spk_emb: false 7 | spk_embeddings_dir: null # compute on forward (model) 8 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 9 | content_encoder_type: null # compute on forward (model) | hubert 10 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 11 | 12 | model: 13 | use_spk_emb: true 14 | spk_encoder_type: ECAPA2SpeakerEncoder16k 15 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 16 | content_encoder_type: hubert # or wavlm 17 | content_encoder_ckpt: lengyue233/content-vec-best # or models/wavlm/WavLM-Large.pt 18 | -------------------------------------------------------------------------------- /configs/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | 4 | train: 5 | batch_size: 128 6 | betas: [0.8, 0.99] 7 | c_kl: 1.0 8 | c_mel: 45 9 | distributed: false # BUG: multi-gpu is not working 10 | use_multiprocessing: false # BUG: multi-gpu is not working 11 | epochs: 20 12 | eps: 1e-9 13 | fp16_run: false 14 | init_lr_ratio: 1 15 | raise_error: false 16 | learning_rate: 2e-4 17 | log_interval: 
10 18 | log_level: ${log_level} 19 | lr_decay: 0.98 20 | max_speclen: 128 21 | port: 8005 22 | resume_training: false # set to false to finetune from a model 23 | seed: 1234 24 | segment_size: 8960 25 | use_sr: false 26 | valid_epoch_interval: 1 27 | valid_steps_interval: 1000 28 | save_epoch_interval: 10 29 | save_steps_interval: 1000 30 | warmup_epochs: 0 31 | # weighted_batch_speaker_sampling : false 32 | # weighted_batch_lang_sampling : false 33 | weighted_batch_speaker_sampling : 0.5 34 | weighted_batch_lang_sampling : 0.5 35 | 36 | data: 37 | dataset_dir: /raid/lucasgris/free-svc/data 38 | filter_length: 1280 39 | hop_length: 320 40 | max_wav_value: 32768.0 41 | mel_fmax: null 42 | mel_fmin: 0.0 43 | n_mel_channels: 80 44 | num_workers: 64 45 | # For pitch extraction, set the pitch_predictor (will compute in dataloader) or pitch_features_dir (will load from disk) 46 | pitch_predictor: rmvpe # pm | crepe | harvest | dio | rmvpe | fcpe 47 | pitch_features_dir: ${data.dataset_dir}/pitch_features/ 48 | sampling_rate: 24000 49 | spectrogram_dir: null #${data.dataset_dir}/spectrograms # it is recommended NOT to use if you have small disk space 50 | # For speaker embedding extraction, set the use_spk_emb to True and spk_embeddings_dir (will load from disk) or configure the model to compute it on forward 51 | use_spk_emb: true 52 | spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings 53 | # SR augmentation is deprecated, set use_sr to False 54 | sr_min_max: [68, 92] 55 | # For content feature extraction, set the content_feature_dir (will load from disk) or configure the model to compute it on forward 56 | content_feature_dir: null 57 | training_files: data/train.csv 58 | validation_files: data/valid.csv 59 | win_length: 1280 60 | 61 | model: 62 | save_dir: null 63 | filter_channels: 768 64 | finetune_from_model: 65 | discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth 66 | generator: /raid/lucasgris/free-svc/freevc-24.pth 67 | hidden_channels: 192 68 | inter_channels: 192 69 | kernel_size: 3 70 | n_heads: 2 71 | n_layers_q: 3 72 | n_layers: 6 73 | p_dropout: 0.1 74 | resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] 75 | resblock_kernel_sizes: [3,7,11] 76 | resblock: 1 77 | c_dim: 768 78 | upsample_initial_channel: 512 79 | upsample_kernel_sizes: [16,16,4,4] 80 | upsample_rates: [10,8,2,2] 81 | use_spectral_norm: false 82 | freeze_external_spk: true 83 | device: cuda 84 | # For online speaker embedding extraction, set the use_spk_emb to True and spk_encoder_type 85 | use_spk_emb: false 86 | gin_channels: null # gin_channels = spk_encoder.embedding_dim 87 | spk_encoder_type: null # ECAPA2SpeakerEncoder16k | 88 | # For content feature extraction, set the content_encoder_type and content_encoder_ckpt 89 | content_encoder_type: null # load from disk (data) - hubert | wavlm 90 | content_encoder_ckpt: null # load from disk (data) - [path] | models/wavlm/WavLM-Large.pt | lengyue233/content-vec-best 91 | post_content_encoder_type: vits-encoder-with-uv-emb # or freevc-bottleneck 92 | coarse_f0: true 93 | cond_f0_on_flow: false 94 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def feature_loss(fmap_r, fmap_g): 6 | loss = 0 7 | for dr, dg in zip(fmap_r, fmap_g): 8 | for rl, gl in zip(dr, dg): 9 | rl = rl.float().detach() 10 | gl = gl.float() 11 | loss += torch.mean(torch.abs(rl - gl)) 12 | 13 | 
return loss * 2 14 | 15 | 16 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 17 | loss = 0 18 | r_losses = [] 19 | g_losses = [] 20 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 21 | dr = dr.float() 22 | dg = dg.float() 23 | r_loss = torch.mean((1-dr)**2) 24 | g_loss = torch.mean(dg**2) 25 | loss += (r_loss + g_loss) 26 | r_losses.append(r_loss.item()) 27 | g_losses.append(g_loss.item()) 28 | 29 | return loss, r_losses, g_losses 30 | 31 | 32 | def generator_loss(disc_outputs): 33 | loss = 0 34 | gen_losses = [] 35 | for dg in disc_outputs: 36 | dg = dg.float() 37 | l = torch.mean((1-dg)**2) 38 | gen_losses.append(l) 39 | loss += l 40 | 41 | return loss, gen_losses 42 | 43 | 44 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 45 | """ 46 | z_p, logs_q: [b, h, t_t] 47 | m_p, logs_p: [b, h, t_t] 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | # print(logs_p) 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | l = kl / torch.sum(z_mask) 59 | return l 60 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | output = dynamic_range_compression_torch(magnitudes) 31 | return output 32 | 33 | 34 | def spectral_de_normalize_torch(magnitudes): 35 | output = dynamic_range_decompression_torch(magnitudes) 36 | return output 37 | 38 | 39 | class MelProcessing: 40 | 41 | def __init__(self, mel_basis={}, hann_window={}): 42 | self.mel_basis = mel_basis 43 | self.hann_window = hann_window 44 | 45 | # TODO: sample rate is not used 46 | def spectrogram_torch(self, y, n_fft, sampling_rate, hop_size, win_size, center=False): 47 | if torch.min(y) < -1.: 48 | print('min value is ', torch.min(y)) 49 | if torch.max(y) > 1.: 50 | print('max value is ', torch.max(y)) 51 | 52 | dtype_device = str(y.dtype) + '_' + str(y.device) 53 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 54 | if wnsize_dtype_device not in self.hann_window: 55 | self.hann_window[wnsize_dtype_device] = torch.hann_window( 56 | win_size).to(dtype=y.dtype, device=y.device) 57 | 58 | y = torch.nn.functional.pad(y.unsqueeze( 59 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 63 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 64 | 65 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 66 | return spec 67 | 68 | def spec_to_mel_torch(self, spec, n_fft, num_mels, sampling_rate, fmin, fmax): 69 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 70 | fmax_dtype_device = 
str(fmax) + '_' + dtype_device 71 | if fmax_dtype_device not in self.mel_basis: 72 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 73 | n_mels=num_mels, fmin=fmin, fmax=fmax) 74 | self.mel_basis[fmax_dtype_device] = torch.from_numpy( 75 | mel).to(dtype=spec.dtype, device=spec.device) 76 | spec = torch.matmul(self.mel_basis[fmax_dtype_device], spec) 77 | spec = spectral_normalize_torch(spec) 78 | return spec 79 | 80 | def mel_spectrogram_torch(self, y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 81 | if torch.min(y) < -1.: 82 | logger.debug('min value is ', torch.min(y)) 83 | if torch.max(y) > 1.: 84 | logger.debug('max value is ', torch.max(y)) 85 | 86 | dtype_device = str(y.dtype) + '_' + str(y.device) 87 | fmax_dtype_device = str(fmax) + '_' + dtype_device 88 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 89 | if fmax_dtype_device not in self.mel_basis: 90 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 91 | n_mels=num_mels, fmin=fmin, fmax=fmax) 92 | self.mel_basis[fmax_dtype_device] = torch.from_numpy( 93 | mel).to(dtype=y.dtype, device=y.device) 94 | if wnsize_dtype_device not in self.hann_window: 95 | self.hann_window[wnsize_dtype_device] = torch.hann_window( 96 | win_size).to(dtype=y.dtype, device=y.device) 97 | 98 | y = torch.nn.functional.pad(y.unsqueeze( 99 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 100 | y = y.squeeze(1) 101 | 102 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 103 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 104 | 105 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 106 | 107 | spec = torch.matmul(self.mel_basis[fmax_dtype_device], spec) 108 | spec = spectral_normalize_torch(spec) 109 | 110 | return spec 111 | 112 | mel_processing = MelProcessing() -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import SynthesizerTrn 2 | from .models import MultiPeriodDiscriminator -------------------------------------------------------------------------------- /models/clova/SpeakerNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import importlib 4 | 5 | 6 | class WrappedModel(nn.Module): 7 | 8 | ## The purpose of this wrapper is to make the model structure consistent between single and multi-GPU 9 | 10 | def __init__(self, model): 11 | super(WrappedModel, self).__init__() 12 | self.module = model 13 | 14 | def forward(self, x, label=None): 15 | return self.module(x, label) 16 | 17 | 18 | class SpeakerNet(nn.Module): 19 | def __init__(self, model, **kwargs): 20 | super(SpeakerNet, self).__init__() 21 | 22 | if type(model) == str: 23 | SpeakerNetModel = importlib.import_module(".models." 
+ model).__getattribute__("MainModel") 24 | else: 25 | SpeakerNetModel = model 26 | self.model = SpeakerNetModel(**kwargs) 27 | 28 | def forward(self, data, label=None): 29 | 30 | data = data.reshape(-1, data.size()[-1]).cuda() 31 | outp = self.model.forward(data) 32 | return outp 33 | 34 | def loadParameters(self, path): 35 | print("Loading pretrained model from %s" % (path)) 36 | pretrained_dict = torch.load(path) 37 | model_dict = self.model.state_dict() 38 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 39 | model_dict.update(pretrained_dict) 40 | self.model.load_state_dict(model_dict) 41 | print("Pretrained model is loaded.") 42 | -------------------------------------------------------------------------------- /models/clova/models/RawNet3.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | from asteroid_filterbanks import Encoder, ParamSincFB 6 | 7 | from .RawNetBasicBlock import Bottle2neck, PreEmphasis 8 | 9 | 10 | class RawNet3(nn.Module): 11 | def __init__(self, block, model_scale, context, summed, C=1024, **kwargs): 12 | super().__init__() 13 | 14 | nOut = kwargs["nOut"] 15 | 16 | self.context = context 17 | self.encoder_type = kwargs["encoder_type"] 18 | self.log_sinc = kwargs["log_sinc"] 19 | self.norm_sinc = kwargs["norm_sinc"] 20 | self.out_bn = kwargs["out_bn"] 21 | self.summed = summed 22 | 23 | self.preprocess = nn.Sequential( 24 | PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True) 25 | ) 26 | self.conv1 = Encoder( 27 | ParamSincFB( 28 | C // 4, 29 | 251, 30 | stride=kwargs["sinc_stride"], 31 | ) 32 | ) 33 | self.relu = nn.ReLU() 34 | self.bn1 = nn.BatchNorm1d(C // 4) 35 | 36 | self.layer1 = block( 37 | C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5 38 | ) 39 | self.layer2 = block( 40 | C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3 41 | ) 42 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale) 43 | self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1) 44 | 45 | if self.context: 46 | attn_input = 1536 * 3 47 | else: 48 | attn_input = 1536 49 | print("self.encoder_type", self.encoder_type) 50 | if self.encoder_type == "ECA": 51 | attn_output = 1536 52 | elif self.encoder_type == "ASP": 53 | attn_output = 1 54 | else: 55 | raise ValueError("Undefined encoder") 56 | 57 | self.attention = nn.Sequential( 58 | nn.Conv1d(attn_input, 128, kernel_size=1), 59 | nn.ReLU(), 60 | nn.BatchNorm1d(128), 61 | nn.Conv1d(128, attn_output, kernel_size=1), 62 | nn.Softmax(dim=2), 63 | ) 64 | 65 | self.bn5 = nn.BatchNorm1d(3072) 66 | 67 | self.fc6 = nn.Linear(3072, nOut) 68 | self.bn6 = nn.BatchNorm1d(nOut) 69 | 70 | self.mp3 = nn.MaxPool1d(3) 71 | 72 | def forward(self, x): 73 | """ 74 | :param x: input mini-batch (bs, samp) 75 | """ 76 | 77 | with torch.cuda.amp.autocast(enabled=False): 78 | x = self.preprocess(x) 79 | x = torch.abs(self.conv1(x)) 80 | if self.log_sinc: 81 | x = torch.log(x + 1e-6) 82 | if self.norm_sinc == "mean": 83 | x = x - torch.mean(x, dim=-1, keepdim=True) 84 | elif self.norm_sinc == "mean_std": 85 | m = torch.mean(x, dim=-1, keepdim=True) 86 | s = torch.std(x, dim=-1, keepdim=True) 87 | s[s < 0.001] = 0.001 88 | x = (x - m) / s 89 | 90 | if self.summed: 91 | x1 = self.layer1(x) 92 | x2 = self.layer2(x1) 93 | x3 = self.layer3(self.mp3(x1) + x2) 94 | else: 95 | x1 = self.layer1(x) 96 | x2 = self.layer2(x1) 97 | x3 = self.layer3(x2) 98 | 99 | x = 
self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1)) 100 | x = self.relu(x) 101 | 102 | t = x.size()[-1] 103 | 104 | if self.context: 105 | global_x = torch.cat( 106 | ( 107 | x, 108 | torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), 109 | torch.sqrt( 110 | torch.var(x, dim=2, keepdim=True).clamp( 111 | min=1e-4, max=1e4 112 | ) 113 | ).repeat(1, 1, t), 114 | ), 115 | dim=1, 116 | ) 117 | else: 118 | global_x = x 119 | 120 | w = self.attention(global_x) 121 | 122 | mu = torch.sum(x * w, dim=2) 123 | sg = torch.sqrt( 124 | (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) 125 | ) 126 | 127 | x = torch.cat((mu, sg), 1) 128 | 129 | x = self.bn5(x) 130 | 131 | x = self.fc6(x) 132 | 133 | if self.out_bn: 134 | x = self.bn6(x) 135 | 136 | return x 137 | 138 | 139 | def MainModel(**kwargs): 140 | 141 | model = RawNet3( 142 | Bottle2neck, model_scale=8, context=True, summed=True, out_bn=False, log_sinc=True, norm_sinc="mean", grad_mult=1, **kwargs 143 | ) 144 | return model 145 | -------------------------------------------------------------------------------- /models/clova/models/RawNetBasicBlock.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class PreEmphasis(torch.nn.Module): 9 | def __init__(self, coef: float = 0.97) -> None: 10 | super().__init__() 11 | self.coef = coef 12 | # make kernel 13 | # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped. 14 | self.register_buffer( 15 | "flipped_filter", 16 | torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0), 17 | ) 18 | 19 | def forward(self, input: torch.tensor) -> torch.tensor: 20 | assert ( 21 | len(input.size()) == 2 22 | ), "The number of dimensions of input tensor must be 2!" 23 | # reflect padding to match lengths of in/out 24 | input = input.unsqueeze(1) 25 | input = F.pad(input, (1, 0), "reflect") 26 | return F.conv1d(input, self.flipped_filter) 27 | 28 | 29 | class AFMS(nn.Module): 30 | """ 31 | Alpha-Feature map scaling, added to the output of each residual block[1,2]. 
32 | 33 | Reference: 34 | [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf 35 | [2] AMFS : https://www.koreascience.or.kr/article/JAKO202029757857763.page 36 | """ 37 | 38 | def __init__(self, nb_dim: int) -> None: 39 | super().__init__() 40 | self.alpha = nn.Parameter(torch.ones((nb_dim, 1))) 41 | self.fc = nn.Linear(nb_dim, nb_dim) 42 | self.sig = nn.Sigmoid() 43 | 44 | def forward(self, x): 45 | y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1) 46 | y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1) 47 | 48 | x = x + self.alpha 49 | x = x * y 50 | return x 51 | 52 | 53 | class Bottle2neck(nn.Module): 54 | def __init__( 55 | self, 56 | inplanes, 57 | planes, 58 | kernel_size=None, 59 | dilation=None, 60 | scale=4, 61 | pool=False, 62 | ): 63 | 64 | super().__init__() 65 | 66 | width = int(math.floor(planes / scale)) 67 | 68 | self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1) 69 | self.bn1 = nn.BatchNorm1d(width * scale) 70 | 71 | self.nums = scale - 1 72 | 73 | convs = [] 74 | bns = [] 75 | 76 | num_pad = math.floor(kernel_size / 2) * dilation 77 | 78 | for i in range(self.nums): 79 | convs.append( 80 | nn.Conv1d( 81 | width, 82 | width, 83 | kernel_size=kernel_size, 84 | dilation=dilation, 85 | padding=num_pad, 86 | ) 87 | ) 88 | bns.append(nn.BatchNorm1d(width)) 89 | 90 | self.convs = nn.ModuleList(convs) 91 | self.bns = nn.ModuleList(bns) 92 | 93 | self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1) 94 | self.bn3 = nn.BatchNorm1d(planes) 95 | 96 | self.relu = nn.ReLU() 97 | 98 | self.width = width 99 | 100 | self.mp = nn.MaxPool1d(pool) if pool else False 101 | self.afms = AFMS(planes) 102 | 103 | if inplanes != planes: # if change in number of filters 104 | self.residual = nn.Sequential( 105 | nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False) 106 | ) 107 | else: 108 | self.residual = nn.Identity() 109 | 110 | def forward(self, x): 111 | residual = self.residual(x) 112 | 113 | out = self.conv1(x) 114 | out = self.relu(out) 115 | out = self.bn1(out) 116 | 117 | spx = torch.split(out, self.width, 1) 118 | for i in range(self.nums): 119 | if i == 0: 120 | sp = spx[i] 121 | else: 122 | sp = sp + spx[i] 123 | sp = self.convs[i](sp) 124 | sp = self.relu(sp) 125 | sp = self.bns[i](sp) 126 | if i == 0: 127 | out = sp 128 | else: 129 | out = torch.cat((out, sp), 1) 130 | 131 | out = torch.cat((out, spx[self.nums]), 1) 132 | 133 | out = self.conv3(out) 134 | out = self.relu(out) 135 | out = self.bn3(out) 136 | 137 | out += residual 138 | if self.mp: 139 | out = self.mp(out) 140 | out = self.afms(out) 141 | 142 | return out 143 | -------------------------------------------------------------------------------- /models/clova/models/ResNetBlocks.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | class SEBasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 11 | super(SEBasicBlock, self).__init__() 12 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) 15 | self.bn2 = nn.BatchNorm2d(planes) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.se = SELayer(planes, reduction) 18 | self.downsample = downsample 19 | self.stride = stride 20 | 21 | def forward(self, x): 22 | residual = x 23 | 24 | out = self.conv1(x) 25 | out = self.relu(out) 26 | out = self.bn1(out) 27 | 28 | out = self.conv2(out) 29 | out = self.bn2(out) 30 | out = self.se(out) 31 | 32 | if self.downsample is not None: 33 | residual = self.downsample(x) 34 | 35 | out += residual 36 | out = self.relu(out) 37 | return out 38 | 39 | 40 | class SEBottleneck(nn.Module): 41 | expansion = 4 42 | 43 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 44 | super(SEBottleneck, self).__init__() 45 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 46 | self.bn1 = nn.BatchNorm2d(planes) 47 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 48 | padding=1, bias=False) 49 | self.bn2 = nn.BatchNorm2d(planes) 50 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 51 | self.bn3 = nn.BatchNorm2d(planes * 4) 52 | self.relu = nn.ReLU(inplace=True) 53 | self.se = SELayer(planes * 4, reduction) 54 | self.downsample = downsample 55 | self.stride = stride 56 | 57 | def forward(self, x): 58 | residual = x 59 | 60 | out = self.conv1(x) 61 | out = self.bn1(out) 62 | out = self.relu(out) 63 | 64 | out = self.conv2(out) 65 | out = self.bn2(out) 66 | out = self.relu(out) 67 | 68 | out = self.conv3(out) 69 | out = self.bn3(out) 70 | out = self.se(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | out = self.relu(out) 77 | 78 | return out 79 | 80 | 81 | class SELayer(nn.Module): 82 | def __init__(self, channel, reduction=8): 83 | super(SELayer, self).__init__() 84 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 85 | self.fc = nn.Sequential( 86 | nn.Linear(channel, channel // reduction), 87 | nn.ReLU(inplace=True), 88 | nn.Linear(channel // reduction, channel), 89 | nn.Sigmoid() 90 | ) 91 | 92 | def forward(self, x): 93 | b, c, _, _ = x.size() 94 | y = self.avg_pool(x).view(b, c) 95 | y = self.fc(y).view(b, c, 1, 1) 96 | return x * y -------------------------------------------------------------------------------- /models/clova/models/ResNetSE34L.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | from models.ResNetBlocks import * 10 | 11 | class ResNetSE(nn.Module): 12 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 13 | super(ResNetSE, self).__init__() 14 | 15 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 16 | 17 | self.inplanes = num_filters[0] 18 | self.encoder_type = encoder_type 19 | self.n_mels = n_mels 20 | self.log_input = log_input 21 | 22 | self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=7, stride=(2, 1), padding=3, 23 | bias=False) 24 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 25 | self.relu = nn.ReLU(inplace=True) 26 | 27 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 28 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 29 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 30 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) 31 | 32 | self.instancenorm = nn.InstanceNorm1d(n_mels) 33 | self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, window_fn=torch.hamming_window, n_mels=n_mels) 34 | 35 | if self.encoder_type == "SAP": 36 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 37 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 38 | out_dim = num_filters[3] * block.expansion 39 | elif self.encoder_type == "ASP": 40 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 41 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 42 | out_dim = num_filters[3] * block.expansion * 2 43 | else: 44 | raise ValueError('Undefined encoder') 45 | 46 | self.fc = nn.Linear(out_dim, nOut) 47 | 48 | for m in self.modules(): 49 | if isinstance(m, nn.Conv2d): 50 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 51 | elif isinstance(m, nn.BatchNorm2d): 52 | nn.init.constant_(m.weight, 1) 53 | nn.init.constant_(m.bias, 0) 54 | 55 | def _make_layer(self, block, planes, blocks, stride=1): 56 | downsample = None 57 | if stride != 1 or self.inplanes != planes * block.expansion: 58 | downsample = nn.Sequential( 59 | nn.Conv2d(self.inplanes, planes * block.expansion, 60 | kernel_size=1, stride=stride, bias=False), 61 | nn.BatchNorm2d(planes * block.expansion), 62 | ) 63 | 64 | layers = [] 65 | layers.append(block(self.inplanes, planes, stride, downsample)) 66 | self.inplanes = planes * block.expansion 67 | for i in range(1, blocks): 68 | layers.append(block(self.inplanes, planes)) 69 | 70 | return nn.Sequential(*layers) 71 | 72 | def new_parameter(self, *size): 73 | out = nn.Parameter(torch.FloatTensor(*size)) 74 | nn.init.xavier_normal_(out) 75 | return out 76 | 77 | def forward(self, x): 78 | 79 | with torch.no_grad(): 80 | with torch.cuda.amp.autocast(enabled=False): 81 | x = self.torchfb(x)+1e-6 82 | if self.log_input: x = x.log() 83 | x = self.instancenorm(x).unsqueeze(1).detach() 84 | 85 | x = self.conv1(x) 86 | x = self.bn1(x) 87 | x = self.relu(x) 88 | 89 | x = self.layer1(x) 90 | x = self.layer2(x) 91 | x = self.layer3(x) 92 | x = self.layer4(x) 93 | 94 | x = torch.mean(x, dim=2, keepdim=True) 95 | 96 | if self.encoder_type == "SAP": 97 | x = x.permute(0,3,1,2).squeeze(-1) 98 | h = 
torch.tanh(self.sap_linear(x)) 99 | w = torch.matmul(h, self.attention).squeeze(dim=2) 100 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 101 | x = torch.sum(x * w, dim=1) 102 | elif self.encoder_type == "ASP": 103 | x = x.permute(0,3,1,2).squeeze(-1) 104 | h = torch.tanh(self.sap_linear(x)) 105 | w = torch.matmul(h, self.attention).squeeze(dim=2) 106 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 107 | mu = torch.sum(x * w, dim=1) 108 | rh = torch.sqrt( ( torch.sum((x**2) * w, dim=1) - mu**2 ).clamp(min=1e-5) ) 109 | x = torch.cat((mu,rh),1) 110 | 111 | x = x.view(x.size()[0], -1) 112 | x = self.fc(x) 113 | 114 | return x 115 | 116 | 117 | def MainModel(nOut=256, **kwargs): 118 | # Number of filters 119 | num_filters = [16, 32, 64, 128] 120 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, nOut, **kwargs) 121 | return model 122 | -------------------------------------------------------------------------------- /models/clova/models/ResNetSE34V2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | from models.ResNetBlocks import * 10 | from utils import PreEmphasis 11 | 12 | class ResNetSE(nn.Module): 13 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 14 | super(ResNetSE, self).__init__() 15 | 16 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 17 | 18 | self.inplanes = num_filters[0] 19 | self.encoder_type = encoder_type 20 | self.n_mels = n_mels 21 | self.log_input = log_input 22 | 23 | self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=3, stride=1, padding=1) 24 | self.relu = nn.ReLU(inplace=True) 25 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 26 | 27 | 28 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 29 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 30 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 31 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2)) 32 | 33 | self.instancenorm = nn.InstanceNorm1d(n_mels) 34 | self.torchfb = torch.nn.Sequential( 35 | PreEmphasis(), 36 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, window_fn=torch.hamming_window, n_mels=n_mels) 37 | ) 38 | 39 | outmap_size = int(self.n_mels/8) 40 | 41 | self.attention = nn.Sequential( 42 | nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), 43 | nn.ReLU(), 44 | nn.BatchNorm1d(128), 45 | nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), 46 | nn.Softmax(dim=2), 47 | ) 48 | 49 | if self.encoder_type == "SAP": 50 | out_dim = num_filters[3] * outmap_size 51 | elif self.encoder_type == "ASP": 52 | out_dim = num_filters[3] * outmap_size * 2 53 | else: 54 | raise ValueError('Undefined encoder') 55 | 56 | self.fc = nn.Linear(out_dim, nOut) 57 | 58 | for m in self.modules(): 59 | if isinstance(m, nn.Conv2d): 60 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 61 | elif isinstance(m, nn.BatchNorm2d): 62 | nn.init.constant_(m.weight, 1) 63 | nn.init.constant_(m.bias, 0) 64 | 65 | def _make_layer(self, block, planes, blocks, stride=1): 66 | downsample = None 67 | if stride != 1 or self.inplanes != planes * block.expansion: 68 | downsample = nn.Sequential( 69 | 
nn.Conv2d(self.inplanes, planes * block.expansion, 70 | kernel_size=1, stride=stride, bias=False), 71 | nn.BatchNorm2d(planes * block.expansion), 72 | ) 73 | 74 | layers = [] 75 | layers.append(block(self.inplanes, planes, stride, downsample)) 76 | self.inplanes = planes * block.expansion 77 | for i in range(1, blocks): 78 | layers.append(block(self.inplanes, planes)) 79 | 80 | return nn.Sequential(*layers) 81 | 82 | def new_parameter(self, *size): 83 | out = nn.Parameter(torch.FloatTensor(*size)) 84 | nn.init.xavier_normal_(out) 85 | return out 86 | 87 | def forward(self, x): 88 | 89 | with torch.no_grad(): 90 | with torch.cuda.amp.autocast(enabled=False): 91 | x = self.torchfb(x)+1e-6 92 | if self.log_input: x = x.log() 93 | x = self.instancenorm(x).unsqueeze(1) 94 | 95 | x = self.conv1(x) 96 | x = self.relu(x) 97 | x = self.bn1(x) 98 | 99 | x = self.layer1(x) 100 | x = self.layer2(x) 101 | x = self.layer3(x) 102 | x = self.layer4(x) 103 | 104 | x = x.reshape(x.size()[0],-1,x.size()[-1]) 105 | 106 | w = self.attention(x) 107 | 108 | if self.encoder_type == "SAP": 109 | x = torch.sum(x * w, dim=2) 110 | elif self.encoder_type == "ASP": 111 | mu = torch.sum(x * w, dim=2) 112 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-5) ) 113 | x = torch.cat((mu,sg),1) 114 | 115 | x = x.view(x.size()[0], -1) 116 | x = self.fc(x) 117 | 118 | return x 119 | 120 | 121 | def MainModel(nOut=256, **kwargs): 122 | # Number of filters 123 | num_filters = [32, 64, 128, 256] 124 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, nOut, **kwargs) 125 | return model 126 | 127 | -------------------------------------------------------------------------------- /models/clova/models/VGGVox.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | 10 | class MainModel(nn.Module): 11 | def __init__(self, nOut = 1024, encoder_type='SAP', log_input=True, **kwargs): 12 | super(MainModel, self).__init__(); 13 | 14 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 15 | 16 | self.encoder_type = encoder_type 17 | self.log_input = log_input 18 | 19 | self.netcnn = nn.Sequential( 20 | nn.Conv2d(1, 96, kernel_size=(5,7), stride=(1,2), padding=(2,2)), 21 | nn.BatchNorm2d(96), 22 | nn.ReLU(inplace=True), 23 | nn.MaxPool2d(kernel_size=(1,3), stride=(1,2)), 24 | 25 | nn.Conv2d(96, 256, kernel_size=(5,5), stride=(2,2), padding=(1,1)), 26 | nn.BatchNorm2d(256), 27 | nn.ReLU(inplace=True), 28 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), 29 | 30 | nn.Conv2d(256, 384, kernel_size=(3,3), padding=(1,1)), 31 | nn.BatchNorm2d(384), 32 | nn.ReLU(inplace=True), 33 | 34 | nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), 35 | nn.BatchNorm2d(256), 36 | nn.ReLU(inplace=True), 37 | 38 | nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), 39 | nn.BatchNorm2d(256), 40 | nn.ReLU(inplace=True), 41 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), 42 | 43 | nn.Conv2d(256, 512, kernel_size=(4,1), padding=(0,0)), 44 | nn.BatchNorm2d(512), 45 | nn.ReLU(inplace=True), 46 | 47 | ); 48 | 49 | if self.encoder_type == "MAX": 50 | self.encoder = nn.AdaptiveMaxPool2d((1,1)) 51 | out_dim = 512 52 | elif self.encoder_type == "TAP": 53 | self.encoder = nn.AdaptiveAvgPool2d((1,1)) 54 | out_dim = 512 55 | elif self.encoder_type == "SAP": 56 | self.sap_linear = nn.Linear(512, 512) 57 | self.attention = self.new_parameter(512, 1) 58 | out_dim = 512 59 | else: 60 | raise ValueError('Undefined encoder') 61 | 62 | self.fc = nn.Linear(out_dim, nOut) 63 | 64 | self.instancenorm = nn.InstanceNorm1d(40) 65 | self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=40) 66 | 67 | def new_parameter(self, *size): 68 | out = nn.Parameter(torch.FloatTensor(*size)) 69 | nn.init.xavier_normal_(out) 70 | return out 71 | 72 | def forward(self, x): 73 | 74 | with torch.no_grad(): 75 | with torch.cuda.amp.autocast(enabled=False): 76 | x = self.torchfb(x)+1e-6 77 | if self.log_input: x = x.log() 78 | x = self.instancenorm(x).unsqueeze(1) 79 | 80 | x = self.netcnn(x); 81 | 82 | if self.encoder_type == "MAX" or self.encoder_type == "TAP": 83 | x = self.encoder(x) 84 | x = x.view((x.size()[0], -1)) 85 | 86 | elif self.encoder_type == "SAP": 87 | x = x.permute(0, 2, 1, 3) 88 | x = x.squeeze(dim=1).permute(0, 2, 1) # batch * L * D 89 | h = torch.tanh(self.sap_linear(x)) 90 | w = torch.matmul(h, self.attention).squeeze(dim=2) 91 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 92 | x = torch.sum(x * w, dim=1) 93 | 94 | x = self.fc(x); 95 | 96 | return x; 97 | 98 | -------------------------------------------------------------------------------- /models/clova/models/byol.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from .ssl_singer_identity.singer_identity import load_model 4 | 5 | 6 | def MainModel(**kwargs): 7 | 8 | model = load_model("byol", torchscript=True) 9 | model.train() 10 | return model 11 | -------------------------------------------------------------------------------- 
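Note: the CLOVA speaker encoders above (RawNet3, ResNetSE34L, ResNetSE34V2, VGGVox, and the byol wrapper) all expose a MainModel factory that maps a batch of raw waveforms to fixed-size speaker embeddings; SpeakerNet.py simply instantiates one of them by name and forwards reshaped audio through it. The snippet below is a minimal usage sketch, not part of the repository: the hyper-parameter values for nOut, encoder_type and sinc_stride are illustrative assumptions only, and it presumes asteroid_filterbanks is installed and the repository root is on PYTHONPATH.

import torch
from models.clova.models.RawNet3 import MainModel

# Illustrative hyper-parameters (assumed, not taken from the repo configs).
# MainModel fixes log_sinc/norm_sinc/out_bn itself; the caller supplies the
# embedding size (nOut), the pooling head ("ECA" or "ASP") and the sinc stride.
encoder = MainModel(nOut=256, encoder_type="ECA", sinc_stride=10)
encoder.eval()

wav = torch.randn(2, 16000)      # batch of two 1-second, 16 kHz waveforms
with torch.no_grad():
    emb = encoder(wav)           # -> torch.Size([2, 256]) speaker embeddings

During training, SpeakerNet wraps a module like this, moves each batch to the GPU before the forward pass, and restores pretrained weights through loadParameters().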
/models/clova/models/ssl_singer_identity/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bernardo Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/byol.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/full_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/full_diagram.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/isolated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/isolated.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/pipeline.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/techniques_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/techniques_.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import losses 2 | 3 | from .model import load_model 4 | # from . import model 5 | # from . import trainer 6 | # from . import utils 7 | # from .data import siamese_encoders 8 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/callbacks/ma_updates.py: -------------------------------------------------------------------------------- 1 | from math import cos, pi 2 | from typing import Optional, Sequence 3 | 4 | import torch 5 | from pytorch_lightning import Callback, LightningModule, Trainer 6 | 7 | 8 | class MAWeightUpdate(Callback): 9 | """Weight update rule from BYOL. 10 | Your model should have: 11 | - ``self.online_network`` 12 | - ``self.target_network`` 13 | Updates the target_network params using an exponential moving average update rule weighted by tau. 14 | BYOL claims this keeps the online_network from collapsing. 15 | .. note:: Automatically increases tau from ``initial_tau`` to 1.0 with every training step 16 | Example:: 17 | # model must have 2 attributes 18 | model = Model() 19 | model.online_network = ... 20 | model.target_network = ... 
21 | trainer = Trainer(callbacks=[MAWeightUpdate()]) 22 | """ 23 | 24 | def __init__(self, initial_tau: float = 0.996, max_epochs=100, should_update: bool = True): 25 | """ 26 | Args: 27 | initial_tau: starting tau. Auto-updates with every training step 28 | """ 29 | super().__init__() 30 | self.initial_tau = initial_tau 31 | self.max_epochs = max_epochs 32 | self.should_update = should_update 33 | 34 | self.current_tau = initial_tau 35 | 36 | def on_train_batch_end( 37 | self, 38 | trainer: Trainer, 39 | pl_module: LightningModule, 40 | outputs: Sequence, 41 | batch: Sequence, 42 | batch_idx: int, 43 | unused: Optional[int] = 0 44 | ) -> None: 45 | # get networks 46 | student_network = pl_module.student_network 47 | teacher_network = pl_module.teacher_network 48 | 49 | # update weights 50 | self.update_weights(student_network, teacher_network) 51 | 52 | # log tau 53 | pl_module.log("hparams/MA rate", self.current_tau, prog_bar=False, logger=True) 54 | 55 | # update tau after 56 | if self.should_update: 57 | self.current_tau = self.update_tau(pl_module, trainer) 58 | 59 | def update_tau(self, pl_module: LightningModule, trainer: Trainer) -> float: 60 | max_steps = len(trainer.train_dataloader) * self.max_epochs 61 | tau = 1 - (1 - self.initial_tau) * (cos(pi * pl_module.global_step / max_steps) + 1) / 2 62 | return tau 63 | 64 | def update_weights( 65 | self, 66 | student_network: torch.nn.Module, 67 | teacher_network: torch.nn.Module 68 | ) -> None: 69 | # apply MA weight update 70 | for (name, student_p), (_, teacher_p) in zip( 71 | student_network.named_parameters(), 72 | teacher_network.named_parameters(), 73 | ): 74 | teacher_p.data = self.current_tau * teacher_p.data + (1 - self.current_tau) * student_p.data 75 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from singer_identity.utils.core import similarity, roll 4 | 5 | 6 | def std_batch(x, var=1, eps=1e-8): 7 | std = torch.sqrt(x.var(dim=0) + eps) 8 | return torch.mean(F.relu(var - std)) 9 | 10 | 11 | def variance_hinge_reg(x, y, var=1): 12 | # From https://github.com/facebookresearch/vicreg 13 | std_x = std_batch(x, var=var) 14 | std_y = std_batch(y, var=var) 15 | std_loss = std_x / 2 + std_y / 2 16 | return std_loss 17 | 18 | 19 | def covariance(x): 20 | # In official implementation they do mean over batch (to verify) 21 | # mean = x.mean(1, keepdims=True) 22 | mean = x.mean(dim=0) 23 | x = x - mean 24 | cov = torch.matmul(x.transpose(0, 1), x) / (x.shape[0] - 1) 25 | # cov = (x.T @ x) / (x.shape[0] - 1) 26 | return cov 27 | 28 | 29 | def covariance_reg(x, y): 30 | eye = torch.eye(x.shape[1]).to(x.device) 31 | cov_x = covariance(x) 32 | cov_y = covariance(y) 33 | assert cov_x.shape[0] == cov_x.shape[1] 34 | assert cov_y.shape[0] == cov_y.shape[1] 35 | cov_reg = (cov_x * (1 - eye)).pow(2).sum() / x.shape[1] + (cov_y * (1 - eye)).pow( 36 | 2 37 | ).sum() / x.shape[1] 38 | return cov_reg 39 | 40 | 41 | def invariance_loss(x, y): 42 | return F.mse_loss(x, y) 43 | 44 | 45 | def vicreg_loss(x, y, gamma=1, fact_inv_loss=1, fact_var=1, fact_cov=1): 46 | # Adapted from https://github.com/facebookresearch/vicreg 47 | repr_loss = invariance_loss(x, y) 48 | std_loss = variance_hinge_reg(x, y, var=gamma) 49 | cov_loss = covariance_reg(x, y) 50 | loss = fact_inv_loss * repr_loss + fact_var * std_loss + 
fact_cov * cov_loss 51 | return loss 52 | 53 | 54 | def compute_norms(*args): 55 | norms = [] 56 | for arg in args: 57 | norms.append(torch.sqrt((arg**2).sum(1))) 58 | return norms 59 | 60 | 61 | def align_loss(x, y, alpha=2): 62 | # From https://github.com/SsnL/align_uniform 63 | return (x - y).norm(p=2, dim=1).pow(alpha).mean() 64 | 65 | 66 | def uniform_loss(x, t=2): 67 | # From https://github.com/SsnL/align_uniform 68 | return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log() 69 | 70 | 71 | def contrastive_loss(z1, z2, temp=0.2, nr_negative=1, decouple=False): 72 | cost_pos = similarity(z1, z2, temp) # Positive samples 73 | cost_neg = [] 74 | 75 | n_rolls = min(z1.shape[0] - 1, nr_negative) # Number of negative samples 76 | curr_neg_z = z2 77 | 78 | for i in range(n_rolls): 79 | curr_neg_z = roll(curr_neg_z) # Shifts batch 80 | cost_neg.append(similarity(z1, curr_neg_z, temp)) # Negative sim. 81 | 82 | if not decouple: 83 | cost_neg.append(cost_pos) # Adds positive similarity in denominator 84 | 85 | cost_neg = torch.stack(cost_neg).transpose(1, 0) 86 | cost = (-cost_pos + torch.logsumexp(cost_neg, 1)).mean() 87 | # TODO: implement similarities with less operations, but this works 88 | ratio = torch.mean(cost_neg) / ( 89 | torch.mean(cost_pos) + torch.tensor(1e-6).type_as(z1) 90 | ) 91 | return cost, ratio.item() 92 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/models/network_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Union, Callable, List, Optional 4 | from torchvision.models import efficientnet_b0, efficientnet_b4 5 | import torchvision.transforms as vt 6 | 7 | 8 | def get_vision_backbone( 9 | vismod="efficientnet_b0", num_classes=1000, pretrained=False, **kwargs 10 | ): 11 | if vismod == "efficientnet_b0": 12 | return efficientnet_b0(pretrained=pretrained, num_classes=num_classes, **kwargs) 13 | elif vismod == "efficientnet_b4": 14 | return efficientnet_b4(pretrained=pretrained, num_classes=num_classes, **kwargs) 15 | 16 | else: 17 | raise NotImplementedError 18 | 19 | 20 | class Grey2Rgb(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.normalize = vt.Normalize( 24 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 25 | ) 26 | 27 | def forward(self, data): 28 | batch_size, freq_bins, times = data.shape 29 | data /= data.max() 30 | data = data.unsqueeze(1).expand(batch_size, 3, freq_bins, times) 31 | data = self.normalize(data) 32 | return data 33 | 34 | 35 | class LogScale(nn.Module): 36 | def forward(self, data): 37 | # eps = 1e-8 38 | eps = torch.tensor(1e-8, device=data.device) 39 | return torch.log(data + eps) 40 | 41 | 42 | class Aggregator(nn.Module): 43 | """Aggregates (in time) a list of features""" 44 | 45 | def __init__(self): 46 | super().__init__() 47 | self.aggregation = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(1)) 48 | 49 | def forward(self, features): 50 | """ 51 | Returns: 52 | outputs_feature: torch.Tensor of shape(B x C x t) 53 | """ 54 | if isinstance(features, list): 55 | output_feature = [self.aggregation(feature) for feature in features] 56 | else: 57 | output_feature = self.aggregation(features) 58 | return output_feature 59 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/README.md: 
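As a usage note for the loss functions defined in `losses.py` above: the VICReg, decoupled-contrastive, and uniformity-alignment objectives can be exercised standalone on dummy embeddings. The sketch below is illustrative only; it assumes the `ssl_singer_identity` package (and its `singer_identity.utils.core` helpers, not shown here) is importable as `singer_identity`, and the weighting factors simply mirror the values that appear later in `vicreg.yaml` and `contrastive.yaml`.

```python
# Hedged usage sketch for the loss functions defined in losses.py above.
# Assumes the ssl_singer_identity package is importable as `singer_identity`
# (as in the file's own imports); shapes and values are illustrative only.
import torch
from singer_identity.losses import vicreg_loss, contrastive_loss, align_loss, uniform_loss

batch, dim = 8, 128
z1 = torch.nn.functional.normalize(torch.randn(batch, dim), dim=1)  # projections of view 1
z2 = torch.nn.functional.normalize(torch.randn(batch, dim), dim=1)  # projections of view 2

# VICReg: invariance + variance hinge + covariance terms, weighted as in vicreg.yaml
loss_vicreg = vicreg_loss(z1, z2, gamma=1, fact_inv_loss=25, fact_var=25, fact_cov=100)

# Decoupled contrastive loss (positive pair z1[i]/z2[i], negatives from a rolled batch),
# matching the contrastive.yaml hyperparameters; also returns a neg/pos similarity ratio
loss_nce, neg_pos_ratio = contrastive_loss(z1, z2, temp=0.2, nr_negative=250, decouple=True)

# Alignment + uniformity objective used by uniformity-alignment.yaml
loss_ua = align_loss(z1, z2, alpha=2) + uniform_loss(z1, t=2)

print(loss_vicreg.item(), loss_nce.item(), neg_pos_ratio, loss_ua.item())
```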
-------------------------------------------------------------------------------- 1 | # Configuration File for Training 2 | 3 | You can use a configuration file to train a model using the `train.py` script. Here we provide a description of how to set up the config file. The common options are described in the [common config](common.yaml) file. 4 | 5 | 6 | ```bash 7 | python train.py --config path/to/common.yaml --config path/to/model_config.yaml 8 | ``` 9 | The model-specific options are described below. In the example above, `model_config.yaml` will overwrite the options in `common.yaml` when options are repeated. For more details, check the [Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html#lightning-cli) docs. 10 | 11 | ## 1. Model-specific options 12 | To use the contrastive, VICReg, or uniformity-alignment losses, simply change the loss arguments in the config file. Below is an example for the contrastive loss: 13 | 14 | ```yaml 15 | use_contrastive_loss: true # use contrastive loss 16 | temp: 0.2 # temperature for contrastive loss 17 | nr_negative: 250 # number of negative samples for contrastive loss 18 | decouple: true # use decoupled contrastive loss instead of the regular NT-Xent loss 19 | use_covariance_reg: false # use covariance regularization 20 | use_variance_reg: false # use variance regularization 21 | use_vicreg_loss: false # use vicreg loss 22 | use_align_loss: false # use alignment loss 23 | use_uniform_loss: false # use uniformity loss 24 | ``` 25 | The individual weights for the losses can be specified as well. BYOL training has its own dedicated trainer class and needs to be specified as shown in `byol.yaml`. 26 | 27 | We provide the following configs for the models used in the paper: 28 | 29 | - `byol.yaml` 30 | - `contrastive.yaml` 31 | - `contrastive-vc.yaml` 32 | - `uniformity-alignment.yaml` 33 | - `vicreg.yaml` 34 | 35 | 36 | ## 2. Data Options 37 | In the config file used to launch training (`common.yaml` in this example), specify the datasets to use as follows: 38 | 39 | ```yaml 40 | data: 41 | class_path: singer_id.data.siamese_encoders.SiameseEncodersDataModule # the default dataloader class 42 | init_args: 43 | dataset_dirs: 44 | - '/Path/to/dataset1/dataset1_name' 45 | - '/Path/to/dataset2/dataset2_name' 46 | batch_size: # batch size for training 47 | batch_size_val: # batch size for validation 48 | nr_samples: # number of samples to use for training (default: 176000, i.e. 4 seconds of audio at 44.1 kHz) 49 | normalize: # normalize the audio when loading 50 | num_workers: # number of workers for the dataloader 51 | batch_sampling_mode: # "sample_clips" or "sample groups". Use "sample_clips" for self-supervised COLA loading 52 | eval_frac: # fraction of the dataset to use for validation 53 | group_name_is_folder: 54 | group_by_artist: 55 | multi_epoch: # number of epochs to repeat the dataset to simulate a larger dataset 56 | ``` 57 | 58 | ## 3. Augmentation Options 59 | 60 | The following augmentations are available. We use [Audiomentations](https://github.com/iver56/audiomentations) and [Parselmouth](https://github.com/YannickJadoul/Parselmouth) to perform the augmentations. All fields specify the probability of applying the corresponding augmentation, except for `pitch_shift_parselmouth` and `pitch_range_parselmouth`, which set the shift and range values; see the sketch below, followed by the full list of fields.
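Before the full YAML block, here is a minimal, library-agnostic sketch of how such a probability-keyed config could be consumed. This is not the project's actual augmentation code (which uses Audiomentations and Parselmouth); the transform helpers are hypothetical placeholders, with their ranges taken from the comments in the YAML below.

```python
# Hypothetical sketch of consuming a probability-keyed augmentation config like the
# YAML block below. The real pipeline uses Audiomentations/Parselmouth; the transform
# functions here are placeholders for illustration only.
import random
import numpy as np

def apply_gain(wav: np.ndarray) -> np.ndarray:            # placeholder transform
    return wav * 10 ** (random.uniform(-6, 0) / 20)       # random gain in [-6, 0] dB

def add_gaussian_noise(wav: np.ndarray) -> np.ndarray:    # placeholder transform
    return wav + np.random.normal(0, random.uniform(0.001, 0.05), size=wav.shape)

TRANSFORMS = {"gain": apply_gain, "gaussian_noise": add_gaussian_noise}

def augment(wav: np.ndarray, cfg: dict) -> np.ndarray:
    """Apply each configured transform with its probability, as described above."""
    if not cfg.get("enable", False):
        return wav
    for name, fn in TRANSFORMS.items():
        prob = cfg.get(name, 0)           # most fields are probabilities in [0, 1]
        if random.random() < prob:
            wav = fn(wav)
    return wav

augmented = augment(np.random.randn(44100).astype(np.float32),
                    {"enable": True, "gain": 0.5, "gaussian_noise": 0.5})
```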
61 | 62 | ```yaml 63 | augmentations: 64 | "enable": true 65 | "gaussian_noise": 0.5 # min_amplitude=0.001, max_amplitude=0.05 66 | "pitch_shift_naive": 0 # naive pitch shift (using librosa), not used in the paper 67 | "time_stretch": 0 # time stretch, not used in the paper 68 | "gain": 0.5 # min_gain_in_db=-6, max_gain_in_db=0 69 | "shift": 0 # not used in the paper 70 | "parametric_eq": 0 # not used in the paper 71 | "tanh_distortion": 0 # not used in the paper 72 | "time_mask": 0.5 # max_band_part=1/8 73 | "formant_shift_parselmouth": 0 # not used in the paper 74 | "pitch_shift_parselmouth": [1, 1.3] # Pitch shift value on parselmouth 75 | "pitch_range_parselmouth": 1.5 # Pitch range value on parselmouth 76 | "pitch_shift_parselmouth_prob": 0.5 77 | ``` 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/byol.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer_byol.BYOL 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Predictor ------------------ 20 | predictor: 21 | dims: 22 | - 128 23 | - 1024 24 | - 128 25 | use_batchnorm: true 26 | normalize_projections: true 27 | weight_callback: 28 | class_path: singer_identity.callbacks.ma_updates.MAWeightUpdate 29 | init_args: 30 | initial_tau: 0.99 31 | max_epochs: 1000 32 | # ------------------ Optimizer ------------------ 33 | optimizer: 34 | class_path: singer_identity.models.byol.Adam 35 | init_args: 36 | lr: 3e-5 37 | weight_decay: 1.5e-6 38 | scheduler: 39 | class_path: singer_identity.models.byol.LinearWarmupCosineAnnealing 40 | init_args: 41 | warmup_epochs: 10 42 | max_epochs: 1000 43 | 44 | trainer: 45 | # ------------------ Logger ------------------ 46 | logger: 47 | class_path: pytorch_lightning.loggers.TensorBoardLogger 48 | init_args: 49 | save_dir: "logs" 50 | name: "byol" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive-vc.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: true 25 | fact_cov: 100 26 | use_variance_reg: true 27 | fact_var: 25 28 | use_invariance_loss: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | # 
------------------ Optimizer ------------------ 33 | optimizer1_init: 34 | class_path: torch.optim.Adam 35 | init_args: 36 | lr: 0.0001 37 | weight_decay: 1e-5 38 | trainer: 39 | # ------------------ Logger ------------------ 40 | logger: 41 | class_path: pytorch_lightning.loggers.TensorBoardLogger 42 | init_args: 43 | save_dir: "logs" 44 | name: "contrastive-vc" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: false 25 | use_variance_reg: false 26 | use_vicreg_loss: false 27 | use_align_loss: false 28 | use_uniform_loss: false 29 | # ------------------ Optimizer ------------------ 30 | optimizer1_init: 31 | class_path: torch.optim.Adam 32 | init_args: 33 | lr: 0.0001 34 | weight_decay: 1e-5 35 | trainer: 36 | # ------------------ Logger ------------------ 37 | logger: 38 | class_path: pytorch_lightning.loggers.TensorBoardLogger 39 | init_args: 40 | save_dir: "logs" 41 | name: "contrastive" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | 10 | # ------------------ Encoder ------------------ 11 | backbone: 12 | backbone: "efficientnet_b0" 13 | pretrained: true 14 | embedding_dim: 1000 15 | 16 | # ------------------ Projection ------------------ 17 | projection: 18 | input_dim: 1000 19 | output_dim: 128 20 | l2_normalize: true 21 | 22 | # ------------------ Training hyperparameters ------------------ 23 | use_contrastive_loss: true 24 | temp: 0.2 25 | nr_negative: 250 26 | decouple: true 27 | use_covariance_reg: false 28 | use_variance_reg: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | 40 | trainer: 41 | # ------------------ Logger ------------------ 42 | logger: 43 | class_path: pytorch_lightning.loggers.TensorBoardLogger 44 | init_args: 45 | save_dir: "logs" 46 | name: "contrastive" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/uniformity-alignment.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | 
class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | use_covariance_reg: false 22 | use_variance_reg: false 23 | use_invariance_loss: false 24 | use_align_loss: true 25 | fact_align_loss: 1 26 | use_uniform_loss: true 27 | fact_unif_loss: 1 28 | # ------------------ Optimizer ------------------ 29 | optimizer1_init: 30 | class_path: torch.optim.Adam 31 | init_args: 32 | lr: 0.0001 33 | weight_decay: 1e-5 34 | trainer: 35 | # ------------------ Logger ------------------ 36 | logger: 37 | class_path: pytorch_lightning.loggers.TensorBoardLogger 38 | init_args: 39 | save_dir: "logs" 40 | name: "uniformity-alignment" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/vicreg.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | # temp: 0.2 22 | # nr_negative: 250 23 | # decouple: true 24 | use_invariance_loss: true 25 | fact_inv_loss: 25 26 | use_covariance_reg: true 27 | fact_cov: 100 28 | use_variance_reg: true 29 | fact_var: 25 30 | gamma: 1 31 | use_align_loss: false 32 | use_uniform_loss: false 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | trainer: 40 | # ------------------ Logger ------------------ 41 | logger: 42 | class_path: pytorch_lightning.loggers.TensorBoardLogger 43 | init_args: 44 | save_dir: "logs" 45 | name: "vicreg" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/trainer_byol.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | 3 | import torch 4 | import torch.nn as nn 5 | from singer_identity.models.byol import TeacherStudentModel, Optimizer, Scheduler 6 | from singer_identity.model import IdentityEncoder, Projection, SiameseArm, MLP 7 | 8 | import copy 9 | 10 | class BYOL(TeacherStudentModel): 11 | def __init__( 12 | self, 13 | # module: nn.Module, 14 | weight_callback, 15 | optimizer: Optimizer, 16 | backbone: dict = {}, 17 | projection: dict = {}, 18 | predictor: dict = {}, 19 | feature_extractor: dict = {}, 20 | loss_fn: nn.Module = torch.nn.MSELoss(), 21 | scheduler: Optional[Scheduler] = None, 22 | 
normalize_projections: bool = False, 23 | normalize_representations: bool = False, 24 | ): 25 | encoder = IdentityEncoder(feature_extractor=feature_extractor, encoder=backbone) 26 | projection_layer = Projection(**projection) 27 | predictor_layer = MLP(**copy.deepcopy(predictor)) 28 | module = SiameseArm( 29 | encoder=encoder, 30 | projector=projection_layer, 31 | predictor=predictor_layer, 32 | normalize_projections=normalize_projections, 33 | normalize_representations=normalize_representations, 34 | ) 35 | 36 | super(BYOL, self).__init__( 37 | module, loss_fn, weight_callback, optimizer, scheduler=scheduler 38 | ) 39 | self.save_hyperparameters(ignore=["module", "loss_fn"]) 40 | 41 | def shared_step(self, batch, step_name: str): 42 | x1 = batch["clip1"] 43 | x2 = batch["clip2"] 44 | 45 | batch_size = x1.shape[0] 46 | 47 | ys, zs, qs = self.student_network(x1) 48 | with torch.no_grad(): 49 | yt, zt, qt = self.teacher_network(x2) 50 | loss_12 = self.loss_fn(qs, zt) 51 | 52 | ys, zs, qs = self.student_network(x2) 53 | with torch.no_grad(): 54 | yt, zt, qt = self.teacher_network(x1) 55 | loss_21 = self.loss_fn(qs, zt) 56 | 57 | loss = (loss_12 + loss_21) / 2 58 | 59 | self.log( 60 | f"loss/{step_name}", 61 | loss, 62 | prog_bar=True, 63 | batch_size=batch_size, 64 | ) 65 | 66 | self.record_variables(y1=ys, z1=zs, y2=yt, z2=zt) 67 | 68 | return loss 69 | 70 | def training_step(self, batch, batch_idx): 71 | return self.shared_step(batch, "train") 72 | 73 | def validation_step(self, batch, batch_idx): 74 | return self.shared_step(batch, "val") 75 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from pytorch_lightning.cli import LightningCLI 4 | 5 | 6 | class CLI(LightningCLI): 7 | def add_arguments_to_parser(self, parser): 8 | parser.add_argument("--ckpt_path", default=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | cli = CLI( 13 | model_class=pl.LightningModule, 14 | datamodule_class=pl.LightningDataModule, 15 | subclass_mode_model=True, 16 | subclass_mode_data=True, 17 | save_config_kwargs={"overwrite": True}, 18 | run=False, 19 | ) 20 | 21 | ckpt_path = cli.config["ckpt_path"] 22 | 23 | if ckpt_path is not None: 24 | step = torch.load(ckpt_path, map_location="cpu")["global_step"] 25 | cli.trainer.fit_loop.epoch_loop._batches_that_stepped = step 26 | 27 | cli.trainer.fit(cli.model, cli.datamodule, ckpt_path=ckpt_path) 28 | -------------------------------------------------------------------------------- /models/clova/models/weights/RawNet3/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs 
-text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /models/clova/models/weights/RawNet3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: mit 3 | --- 4 | 5 | 6 | 7 | 8 | 9 | --- 10 | thumbnail: Refer to https://github.com/jungjee/RawNet for full documentation 11 | 12 | tags: 13 | - Speaker recognition 14 | - Speaker verification 15 | - RawNet 16 | - RawNet3 17 | 18 | license: "mit" 19 | 20 | datasets: 21 | - VoxCeleb1 22 | - VoxCeleb2 23 | 24 | metrics: 25 | - EER 0.89% on Vox1-O 26 | - minDCF 0.0659 on Vox1-O 27 | --- 28 | -------------------------------------------------------------------------------- /models/f0_predictor/CrepeF0Predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from models.f0_predictor.crepe import CrepePitchExtractor 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class CrepeF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"): 9 | self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model) 10 | self.hop_length = hop_length 11 | self.f0_min = f0_min 12 | self.f0_max = f0_max 13 | self.device = device 14 | self.threshold = threshold 15 | self.sampling_rate = sampling_rate 16 | self.name = "crepe" 17 | 18 | def compute_f0(self,wav,p_len=None): 19 | x = torch.FloatTensor(wav).to(self.device) 20 | if p_len is None: 21 | p_len = x.shape[0]//self.hop_length 22 | else: 23 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 24 | f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len) 25 | return f0 26 | 27 | def compute_f0_uv(self,wav,p_len=None): 28 | x = torch.FloatTensor(wav).to(self.device) 29 | if p_len is None: 30 | p_len = x.shape[0]//self.hop_length 31 | else: 32 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 33 | f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len) 34 | return f0,uv -------------------------------------------------------------------------------- /models/f0_predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "dio" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = 
f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | 39 | def resize_f0(self,x, target_len): 40 | source = np.array(x) 41 | source[source<0.001] = np.nan 42 | target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) 43 | res = np.nan_to_num(target) 44 | return res 45 | 46 | def compute_f0(self,wav,p_len=None): 47 | if p_len is None: 48 | p_len = wav.shape[0]//self.hop_length 49 | f0, t = pyworld.dio( 50 | wav.astype(np.double), 51 | fs=self.sampling_rate, 52 | f0_floor=self.f0_min, 53 | f0_ceil=self.f0_max, 54 | frame_period=1000 * self.hop_length / self.sampling_rate, 55 | ) 56 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 57 | for index, pitch in enumerate(f0): 58 | f0[index] = round(pitch, 1) 59 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 60 | 61 | def compute_f0_uv(self,wav,p_len=None): 62 | if p_len is None: 63 | p_len = wav.shape[0]//self.hop_length 64 | f0, t = pyworld.dio( 65 | wav.astype(np.double), 66 | fs=self.sampling_rate, 67 | f0_floor=self.f0_min, 68 | f0_ceil=self.f0_max, 69 | frame_period=1000 * self.hop_length / self.sampling_rate, 70 | ) 71 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 72 | for index, pitch in enumerate(f0): 73 | f0[index] = round(pitch, 1) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 75 | -------------------------------------------------------------------------------- /models/f0_predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self,wav,p_len): 3 | ''' 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | ''' 8 | pass 9 | 10 | def compute_f0_uv(self,wav,p_len): 11 | ''' 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | ''' 16 | pass -------------------------------------------------------------------------------- /models/f0_predictor/FCPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from models.f0_predictor.F0Predictor import F0Predictor 8 | 9 | from .fcpe.model import FCPEInfer 10 | 11 | 12 | class FCPEF0Predictor(F0Predictor): 13 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100, 14 | threshold=0.05): 15 | self.fcpe = FCPEInfer(model_path="pretrain/fcpe.pt", device=device, dtype=dtype) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "fcpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, 
mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # 去掉0频率, 并线性插值 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] 74 | 75 | if f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[ 79 | 0]).cpu().numpy(), vuv_vector.cpu().numpy() 80 | 81 | # 大概可以用 torch 重写? 82 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 83 | # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 84 | 85 | return f0, vuv_vector.cpu().numpy() 86 | 87 | def compute_f0(self, wav, p_len=None): 88 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 89 | if p_len is None: 90 | p_len = x.shape[0] // self.hop_length 91 | else: 92 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 93 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 94 | if torch.all(f0 == 0): 95 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 96 | return rtn, rtn 97 | return self.post_process(x, self.sampling_rate, f0, p_len)[0] 98 | 99 | def compute_f0_uv(self, wav, p_len=None): 100 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 101 | if p_len is None: 102 | p_len = x.shape[0] // self.hop_length 103 | else: 104 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 105 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 106 | if torch.all(f0 == 0): 107 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 108 | return rtn, rtn 109 | return self.post_process(x, self.sampling_rate, f0, p_len) -------------------------------------------------------------------------------- /models/f0_predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "harvest" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = 
np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | def resize_f0(self,x, target_len): 39 | source = np.array(x) 40 | source[source<0.001] = np.nan 41 | target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) 42 | res = np.nan_to_num(target) 43 | return res 44 | 45 | def compute_f0(self,wav,p_len=None): 46 | if p_len is None: 47 | p_len = wav.shape[0]//self.hop_length 48 | f0, t = pyworld.harvest( 49 | wav.astype(np.double), 50 | fs=self.hop_length, 51 | f0_ceil=self.f0_max, 52 | f0_floor=self.f0_min, 53 | frame_period=1000 * self.hop_length / self.sampling_rate, 54 | ) 55 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) 56 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 57 | 58 | def compute_f0_uv(self,wav,p_len=None): 59 | if p_len is None: 60 | p_len = wav.shape[0]//self.hop_length 61 | f0, t = pyworld.harvest( 62 | wav.astype(np.double), 63 | fs=self.sampling_rate, 64 | f0_floor=self.f0_min, 65 | f0_ceil=self.f0_max, 66 | frame_period=1000 * self.hop_length / self.sampling_rate, 67 | ) 68 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 69 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 70 | -------------------------------------------------------------------------------- /models/f0_predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "pm" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | 39 | 40 | def compute_f0(self,wav,p_len=None): 41 | x = wav 42 | if p_len is None: 43 | p_len = x.shape[0]//self.hop_length 44 | else: 45 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 46 | time_step = self.hop_length / self.sampling_rate * 1000 47 | f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac( 48 | time_step=time_step / 
1000, voicing_threshold=0.6, 49 | pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency'] 50 | 51 | pad_size=(p_len - len(f0) + 1) // 2 52 | if(pad_size>0 or p_len - len(f0) - pad_size>0): 53 | f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') 54 | f0,uv = self.interpolate_f0(f0) 55 | return f0 56 | 57 | def compute_f0_uv(self,wav,p_len=None): 58 | x = wav 59 | if p_len is None: 60 | p_len = x.shape[0]//self.hop_length 61 | else: 62 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 63 | time_step = self.hop_length / self.sampling_rate * 1000 64 | f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac( 65 | time_step=time_step / 1000, voicing_threshold=0.6, 66 | pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency'] 67 | 68 | pad_size=(p_len - len(f0) + 1) // 2 69 | if(pad_size>0 or p_len - len(f0) - pad_size>0): 70 | f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') 71 | f0,uv = self.interpolate_f0(f0) 72 | return f0,uv 73 | -------------------------------------------------------------------------------- /models/f0_predictor/RMVPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import os 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from models.f0_predictor.F0Predictor import F0Predictor 8 | 9 | from .rmvpe import RMVPE 10 | 11 | 12 | class RMVPEF0Predictor(F0Predictor): 13 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100, dtype=torch.float32, device=None,sampling_rate=44100,threshold=0.05): 14 | ckpt_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ckpt/rmvpe.pt") 15 | self.rmvpe = RMVPE(model_path=ckpt_filepath, dtype=dtype, device=device) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "rmvpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # 去掉0频率, 并线性插值 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None,None,:],size=pad_to)[0][0] 74 | 75 | if 
f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]).cpu().numpy() ,vuv_vector.cpu().numpy() 79 | 80 | # 大概可以用 torch 重写? 81 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 82 | #vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 83 | 84 | return f0,vuv_vector.cpu().numpy() 85 | 86 | def compute_f0(self,wav,p_len=None): 87 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 88 | if p_len is None: 89 | p_len = x.shape[0]//self.hop_length 90 | else: 91 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 92 | f0 = self.rmvpe.infer_from_audio(x,self.sampling_rate,self.threshold) 93 | if torch.all(f0 == 0): 94 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 95 | return rtn,rtn 96 | return self.post_process(x,self.sampling_rate,f0,p_len)[0] 97 | 98 | def compute_f0_uv(self,wav,p_len=None): 99 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 100 | if p_len is None: 101 | p_len = x.shape[0]//self.hop_length 102 | else: 103 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 104 | f0 = self.rmvpe.infer_from_audio(x,self.sampling_rate,self.threshold) 105 | if torch.all(f0 == 0): 106 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 107 | return rtn,rtn 108 | return self.post_process(x,self.sampling_rate,f0,p_len) 109 | -------------------------------------------------------------------------------- /models/f0_predictor/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 3 | if f0_predictor == "pm": 4 | from models.f0_predictor.PMF0Predictor import PMF0Predictor 5 | f0_predictor_object = PMF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 6 | 7 | elif f0_predictor == "crepe": 8 | from models.f0_predictor.CrepeF0Predictor import CrepeF0Predictor 9 | f0_predictor_object = CrepeF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, device=kargs["device"],threshold=kargs["threshold"]) 10 | 11 | elif f0_predictor == "harvest": 12 | from models.f0_predictor.HarvestF0Predictor import HarvestF0Predictor 13 | f0_predictor_object = HarvestF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 14 | 15 | elif f0_predictor == "dio": 16 | from models.f0_predictor.DioF0Predictor import DioF0Predictor 17 | f0_predictor_object = DioF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 18 | 19 | elif f0_predictor == "rmvpe": 20 | from models.f0_predictor.RMVPEF0Predictor import RMVPEF0Predictor 21 | f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"]) 22 | 23 | elif f0_predictor == "fcpe": 24 | from models.f0_predictor.FCPEF0Predictor import FCPEF0Predictor 25 | f0_predictor_object = FCPEF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"]) 26 | 27 | else: 28 | raise Exception("Unknown f0 predictor") 29 | return f0_predictor_object 30 | -------------------------------------------------------------------------------- /models/f0_predictor/fcpe/__init__.py: -------------------------------------------------------------------------------- 1 | 
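The `get_f0_predictor` factory above dispatches on a predictor name and forwards the shared `hop_length`/`sampling_rate` arguments. A hedged usage sketch follows; it assumes the repo root is on `PYTHONPATH` and the relevant dependencies (e.g. `pyworld` for "dio") are installed. As the factory shows, "crepe", "rmvpe", and "fcpe" additionally require `device`/`threshold` kwargs plus their pretrained checkpoints.

```python
# Hedged usage sketch for the get_f0_predictor factory above.
# Assumes the repo root is on PYTHONPATH and pyworld is installed ("dio" backend).
import numpy as np
from models.f0_predictor import get_f0_predictor

predictor = get_f0_predictor("dio", hop_length=512, sampling_rate=44100)

wav = np.random.randn(44100).astype(np.float64)   # 1 s of (dummy) audio
f0 = predictor.compute_f0(wav)                    # shape: [len(wav) // hop_length]
f0_interp, uv = predictor.compute_f0_uv(wav)      # interpolated f0 plus voiced/unvoiced mask
```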
from .model import FCPEInfer # noqa: F401 2 | from .nvSTFT import STFT # noqa: F401 3 | from .pcmer import PCmer # noqa: F401 4 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * # noqa: F403 2 | from .inference import RMVPE # noqa: F401 3 | from .model import E2E, E2E0 # noqa: F401 4 | from .spec import MelSpectrogram # noqa: F401 5 | from .utils import ( # noqa: F401 6 | cycle, 7 | summary, 8 | to_local_average_cents, 9 | to_viterbi_cents, 10 | ) 11 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/constants.py: -------------------------------------------------------------------------------- 1 | SAMPLE_RATE = 16000 2 | 3 | N_CLASS = 360 4 | 5 | N_MELS = 128 6 | MEL_FMIN = 30 7 | MEL_FMAX = SAMPLE_RATE // 2 8 | WINDOW_LENGTH = 1024 9 | CONST = 1997.3794084376191 10 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torchaudio.transforms import Resample 4 | 5 | from .constants import * # noqa: F403 6 | from .model import E2E0 7 | from .spec import MelSpectrogram 8 | from .utils import to_local_average_cents, to_viterbi_cents 9 | 10 | 11 | class RMVPE: 12 | def __init__(self, model_path, device=None, dtype = torch.float32, hop_length=160): 13 | self.resample_kernel = {} 14 | if device is None: 15 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 16 | else: 17 | self.device = device 18 | model = E2E0(4, 1, (2, 2)) 19 | ckpt = torch.load(model_path, map_location=torch.device(self.device)) 20 | model.load_state_dict(ckpt['model']) 21 | model = model.to(dtype).to(self.device) 22 | model.eval() 23 | self.model = model 24 | self.dtype = dtype 25 | self.mel_extractor = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX) # noqa: F405 26 | self.resample_kernel = {} 27 | 28 | def mel2hidden(self, mel): 29 | with torch.no_grad(): 30 | n_frames = mel.shape[-1] 31 | mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant') 32 | hidden = self.model(mel) 33 | return hidden[:, :n_frames] 34 | 35 | def decode(self, hidden, thred=0.03, use_viterbi=False): 36 | if use_viterbi: 37 | cents_pred = to_viterbi_cents(hidden, thred=thred) 38 | else: 39 | cents_pred = to_local_average_cents(hidden, thred=thred) 40 | f0 = torch.Tensor([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]).to(self.device) 41 | return f0 42 | 43 | def infer_from_audio(self, audio, sample_rate=16000, thred=0.05, use_viterbi=False): 44 | audio = audio.unsqueeze(0).to(self.dtype).to(self.device) 45 | if sample_rate == 16000: 46 | audio_res = audio 47 | else: 48 | key_str = str(sample_rate) 49 | if key_str not in self.resample_kernel: 50 | self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128) 51 | self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device) 52 | audio_res = self.resample_kernel[key_str](audio) 53 | mel_extractor = self.mel_extractor.to(self.device) 54 | mel = mel_extractor(audio_res, center=True).to(self.dtype) 55 | hidden = self.mel2hidden(mel) 56 | f0 = self.decode(hidden.squeeze(0), thred=thred, 
use_viterbi=use_viterbi) 57 | return f0 58 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .constants import * # noqa: F403 4 | from .deepunet import DeepUnet, DeepUnet0 5 | from .seq import BiGRU 6 | from .spec import MelSpectrogram 7 | 8 | 9 | class E2E(nn.Module): 10 | def __init__(self, hop_length, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, 11 | en_out_channels=16): 12 | super(E2E, self).__init__() 13 | self.mel = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX) # noqa: F405 14 | self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 15 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 16 | if n_gru: 17 | self.fc = nn.Sequential( 18 | BiGRU(3 * N_MELS, 256, n_gru), # noqa: F405 19 | nn.Linear(512, N_CLASS), # noqa: F405 20 | nn.Dropout(0.25), 21 | nn.Sigmoid() 22 | ) 23 | else: 24 | self.fc = nn.Sequential( 25 | nn.Linear(3 * N_MELS, N_CLASS), # noqa: F405 26 | nn.Dropout(0.25), 27 | nn.Sigmoid() 28 | ) 29 | 30 | def forward(self, x): 31 | mel = self.mel(x.reshape(-1, x.shape[-1])).transpose(-1, -2).unsqueeze(1) 32 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 33 | # x = self.fc(x) 34 | hidden_vec = 0 35 | if len(self.fc) == 4: 36 | for i in range(len(self.fc)): 37 | x = self.fc[i](x) 38 | if i == 0: 39 | hidden_vec = x 40 | return hidden_vec, x 41 | 42 | 43 | class E2E0(nn.Module): 44 | def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, 45 | en_out_channels=16): 46 | super(E2E0, self).__init__() 47 | self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 48 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 49 | if n_gru: 50 | self.fc = nn.Sequential( 51 | BiGRU(3 * N_MELS, 256, n_gru), # noqa: F405 52 | nn.Linear(512, N_CLASS), # noqa: F405 53 | nn.Dropout(0.25), 54 | nn.Sigmoid() 55 | ) 56 | else: 57 | self.fc = nn.Sequential( 58 | nn.Linear(3 * N_MELS, N_CLASS), # noqa: F405 59 | nn.Dropout(0.25), 60 | nn.Sigmoid() 61 | ) 62 | 63 | def forward(self, mel): 64 | mel = mel.transpose(-1, -2).unsqueeze(1) 65 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 66 | x = self.fc(x) 67 | return x 68 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/seq.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class BiGRU(nn.Module): 5 | def __init__(self, input_features, hidden_features, num_layers): 6 | super(BiGRU, self).__init__() 7 | self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 8 | 9 | def forward(self, x): 10 | return self.gru(x)[0] 11 | 12 | 13 | class BiLSTM(nn.Module): 14 | def __init__(self, input_features, hidden_features, num_layers): 15 | super(BiLSTM, self).__init__() 16 | self.lstm = nn.LSTM(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 17 | 18 | def forward(self, x): 19 | return self.lstm(x)[0] 20 | 21 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/spec.py: 
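As a note on the recurrent wrappers in `seq.py` above: `E2E0` feeds `BiGRU(3 * N_MELS, 256, n_gru)` into `nn.Linear(512, N_CLASS)`, so the bidirectional output width (2 directions times 256 hidden units) must be 512. The shape check below is a minimal sketch, assuming the repo root is on `PYTHONPATH` so `models.f0_predictor.rmvpe.seq` is importable.

```python
# Hedged shape check for the BiGRU wrapper defined in seq.py above.
# Sizes mirror E2E0: 3 * N_MELS = 384 input features, hidden size 256,
# so the bidirectional output is 512, matching nn.Linear(512, N_CLASS).
import torch
from models.f0_predictor.rmvpe.seq import BiGRU

batch, frames, feats = 2, 100, 3 * 128    # (B, T, 3 * N_MELS)
gru = BiGRU(input_features=feats, hidden_features=256, num_layers=1)

x = torch.randn(batch, frames, feats)
y = gru(x)                                 # returns only the GRU output sequence
assert y.shape == (batch, frames, 512)     # 2 directions * 256 hidden units
```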
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from librosa.filters import mel 5 | 6 | 7 | class MelSpectrogram(torch.nn.Module): 8 | def __init__( 9 | self, 10 | n_mel_channels, 11 | sampling_rate, 12 | win_length, 13 | hop_length, 14 | n_fft=None, 15 | mel_fmin=0, 16 | mel_fmax=None, 17 | clamp = 1e-5 18 | ): 19 | super().__init__() 20 | n_fft = win_length if n_fft is None else n_fft 21 | self.hann_window = {} 22 | mel_basis = mel( 23 | sr=sampling_rate, 24 | n_fft=n_fft, 25 | n_mels=n_mel_channels, 26 | fmin=mel_fmin, 27 | fmax=mel_fmax, 28 | htk=True) 29 | mel_basis = torch.from_numpy(mel_basis).float() 30 | self.register_buffer("mel_basis", mel_basis) 31 | self.n_fft = win_length if n_fft is None else n_fft 32 | self.hop_length = hop_length 33 | self.win_length = win_length 34 | self.sampling_rate = sampling_rate 35 | self.n_mel_channels = n_mel_channels 36 | self.clamp = clamp 37 | 38 | def forward(self, audio, keyshift=0, speed=1, center=True): 39 | factor = 2 ** (keyshift / 12) 40 | n_fft_new = int(np.round(self.n_fft * factor)) 41 | win_length_new = int(np.round(self.win_length * factor)) 42 | hop_length_new = int(np.round(self.hop_length * speed)) 43 | 44 | keyshift_key = str(keyshift)+'_'+str(audio.device) 45 | if keyshift_key not in self.hann_window: 46 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 47 | 48 | fft = torch.stft( 49 | audio, 50 | n_fft=n_fft_new, 51 | hop_length=hop_length_new, 52 | win_length=win_length_new, 53 | window=self.hann_window[keyshift_key], 54 | center=center, 55 | return_complex=True) 56 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 57 | 58 | if keyshift != 0: 59 | size = self.n_fft // 2 + 1 60 | resize = magnitude.size(1) 61 | if resize < size: 62 | magnitude = F.pad(magnitude, (0, 0, 0, size-resize)) 63 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 64 | 65 | mel_output = torch.matmul(self.mel_basis, magnitude) 66 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 67 | return log_mel_spec -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import reduce 3 | 4 | import librosa 5 | import numpy as np 6 | import torch 7 | from torch.nn.modules.module import _addindent 8 | 9 | from .constants import * # noqa: F403 10 | 11 | 12 | def cycle(iterable): 13 | while True: 14 | for item in iterable: 15 | yield item 16 | 17 | 18 | def summary(model, file=sys.stdout): 19 | def repr(model): 20 | # We treat the extra repr like the sub-module, one item per line 21 | extra_lines = [] 22 | extra_repr = model.extra_repr() 23 | # empty string will be split into list [''] 24 | if extra_repr: 25 | extra_lines = extra_repr.split('\n') 26 | child_lines = [] 27 | total_params = 0 28 | for key, module in model._modules.items(): 29 | mod_str, num_params = repr(module) 30 | mod_str = _addindent(mod_str, 2) 31 | child_lines.append('(' + key + '): ' + mod_str) 32 | total_params += num_params 33 | lines = extra_lines + child_lines 34 | 35 | for name, p in model._parameters.items(): 36 | if hasattr(p, 'shape'): 37 | total_params += reduce(lambda x, y: x * y, p.shape) 38 | 39 | main_str = model._get_name() + '(' 40 | if lines: 41 | # simple one-liner info, which most builtin Modules will use 42 | if 
len(extra_lines) == 1 and not child_lines: 43 | main_str += extra_lines[0] 44 | else: 45 | main_str += '\n ' + '\n '.join(lines) + '\n' 46 | 47 | main_str += ')' 48 | if file is sys.stdout: 49 | main_str += ', \033[92m{:,}\033[0m params'.format(total_params) 50 | else: 51 | main_str += ', {:,} params'.format(total_params) 52 | return main_str, total_params 53 | 54 | string, count = repr(model) 55 | if file is not None: 56 | if isinstance(file, str): 57 | file = open(file, 'w') 58 | print(string, file=file) 59 | file.flush() 60 | 61 | return count 62 | 63 | 64 | def to_local_average_cents(salience, center=None, thred=0.05): 65 | """ 66 | find the weighted average cents near the argmax bin 67 | """ 68 | 69 | if not hasattr(to_local_average_cents, 'cents_mapping'): 70 | # the bin number-to-cents mapping 71 | to_local_average_cents.cents_mapping = ( 72 | 20 * torch.arange(N_CLASS) + CONST).to(salience.device) # noqa: F405 73 | 74 | if salience.ndim == 1: 75 | if center is None: 76 | center = int(torch.argmax(salience)) 77 | start = max(0, center - 4) 78 | end = min(len(salience), center + 5) 79 | salience = salience[start:end] 80 | product_sum = torch.sum( 81 | salience * to_local_average_cents.cents_mapping[start:end]) 82 | weight_sum = torch.sum(salience) 83 | return product_sum / weight_sum if torch.max(salience) > thred else 0 84 | if salience.ndim == 2: 85 | return torch.Tensor([to_local_average_cents(salience[i, :], None, thred) for i in 86 | range(salience.shape[0])]).to(salience.device) 87 | 88 | raise Exception("label should be either 1d or 2d ndarray") 89 | 90 | def to_viterbi_cents(salience, thred=0.05): 91 | # Create viterbi transition matrix 92 | if not hasattr(to_viterbi_cents, 'transition'): 93 | xx, yy = torch.meshgrid(range(N_CLASS), range(N_CLASS)) # noqa: F405 94 | transition = torch.maximum(30 - abs(xx - yy), 0) 95 | transition = transition / transition.sum(axis=1, keepdims=True) 96 | to_viterbi_cents.transition = transition 97 | 98 | # Convert to probability 99 | prob = salience.T 100 | prob = prob / prob.sum(axis=0) 101 | 102 | # Perform viterbi decoding 103 | path = librosa.sequence.viterbi(prob.detach().cpu().numpy(), to_viterbi_cents.transition).astype(np.int64) 104 | 105 | return torch.Tensor([to_local_average_cents(salience[i, :], path[i], thred) for i in 106 | range(len(path))]).to(salience.device) 107 | -------------------------------------------------------------------------------- /models/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self -------------------------------------------------------------------------------- /models/hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | "resblock_initial_channel": 256, 17 | 18 | "segment_size": 8192, 19 | "num_mels": 80, 20 | "num_freq": 1025, 21 | "n_fft": 1024, 22 | "hop_size": 256, 23 | "win_size": 1024, 24 | 25 | "sampling_rate": 22050, 26 
| 27 | "fmin": 0, 28 | "fmax": 8000, 29 | "fmax_loss": null, 30 | 31 | "num_workers": 4, 32 | 33 | "dist_config": { 34 | "dist_backend": "nccl", 35 | "dist_url": "tcp://localhost:54321", 36 | "world_size": 1 37 | } 38 | } -------------------------------------------------------------------------------- /models/hifigan/generator_v1.txt: -------------------------------------------------------------------------------- 1 | https://github.com/jik876/hifi-gan -------------------------------------------------------------------------------- /models/speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /models/speaker_encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from models.speaker_encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import numpy as np 6 | import webrtcvad 7 | import librosa 8 | import struct 9 | 10 | int16_max = (2 ** 15) - 1 11 | 12 | 13 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 14 | source_sr: Optional[int] = None): 15 | """ 16 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 17 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 18 | 19 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 20 | just .wav), either the waveform as a numpy array of floats. 21 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 22 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 23 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 24 | this argument will be ignored. 25 | """ 26 | # Load the wav from disk if needed 27 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 28 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 29 | else: 30 | wav = fpath_or_wav 31 | 32 | # Resample the wav if needed 33 | if source_sr is not None and source_sr != sampling_rate: 34 | wav = librosa.resample(wav, source_sr, sampling_rate) 35 | 36 | # Apply the preprocessing: normalize volume and shorten long silences 37 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 38 | wav = trim_long_silences(wav) 39 | 40 | return wav 41 | 42 | 43 | def wav_to_mel_spectrogram(wav): 44 | """ 45 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 46 | Note: this not a log-mel spectrogram. 47 | """ 48 | frames = librosa.feature.melspectrogram( 49 | y=wav, 50 | sr=sampling_rate, 51 | n_fft=int(sampling_rate * mel_window_length / 1000), 52 | hop_length=int(sampling_rate * mel_window_step / 1000), 53 | n_mels=mel_n_channels 54 | ) 55 | return frames.astype(np.float32).T 56 | 57 | 58 | def trim_long_silences(wav): 59 | """ 60 | Ensures that segments without voice in the waveform remain no longer than a 61 | threshold determined by the VAD parameters in params.py. 
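    For the default VAD hyperparameters (vad_window_length = 30 ms, sampling_rate = 16000 Hz),
    each VAD window therefore covers (30 * 16000) // 1000 = 480 samples, i.e. 960 bytes of
    16-bit PCM, which is exactly the slice handed to webrtcvad below.

    A minimal usage sketch (illustrative only; the file name is hypothetical and the waveform
    is assumed to be mono and already at `sampling_rate`):

        wav, _ = librosa.load("utterance.wav", sr=sampling_rate)
        trimmed = trim_long_silences(wav)
        assert len(trimmed) <= len(wav)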
62 | 63 | :param wav: the raw waveform as a numpy array of floats 64 | :return: the same waveform with silences trimmed away (length <= original wav length) 65 | """ 66 | # Compute the voice detection window size 67 | samples_per_window = (vad_window_length * sampling_rate) // 1000 68 | 69 | # Trim the end of the audio to have a multiple of the window size 70 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 71 | 72 | # Convert the float waveform to 16-bit mono PCM 73 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 74 | 75 | # Perform voice activity detection 76 | voice_flags = [] 77 | vad = webrtcvad.Vad(mode=3) 78 | for window_start in range(0, len(wav), samples_per_window): 79 | window_end = window_start + samples_per_window 80 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 81 | sample_rate=sampling_rate)) 82 | voice_flags = np.array(voice_flags) 83 | 84 | # Smooth the voice detection with a moving average 85 | def moving_average(array, width): 86 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 87 | ret = np.cumsum(array_padded, dtype=float) 88 | ret[width:] = ret[width:] - ret[:-width] 89 | return ret[width - 1:] / width 90 | 91 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 92 | audio_mask = np.round(audio_mask).astype(bool) 93 | 94 | # Dilate the voiced regions 95 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 96 | audio_mask = np.repeat(audio_mask, samples_per_window) 97 | 98 | return wav[audio_mask == True] 99 | 100 | 101 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 102 | if increase_only and decrease_only: 103 | raise ValueError("Both increase only and decrease only are set") 104 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 105 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 106 | return wav 107 | return wav * (10 ** (dBFS_change / 20)) 108 | -------------------------------------------------------------------------------- /models/speaker_encoder/compute_embed.py: -------------------------------------------------------------------------------- 1 | from models.speaker_encoder import inference as encoder 2 | from multiprocessing.pool import Pool 3 | from functools import partial 4 | from pathlib import Path 5 | # from utils import logmmse 6 | from tqdm import tqdm 7 | import numpy as np 8 | # import librosa 9 | 10 | 11 | def embed_utterance(fpaths, encoder_model_fpath): 12 | if not encoder.is_loaded(): 13 | encoder.load_model(encoder_model_fpath) 14 | 15 | # Compute the speaker embedding of the utterance 16 | wav_fpath, embed_fpath = fpaths 17 | wav = np.load(wav_fpath) 18 | wav = encoder.preprocess_wav(wav) 19 | embed = encoder.embed_utterance(wav) 20 | np.save(embed_fpath, embed, allow_pickle=False) 21 | 22 | 23 | def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): 24 | 25 | wav_dir = outdir_root.joinpath("audio") 26 | metadata_fpath = outdir_root.joinpath("train.txt") 27 | assert wav_dir.exists() and metadata_fpath.exists() 28 | embed_dir = outdir_root.joinpath("embeds") 29 | embed_dir.mkdir(exist_ok=True) 30 | 31 | # Gather the input wave filepath and the target output embed filepath 32 | with metadata_fpath.open("r") as metadata_file: 33 | metadata = [line.split("|") for line in metadata_file] 34 | fpaths = [(wav_dir.joinpath(m[0]), 
embed_dir.joinpath(m[2])) for m in metadata] 35 | 36 | # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. 37 | # Embed the utterances in separate threads 38 | func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) 39 | job = Pool(n_processes).imap(func, fpaths) 40 | list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) -------------------------------------------------------------------------------- /models/speaker_encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | "test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | 40 | other_datasets = [ 41 | "LJSpeech-1.1", 42 | "VCTK-Corpus/wav48", 43 | ] 44 | 45 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 46 | -------------------------------------------------------------------------------- /models/speaker_encoder/hparams.py: -------------------------------------------------------------------------------- 1 | ## Mel-filterbank 2 | mel_window_length = 25 # In milliseconds 3 | mel_window_step = 10 # In milliseconds 4 | mel_n_channels = 40 5 | 6 | 7 | ## Audio 8 | sampling_rate = 16000 9 | # Number of spectrogram frames in a partial utterance 10 | partials_n_frames = 160 # 1600 ms 11 | 12 | 13 | ## Voice Activation Detection 14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 15 | # This sets the granularity of the VAD. Should not need to be changed. 16 | vad_window_length = 30 # In milliseconds 17 | # Number of frames to average together when performing the moving average smoothing. 18 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 19 | vad_moving_average_width = 8 20 | # Maximum number of consecutive silent frames a segment can have. 
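# (Measured in VAD windows: with the 30 ms window above, the default of 6 windows
#  corresponds to roughly 6 * 30 ms = 180 ms of silence tolerated inside a segment.)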
21 | vad_max_silence_length = 6 22 | 23 | 24 | ## Audio volume normalization 25 | audio_norm_target_dBFS = -30 26 | 27 | 28 | ## Model parameters 29 | model_hidden_size = 256 30 | model_embedding_size = 256 31 | model_num_layers = 3 -------------------------------------------------------------------------------- /models/speaker_encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /models/speaker_encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /models/spin/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.model import SpinModel # noqa 2 | from .src.data.dataset import collate_fn as spin_collate_fn # noqa -------------------------------------------------------------------------------- /models/spin/spin.yaml: -------------------------------------------------------------------------------- 1 | # Interspeech 2023 version 2 | 3 | # Training data 4 | data: 5 | json_dir: /data/sls/r/u/hengjui/home/scratch/spin_test/data 6 | splits: 7 | - train-clean-100 8 | sample_rate: 16000 9 | min_audio_len: 40000 # minimum audio samples per utterance 10 | random_crop_len: 272000 # maximum audio samples per utterance 11 | spk2info: /root/RVC_Spin/spin_train/spk_to_f0.csv 12 | out_of_len_audios: /root/RVC_Spin/spin_train/out_of_len_audios.txt 13 | 14 | # Validation data (not used for checkpointing, just for monitoring training progress) 15 | val_data: 16 | json_dir: /data/sls/r/u/hengjui/home/scratch/spin_test/data 17 | phn_dir: /root/RVC_Spin/spin_train/phone_alignment_info 18 | data_dir: /libri_tts/LibriTTS/dev-clean 19 | out_of_len_audios: /root/RVC_Spin/spin_train/out_of_len_audios_val.txt 20 | splits: 21 | - libri-dev-clean 22 | - libri-dev-other 23 | sample_rate: 16000 24 | 25 | # SpinModel config 26 | model: 27 | encoder: 28 | type: HuBERT # `HuBERT` / `WavLM` 29 | use_layer: 12 # the layer which its representations are used for clustering 30 | normalize: False 31 | feat_select: x 32 | randomize_all: False 33 | randomize_layers: [] 34 | 
freeze_all: False 35 | freeze_layers: ["pos", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # `pos`: positional encoding, `0`: CNN extractor 36 | pred_head: 37 | type: DNN 38 | hid_dims: [256] 39 | dropout: 0 40 | activation: ReLU 41 | loss: 42 | type: SwavVQDisentangle 43 | num_vars: 2048 # cluster size 44 | epsilon: 0.02 45 | sinkhorn_iters: 3 46 | temp: 0.1 47 | l2_norm: True 48 | prob_ratio: 1.0 49 | 50 | # Optimization 51 | optim: 52 | optimizer: 53 | name: Adam 54 | args: 55 | lr: 1.e-4 56 | weight_decay: 1.e-6 57 | scheduler: 58 | name: linear_warmup_decay # `linear_warmup_decay` / `linear_warmup_cosine_scheduler` / `noam_scheduler` 59 | args: 60 | warmup: 2500 61 | max_step: 63052 62 | final_lr: 1.e-6 63 | 64 | hparam: 65 | #batch_len: 4096000 # audio samples per GPU (256 secs ~ batch_size = 12.8k) 4096000 66 | batch_size: 32 67 | val_batch_size: 8 68 | 69 | # pytorch_lightning.Trainer 70 | # ref: https://lightning.ai/docs/pytorch/latest/common/trainer.html 71 | trainer: 72 | max_steps: 63052 73 | gradient_clip_val: 10 74 | accumulate_grad_batches: 1 75 | precision: 16 76 | logger: wandb # use `False` to disable logging 77 | log_every_n_steps: 100 78 | default_root_dir: exp/tmp 79 | accelerator: gpu 80 | 81 | strategy: ddp_find_unused_parameters_true # UNCOMMENT this line to enable DDP training 82 | 83 | num_sanity_val_steps: 0 84 | val_check_interval: 1000 85 | 86 | # pytorch_lightning.callbacks.ModelCheckpoint 87 | # ref: https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.ModelCheckpoint.html 88 | checkpoint: 89 | filename: "{epoch}-{step}" 90 | every_n_train_steps: 2000 91 | save_last: true 92 | 93 | # pytorch_lightning.loggers.WandbLogger 94 | # ref: https://lightning.ai/docs/pytorch/latest/extensions/generated/lightning.pytorch.loggers.WandbLogger.html 95 | logger: 96 | project: spin_is2023 -------------------------------------------------------------------------------- /models/spin/src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import ( 2 | AudioPretrainDataset, 3 | AudioPretrainPnmiValDataset, 4 | collate_fn, 5 | val_collate_fn, 6 | ) 7 | from .sampler import MaxLengthBatchSampler, MaxLengthDistributedSampler 8 | -------------------------------------------------------------------------------- /models/spin/src/data/librispeech.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import List, Tuple 4 | 5 | import torchaudio 6 | from tqdm import tqdm 7 | 8 | 9 | def find_all_librispeech(root: str, sort_by_len: bool = False) -> List[Tuple[str, int]]: 10 | files = list(Path(root).rglob("*.flac")) 11 | files = [str(f) for f in files] 12 | file_lens = [torchaudio.info(f).num_frames for f in tqdm(files)] 13 | assert len(files) == len(file_lens), (len(files), len(file_lens)) 14 | data = sorted( 15 | zip(files, file_lens), key=lambda x: x[1 if sort_by_len else 0], reverse=True 16 | ) 17 | return data 18 | 19 | 20 | def save_data_info(data: List[Tuple[str, int]], path: str) -> None: 21 | with open(path, "w") as fp: 22 | json.dump(data, fp, indent=2) 23 | -------------------------------------------------------------------------------- /models/spin/src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .spin import SpinModel 2 | -------------------------------------------------------------------------------- /models/spin/src/model/base.py: 
-------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import pytorch_lightning as pl 4 | import yaml 5 | 6 | 7 | class BaseModel(pl.LightningModule): 8 | def __init__(self, config) -> None: 9 | super().__init__() 10 | 11 | if isinstance(config, str) and config.split(".")[-1] in {"yaml", "yml"}: 12 | config = yaml.load(open(config, "r"), Loader=yaml.FullLoader) 13 | 14 | self.config = config 15 | self.save_hyperparameters(config) 16 | 17 | @abc.abstractmethod 18 | def forward(self, batch): 19 | raise NotImplementedError 20 | 21 | @abc.abstractmethod 22 | def training_step(self, batch, batch_idx): 23 | raise NotImplementedError 24 | 25 | @abc.abstractmethod 26 | def configure_optimizers(self): 27 | raise NotImplementedError 28 | -------------------------------------------------------------------------------- /models/spin/src/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .dnn import DNN 2 | from .hubert import HuBERT 3 | from .swav_vq_dis import SwavVQDisentangle 4 | from .wavlm import WavLM 5 | -------------------------------------------------------------------------------- /models/spin/src/nn/dnn.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class DNN(nn.Module): 8 | def __init__( 9 | self, 10 | in_dim: int, 11 | hid_dims: List[int], 12 | dropout: float = 0.0, 13 | activation: str = "ReLU", 14 | activate_last: bool = False, 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.in_dim = in_dim 19 | self.out_dim = hid_dims[-1] 20 | self.activate_last = activate_last 21 | 22 | assert len(hid_dims) > 0, len(hid_dims) 23 | hid_dims = [in_dim] + hid_dims 24 | 25 | self.layers = nn.ModuleList( 26 | [nn.Linear(hid_dims[i], hid_dims[i + 1]) for i in range(len(hid_dims) - 1)] 27 | ) 28 | self.num_layer = len(self.layers) 29 | self.dropout = nn.Dropout(dropout) 30 | n_acts = self.num_layer - (0 if self.activate_last else 1) 31 | self.acts = nn.ModuleList([getattr(nn, activation)() for _ in range(n_acts)]) 32 | 33 | def forward(self, x: torch.Tensor, x_len: torch.LongTensor = None) -> torch.Tensor: 34 | for i in range(self.num_layer): 35 | x = self.layers[i](x) 36 | if i < self.num_layer - 1 or self.activate_last: 37 | x = self.dropout(x) 38 | x = self.acts[i](x) 39 | return x 40 | -------------------------------------------------------------------------------- /models/spin/src/task/__init__.py: -------------------------------------------------------------------------------- 1 | from .train_spin import SpinPretrainTask 2 | -------------------------------------------------------------------------------- /models/spin/src/task/train_spin.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import yaml 5 | from pytorch_lightning import Trainer, seed_everything 6 | from pytorch_lightning.callbacks import ( 7 | LearningRateMonitor, 8 | ModelCheckpoint, 9 | TQDMProgressBar, 10 | ) 11 | from torch.utils.data import DataLoader 12 | 13 | from src.data import AudioPretrainPnmiValDataset, val_collate_fn 14 | from src.model import SpinModel 15 | from src.util import set_logging, set_pl_logger 16 | 17 | 18 | class SpinPretrainTask: 19 | def __init__(self): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("task", help="Task name") 22 | parser.add_argument("--config", "-c", help="Config .yaml 
file") 23 | parser.add_argument("--save-path", "-s", help="Path to save exp") 24 | parser.add_argument("--resume", "-r", default="", help="Resume training") 25 | parser.add_argument("--gpus", "-g", type=int, default=1, help="Number of GPUs") 26 | parser.add_argument( 27 | "--njobs", "-j", type=int, default=8, help="Number of workers" 28 | ) 29 | parser.add_argument("--seed", type=int, default=7122, help="Random seed") 30 | parser.add_argument("--log-level", default="info", help="Logging level") 31 | args = parser.parse_args() 32 | 33 | if not torch.cuda.is_available(): 34 | args.device = "cpu" 35 | args.gpus = 0 36 | else: 37 | args.device = "cuda" if args.gpus > 0 else "cpu" 38 | 39 | self.args = args 40 | set_logging(args.log_level) 41 | 42 | def run(self, model_cls=SpinModel): 43 | assert isinstance(self.args, argparse.Namespace) 44 | 45 | config = yaml.load(open(self.args.config, "r"), Loader=yaml.FullLoader) 46 | self.config = config 47 | 48 | use_ddp = ( 49 | config["trainer"].get("strategy", "").startswith("ddp") 50 | and self.args.gpus > 1 51 | ) 52 | 53 | if self.args.save_path != "": 54 | config["trainer"]["default_root_dir"] = self.args.save_path 55 | 56 | model_checkpoint = ModelCheckpoint( 57 | dirpath=config["trainer"]["default_root_dir"], **config["checkpoint"] 58 | ) 59 | 60 | config["trainer"]["logger"] = set_pl_logger( 61 | config["trainer"]["logger"], 62 | config["logger"]["project"], 63 | config["trainer"]["default_root_dir"].split("/")[-1], 64 | ) 65 | 66 | trainer = Trainer( 67 | callbacks=[ 68 | TQDMProgressBar(), 69 | model_checkpoint, 70 | LearningRateMonitor("step"), 71 | ], 72 | enable_progress_bar=True, 73 | devices=self.args.gpus, 74 | check_val_every_n_epoch=None, 75 | use_distributed_sampler=False, 76 | sync_batchnorm=use_ddp, 77 | **config["trainer"], 78 | ) 79 | 80 | seed_everything(self.args.seed) 81 | 82 | if config.get("val_data", None) is not None: 83 | val_dataset = AudioPretrainPnmiValDataset(**config["val_data"]) 84 | val_loader = DataLoader( 85 | val_dataset, 86 | batch_size=config["hparam"]["val_batch_size"], 87 | num_workers=self.args.njobs, 88 | pin_memory=True, 89 | collate_fn=val_collate_fn, 90 | shuffle=False, 91 | drop_last=False, 92 | ) 93 | else: 94 | val_dataset = None 95 | val_loader = None 96 | 97 | if self.args.resume != "": 98 | model = model_cls.load_from_checkpoint(self.args.resume) 99 | else: 100 | self.args.resume = None 101 | model = model_cls(config, 2) 102 | 103 | model.set_random_seed(self.args.seed) 104 | model.set_njobs(self.args.njobs) 105 | model.set_use_ddp(use_ddp) 106 | 107 | trainer.fit(model, val_dataloaders=val_loader, ckpt_path=self.args.resume) 108 | -------------------------------------------------------------------------------- /models/spin/src/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import set_logging, set_pl_logger 2 | from .model_utils import ( 3 | count_parameters, 4 | freeze_module, 5 | init_module, 6 | init_module_bert, 7 | init_module_cnn, 8 | init_module_pos_conv, 9 | unfreeze_module, 10 | ) 11 | from .padding import ( 12 | add_front_padding_mask, 13 | len_to_padding, 14 | padding_to_len, 15 | update_padding_mask, 16 | ) 17 | from .pnmi import compute_show_pnmi, compute_snmi 18 | from .scheduler import get_scheduler 19 | -------------------------------------------------------------------------------- /models/spin/src/util/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
from typing import Union 3 | 4 | from pytorch_lightning.loggers import WandbLogger 5 | 6 | 7 | def set_logging(log_level: str = "info") -> None: 8 | level = getattr(logging, str(log_level).upper()) 9 | logging.basicConfig( 10 | level=level, 11 | format="%(asctime)s %(filename)s.%(funcName)s %(message)s", 12 | datefmt="%m-%d %H:%M", 13 | ) 14 | 15 | 16 | def set_pl_logger( 17 | logger_type: Union[bool, str], 18 | project: str = "speech_disentangle", 19 | name: str = "example", 20 | ): 21 | if isinstance(logger_type, bool): 22 | return logger_type 23 | elif logger_type == "wandb": 24 | logger = WandbLogger(project=project, name=name) 25 | return logger 26 | else: 27 | raise NotImplementedError(f"Unknown logger type = {logger_type}") 28 | -------------------------------------------------------------------------------- /models/spin/src/util/model_utils.py: -------------------------------------------------------------------------------- 1 | from s3prl.upstream.wav2vec2.wav2vec2_model import MultiheadAttention 2 | from torch import nn 3 | 4 | 5 | def freeze_module(m: nn.Module) -> None: 6 | for p in m.parameters(): 7 | p.requires_grad = False 8 | 9 | 10 | def unfreeze_module(m: nn.Module) -> None: 11 | for p in m.parameters(): 12 | p.requires_grad = True 13 | 14 | 15 | def init_module(m: nn.Module): 16 | for p in m.parameters(): 17 | nn.init.normal_(p, mean=0, std=0.02) 18 | 19 | 20 | def init_module_bert(m: nn.Module): 21 | def normal_(data): 22 | # with FSDP, module params will be on CUDA, so we cast them back to CPU 23 | # so that the RNG is consistent with and without FSDP 24 | data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) 25 | 26 | if isinstance(m, nn.Linear): 27 | normal_(m.weight.data) 28 | if m.bias is not None: 29 | m.bias.data.zero_() 30 | if isinstance(m, nn.Embedding): 31 | normal_(m.weight.data) 32 | if m.padding_idx is not None: 33 | m.weight.data[m.padding_idx].zero_() 34 | if isinstance(m, MultiheadAttention): 35 | normal_(m.q_proj.weight.data) 36 | normal_(m.k_proj.weight.data) 37 | normal_(m.v_proj.weight.data) 38 | 39 | 40 | def init_module_cnn(m: nn.Module): 41 | if isinstance(m, nn.Conv1d): 42 | nn.init.kaiming_normal_(m.weight) 43 | if isinstance(m, nn.LayerNorm): 44 | m.reset_parameters() 45 | 46 | 47 | def init_module_pos_conv(m: nn.Module): 48 | if isinstance(m, nn.Conv1d): 49 | m.reset_parameters() 50 | if isinstance(m, nn.LayerNorm): 51 | m.reset_parameters() 52 | 53 | 54 | def count_parameters(model: nn.Module) -> int: 55 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 56 | -------------------------------------------------------------------------------- /models/spin/src/util/padding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.no_grad() 5 | def len_to_padding(x_len: torch.LongTensor, max_len: int = 0) -> torch.BoolTensor: 6 | if max_len == 0: 7 | max_len = max(x_len) 8 | idxs = torch.arange(max_len, dtype=torch.long).to(x_len.device) 9 | padding_mask = idxs.unsqueeze(0) >= x_len.unsqueeze(1) 10 | return padding_mask 11 | 12 | 13 | @torch.no_grad() 14 | def padding_to_len(padding_mask: torch.BoolTensor) -> torch.LongTensor: 15 | x_len = (~padding_mask).long().sum(-1) 16 | return x_len 17 | 18 | 19 | @torch.no_grad() 20 | def update_padding_mask( 21 | padding_mask: torch.BoolTensor, new_len: int 22 | ) -> torch.BoolTensor: 23 | extra = padding_mask.shape[1] % new_len 24 | if extra > 0: 25 | padding_mask = padding_mask[:, :-extra] 26 | padding_mask = 
padding_mask.view(padding_mask.shape[0], new_len, -1) 27 | padding_mask = padding_mask.all(-1) 28 | return padding_mask 29 | 30 | 31 | @torch.no_grad() 32 | def add_front_padding_mask( 33 | padding_mask: torch.BoolTensor, pad_front_lens: torch.LongTensor 34 | ) -> None: 35 | for i in range(len(padding_mask)): 36 | if pad_front_lens[i] > 0: 37 | padding_mask[i, : pad_front_lens[i]] = True 38 | -------------------------------------------------------------------------------- /models/spin/src/util/pnmi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | from tabulate import tabulate 10 | 11 | 12 | def comp_purity(p_xy, axis): 13 | max_p = p_xy.max(axis=axis) 14 | marg_p = p_xy.sum(axis=axis) 15 | indv_pur = max_p / marg_p 16 | aggr_pur = max_p.sum() 17 | return indv_pur, aggr_pur 18 | 19 | 20 | def comp_entropy(p): 21 | return (-p * np.log(p + 1e-8)).sum() 22 | 23 | 24 | def comp_norm_mutual_info(p_xy): 25 | p_x = p_xy.sum(axis=1, keepdims=True) 26 | p_y = p_xy.sum(axis=0, keepdims=True) 27 | pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8) 28 | mi = (p_xy * pmi).sum() 29 | h_x = comp_entropy(p_x) 30 | h_y = comp_entropy(p_y) 31 | return mi, mi / h_x, mi / h_y, h_x, h_y 32 | 33 | 34 | def pad(labs, n): 35 | if n == 0: 36 | return np.array(labs) 37 | return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n]) 38 | 39 | 40 | def comp_avg_seg_dur(labs_list): 41 | n_frms = 0 42 | n_segs = 0 43 | for labs in labs_list: 44 | labs = np.array(labs) 45 | edges = np.zeros(len(labs)).astype(bool) 46 | edges[0] = True 47 | edges[1:] = labs[1:] != labs[:-1] 48 | n_frms += len(edges) 49 | n_segs += edges.astype(int).sum() 50 | return n_frms / n_segs 51 | 52 | 53 | def comp_joint_prob(uid2refs, uid2hyps): 54 | cnts = Counter() 55 | skipped = [] 56 | abs_frmdiff = 0 57 | for uid in uid2refs: 58 | if uid not in uid2hyps: 59 | skipped.append(uid) 60 | continue 61 | refs = uid2refs[uid] 62 | hyps = uid2hyps[uid] 63 | abs_frmdiff += abs(len(refs) - len(hyps)) 64 | min_len = min(len(refs), len(hyps)) 65 | refs = refs[:min_len] 66 | hyps = hyps[:min_len] 67 | cnts.update(zip(refs, hyps)) 68 | tot = sum(cnts.values()) 69 | 70 | ref_set = sorted({ref for ref, _ in cnts.keys()}) 71 | hyp_set = sorted({hyp for _, hyp in cnts.keys()}) 72 | ref2pid = dict(zip(ref_set, range(len(ref_set)))) 73 | hyp2lid = dict(zip(hyp_set, range(len(hyp_set)))) 74 | 75 | p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float) 76 | for (ref, hyp), cnt in cnts.items(): 77 | p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt 78 | freq_xy = p_xy 79 | full_freq_xy = np.zeros((len(ref2pid), 4096), dtype=float) 80 | for (ref, hyp), cnt in cnts.items(): 81 | full_freq_xy[ref2pid[ref], int(hyp)] = cnt 82 | p_xy = p_xy / p_xy.sum() 83 | return ( 84 | freq_xy, 85 | full_freq_xy, 86 | p_xy, 87 | ref2pid, 88 | hyp2lid, 89 | tot, 90 | abs_frmdiff, 91 | skipped, 92 | ref_set, 93 | hyp_set, 94 | ) 95 | 96 | 97 | def comp_phone2code(p_xy): 98 | p_x = p_xy.sum(axis=1, keepdims=True) # ref (phone) 99 | p_y = p_xy.sum(axis=0, keepdims=True) # hyp (code) 100 | 101 | p_x_y = p_xy / p_y # P(x | y) = P(phone | code) 102 | 103 | y_order = np.argsort(p_x_y.argmax(0)) 104 | p_x_y_sorted_y = np.take_along_axis(p_x_y, y_order.reshape((1, -1)), axis=1) 105 | 106 | x_order = 
np.argsort(p_x[:, 0]) 107 | x_order = np.flip(x_order) 108 | p_x_y_sorted_x = np.take_along_axis(p_x_y, x_order.reshape((-1, 1)), axis=0) 109 | y_order = np.argsort(p_x_y_sorted_x.argmax(0)) 110 | p_x_y_sorted_xy = np.take_along_axis( 111 | p_x_y_sorted_x, y_order.reshape((1, -1)), axis=1 112 | ) 113 | 114 | return p_x_y, p_x_y_sorted_xy, p_x_y_sorted_y, x_order 115 | 116 | 117 | def compute_show_pnmi(uid2refs, uid2hyps, upsample=1, show_results: bool = False): 118 | for k, v in uid2hyps.items(): 119 | uid2hyps[k] = pad(v, 0).repeat(upsample) 120 | 121 | ( 122 | freq_xy, 123 | full_freq_xy, 124 | p_xy, 125 | ref2pid, 126 | hyp2lid, 127 | tot, 128 | frmdiff, 129 | skipped, 130 | ref_set, 131 | hyp_set, 132 | ) = comp_joint_prob(uid2refs, uid2hyps) 133 | ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0) 134 | hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1) 135 | (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy) 136 | 137 | if show_results: 138 | print( 139 | tabulate( 140 | [[hyp_pur, ref_pur, mi_norm_by_ref]], 141 | ["Cls Pur", "Phn Pur", "PNMI"], 142 | floatfmt=".3f", 143 | tablefmt="fancy_grid", 144 | ) 145 | ) 146 | 147 | return { 148 | "cls_pur": hyp_pur, 149 | "phn_pur": ref_pur, 150 | "pnmi": mi_norm_by_ref, 151 | } 152 | 153 | 154 | def compute_snmi(p_xy): 155 | _, ref_pur = comp_purity(p_xy, axis=0) 156 | _, hyp_pur = comp_purity(p_xy, axis=1) 157 | (_, mi_norm_by_ref, _, _, _) = comp_norm_mutual_info(p_xy) 158 | 159 | return { 160 | "cls_pur": hyp_pur, 161 | "spk_pur": ref_pur, 162 | "snmi": mi_norm_by_ref, 163 | } 164 | -------------------------------------------------------------------------------- /models/spin/src/util/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from torch.optim import Optimizer 4 | from torch.optim.lr_scheduler import LambdaLR, _LRScheduler 5 | 6 | 7 | def get_lr(optimizer: Optimizer) -> float: 8 | for param_group in optimizer.param_groups: 9 | return param_group["lr"] 10 | 11 | 12 | def noam_scheduler( 13 | optimizer: Optimizer, warmup: int = 4000, last_epoch: int = -1 14 | ) -> _LRScheduler: 15 | def func(step: int): 16 | if step < warmup: 17 | return (step + 1) / warmup 18 | else: 19 | return (warmup / (step + 1)) ** 0.5 20 | 21 | return LambdaLR(optimizer, func, last_epoch) 22 | 23 | 24 | def linear_warmup_decay_scheduler( 25 | optimizer: Optimizer, 26 | warmup: int = 4000, 27 | max_step: int = 1000000, 28 | init_lr: float = 1e-6, 29 | final_lr: float = 1e-6, 30 | ) -> _LRScheduler: 31 | func_list = [] 32 | 33 | for param_group in optimizer.param_groups: 34 | base_lr = param_group["lr"] 35 | rate_i = init_lr / base_lr 36 | rate_f = final_lr / base_lr 37 | 38 | def func(step: int) -> float: 39 | if step <= warmup: 40 | return rate_i + (1.0 - rate_i) * step / warmup 41 | else: 42 | return 1.0 - (1.0 - rate_f) * (step - warmup) / (max_step - warmup - 1) 43 | 44 | func_list.append(func) 45 | 46 | return LambdaLR(optimizer, func_list) 47 | 48 | 49 | def linear_warmup_cosine_scheduler( 50 | optimizer: Optimizer, 51 | warmup: int = 4000, 52 | max_step: int = 1000000, 53 | final_lr: float = 1e-6, 54 | ) -> _LRScheduler: 55 | func_list = [] 56 | 57 | for param_group in optimizer.param_groups: 58 | base_lr = param_group["lr"] 59 | rate = final_lr / base_lr 60 | 61 | def func(step: int) -> float: 62 | if step < warmup: 63 | return (step + 1) / warmup 64 | else: 65 | q = 0.5 * ( 66 | 1 + math.cos(math.pi * (step + 1 - warmup) / (max_step - warmup)) 67 | ) 68 
| return (1.0 - rate) * q + rate 69 | 70 | func_list.append(func) 71 | 72 | return LambdaLR(optimizer, func_list) 73 | 74 | 75 | def get_scheduler(name: str, optimizer: Optimizer, **kwargs) -> _LRScheduler: 76 | if name == "noam": 77 | return noam_scheduler(optimizer, **kwargs) 78 | elif name == "linear_warmup_decay": 79 | return linear_warmup_decay_scheduler(optimizer, **kwargs) 80 | elif name == "linear_warmup_cosine": 81 | return linear_warmup_cosine_scheduler(optimizer, **kwargs) 82 | else: 83 | raise NotImplementedError(f"Unknown lr scheduler {name}") 84 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bernardo Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/metadata/img/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/ssl_singer_identity/metadata/img/byol.png -------------------------------------------------------------------------------- /models/ssl_singer_identity/metadata/img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/ssl_singer_identity/metadata/img/pipeline.png -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import losses 2 | 3 | from .model import load_model 4 | # from . import model 5 | # from . import trainer 6 | # from . import utils 7 | # from .data import siamese_encoders 8 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/callbacks/ma_updates.py: -------------------------------------------------------------------------------- 1 | from math import cos, pi 2 | from typing import Optional, Sequence 3 | 4 | import torch 5 | from pytorch_lightning import Callback, LightningModule, Trainer 6 | 7 | 8 | class MAWeightUpdate(Callback): 9 | """Weight update rule from BYOL. 
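    After every training batch the target (teacher) parameters are pulled toward the
    online (student) parameters with an exponential moving average,
    ``teacher_p = tau * teacher_p + (1 - tau) * student_p`` (see ``update_weights`` below).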
10 | Your model should have: 11 | - ``self.online_network`` 12 | - ``self.target_network`` 13 | Updates the target_network params using an exponential moving average update rule weighted by tau. 14 | BYOL claims this keeps the online_network from collapsing. 15 | .. note:: Automatically increases tau from ``initial_tau`` to 1.0 with every training step 16 | Example:: 17 | # model must have 2 attributes 18 | model = Model() 19 | model.online_network = ... 20 | model.target_network = ... 21 | trainer = Trainer(callbacks=[MAWeightUpdate()]) 22 | """ 23 | 24 | def __init__(self, initial_tau: float = 0.996, max_epochs=100, should_update: bool = True): 25 | """ 26 | Args: 27 | initial_tau: starting tau. Auto-updates with every training step 28 | """ 29 | super().__init__() 30 | self.initial_tau = initial_tau 31 | self.max_epochs = max_epochs 32 | self.should_update = should_update 33 | 34 | self.current_tau = initial_tau 35 | 36 | def on_train_batch_end( 37 | self, 38 | trainer: Trainer, 39 | pl_module: LightningModule, 40 | outputs: Sequence, 41 | batch: Sequence, 42 | batch_idx: int, 43 | unused: Optional[int] = 0 44 | ) -> None: 45 | # get networks 46 | student_network = pl_module.student_network 47 | teacher_network = pl_module.teacher_network 48 | 49 | # update weights 50 | self.update_weights(student_network, teacher_network) 51 | 52 | # log tau 53 | pl_module.log("hparams/MA rate", self.current_tau, prog_bar=False, logger=True) 54 | 55 | # update tau after 56 | if self.should_update: 57 | self.current_tau = self.update_tau(pl_module, trainer) 58 | 59 | def update_tau(self, pl_module: LightningModule, trainer: Trainer) -> float: 60 | max_steps = len(trainer.train_dataloader) * self.max_epochs 61 | tau = 1 - (1 - self.initial_tau) * (cos(pi * pl_module.global_step / max_steps) + 1) / 2 62 | return tau 63 | 64 | def update_weights( 65 | self, 66 | student_network: torch.nn.Module, 67 | teacher_network: torch.nn.Module 68 | ) -> None: 69 | # apply MA weight update 70 | for (name, student_p), (_, teacher_p) in zip( 71 | student_network.named_parameters(), 72 | teacher_network.named_parameters(), 73 | ): 74 | teacher_p.data = self.current_tau * teacher_p.data + (1 - self.current_tau) * student_p.data 75 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from singer_identity.utils.core import similarity, roll 4 | 5 | 6 | def std_batch(x, var=1, eps=1e-8): 7 | std = torch.sqrt(x.var(dim=0) + eps) 8 | return torch.mean(F.relu(var - std)) 9 | 10 | 11 | def variance_hinge_reg(x, y, var=1): 12 | # From https://github.com/facebookresearch/vicreg 13 | std_x = std_batch(x, var=var) 14 | std_y = std_batch(y, var=var) 15 | std_loss = std_x / 2 + std_y / 2 16 | return std_loss 17 | 18 | 19 | def covariance(x): 20 | # In official implementation they do mean over batch (to verify) 21 | # mean = x.mean(1, keepdims=True) 22 | mean = x.mean(dim=0) 23 | x = x - mean 24 | cov = torch.matmul(x.transpose(0, 1), x) / (x.shape[0] - 1) 25 | # cov = (x.T @ x) / (x.shape[0] - 1) 26 | return cov 27 | 28 | 29 | def covariance_reg(x, y): 30 | eye = torch.eye(x.shape[1]).to(x.device) 31 | cov_x = covariance(x) 32 | cov_y = covariance(y) 33 | assert cov_x.shape[0] == cov_x.shape[1] 34 | assert cov_y.shape[0] == cov_y.shape[1] 35 | cov_reg = (cov_x * (1 - eye)).pow(2).sum() / x.shape[1] + (cov_y * (1 - 
eye)).pow( 36 | 2 37 | ).sum() / x.shape[1] 38 | return cov_reg 39 | 40 | 41 | def invariance_loss(x, y): 42 | return F.mse_loss(x, y) 43 | 44 | 45 | def vicreg_loss(x, y, gamma=1, fact_inv_loss=1, fact_var=1, fact_cov=1): 46 | # Adapted from https://github.com/facebookresearch/vicreg 47 | repr_loss = invariance_loss(x, y) 48 | std_loss = variance_hinge_reg(x, y, var=gamma) 49 | cov_loss = covariance_reg(x, y) 50 | loss = fact_inv_loss * repr_loss + fact_var * std_loss + fact_cov * cov_loss 51 | return loss 52 | 53 | 54 | def compute_norms(*args): 55 | norms = [] 56 | for arg in args: 57 | norms.append(torch.sqrt((arg**2).sum(1))) 58 | return norms 59 | 60 | 61 | def align_loss(x, y, alpha=2): 62 | # From https://github.com/SsnL/align_uniform 63 | return (x - y).norm(p=2, dim=1).pow(alpha).mean() 64 | 65 | 66 | def uniform_loss(x, t=2): 67 | # From https://github.com/SsnL/align_uniform 68 | return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log() 69 | 70 | 71 | def contrastive_loss(z1, z2, temp=0.2, nr_negative=1, decouple=False): 72 | cost_pos = similarity(z1, z2, temp) # Positive samples 73 | cost_neg = [] 74 | 75 | n_rolls = min(z1.shape[0] - 1, nr_negative) # Number of negative samples 76 | curr_neg_z = z2 77 | 78 | for i in range(n_rolls): 79 | curr_neg_z = roll(curr_neg_z) # Shifts batch 80 | cost_neg.append(similarity(z1, curr_neg_z, temp)) # Negative sim. 81 | 82 | if not decouple: 83 | cost_neg.append(cost_pos) # Adds positive similarity in denominator 84 | 85 | cost_neg = torch.stack(cost_neg).transpose(1, 0) 86 | cost = (-cost_pos + torch.logsumexp(cost_neg, 1)).mean() 87 | # TODO: implement similarities with less operations, but this works 88 | ratio = torch.mean(cost_neg) / ( 89 | torch.mean(cost_pos) + torch.tensor(1e-6).type_as(z1) 90 | ) 91 | return cost, ratio.item() 92 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/models/network_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Union, Callable, List, Optional 4 | from torchvision.models import efficientnet_b0, efficientnet_b4 5 | import torchvision.transforms as vt 6 | 7 | 8 | def get_vision_backbone( 9 | vismod="efficientnet_b0", num_classes=1000, pretrained=False, **kwargs 10 | ): 11 | if vismod == "efficientnet_b0": 12 | return efficientnet_b0(pretrained=pretrained, num_classes=num_classes, **kwargs) 13 | elif vismod == "efficientnet_b4": 14 | return efficientnet_b4(pretrained=pretrained, num_classes=num_classes, **kwargs) 15 | 16 | else: 17 | raise NotImplementedError 18 | 19 | 20 | class Grey2Rgb(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.normalize = vt.Normalize( 24 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 25 | ) 26 | 27 | def forward(self, data): 28 | batch_size, freq_bins, times = data.shape 29 | data /= data.max() 30 | data = data.unsqueeze(1).expand(batch_size, 3, freq_bins, times) 31 | data = self.normalize(data) 32 | return data 33 | 34 | 35 | class LogScale(nn.Module): 36 | def forward(self, data): 37 | # eps = 1e-8 38 | eps = torch.tensor(1e-8, device=data.device) 39 | return torch.log(data + eps) 40 | 41 | 42 | class Aggregator(nn.Module): 43 | """Aggregates (in time) a list of features""" 44 | 45 | def __init__(self): 46 | super().__init__() 47 | self.aggregation = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(1)) 48 | 49 | def forward(self, features): 50 | """ 51 | 
Returns: 52 | outputs_feature: torch.Tensor of shape(B x C x t) 53 | """ 54 | if isinstance(features, list): 55 | output_feature = [self.aggregation(feature) for feature in features] 56 | else: 57 | output_feature = self.aggregation(features) 58 | return output_feature 59 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/README.md: -------------------------------------------------------------------------------- 1 | # Configuration File for Training 2 | 3 | You can use a configuration file to train a model using the `train.py` script. Here we provide a description of how to setup the config file. The common options are described in the [common config](common.yaml) file. 4 | 5 | 6 | ```python 7 | python train.py --config path/to/common.yaml --config path/to/model_config.yaml 8 | ``` 9 | The model specific options are described below. In the example above, `model_config.yaml` will overwrite the options in `common.yaml` when options are repeated. For more details check the [Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html#lightning-cli) docs. 10 | 11 | ## 1. Model specific options 12 | In order to use contrastive, VICReg and Uniformity-Alignment, simply change the loss arguments in the config file. Below is the example for the contrastive loss: 13 | 14 | ```yaml 15 | use_contrastive_loss: true # use contrastive loss 16 | temp: 0.2 # temperature for contrastive loss 17 | nr_negative: 250 # number of negative samples for contrastive loss 18 | decouple: true # use decouple contrastive loss or regular NT-Xent loss 19 | use_covariance_reg: false # use covariance regularization 20 | use_variance_reg: false # use variance regularization 21 | use_vicreg_loss: false # use vicreg loss 22 | use_align_loss: false # use alignment loss 23 | use_uniform_loss: false # use uniformity loss 24 | ``` 25 | The individual weights for the losses can be specified as well. BYOL training has its dedicated trainer class and needs to be specified as shown in `byol.yaml`. 26 | 27 | We provide the following configs for the models used in the paper: 28 | 29 | - `byol.yaml` 30 | - `contrastive.yaml` 31 | - `contrastive_vc.yaml` 32 | - `uniformity-alignment.yaml` 33 | - `vicreg.yaml` 34 | 35 | 36 | ## 2. Data Options 37 | In the config file used to launch training (`common.yaml` is this example), specify the datasets to use as follows: 38 | 39 | ```yaml 40 | data: 41 | class_path: singer_id.data.siamese_encoders.SiameseEncodersDataModule # default the dataloader class 42 | init_args: 43 | dataset_dirs: 44 | - '/Path/to/dataset1/dataset1_name' 45 | - '/Path/to/dataset2/dataset2_name' 46 | batch_size: # batch size for training 47 | batch_size_val: # batch size for validation 48 | nr_samples: # number of samples to use for training (default: 176000, ie 4 seconds of audio in 44.1kHz) 49 | normalize: # normalize the audio when loading 50 | num_workers: # number of workers for the dataloader 51 | batch_sampling_mode: # "sample_clips" or "sample groups". Use "sample_clips" for self-supervised COLA loading 52 | eval_frac: # fraction of the dataset to use for validation 53 | group_name_is_folder: 54 | group_by_artist: 55 | multi_epoch: # number of epochs to repeat the dataset to simulate a larger dataset 56 | ``` 57 | 58 | ## 3. Augmentation Options 59 | 60 | The following augmentations are available. 
We use [Audiomentations](https://github.com/iver56/audiomentations) and [Parselmouth](https://github.com/YannickJadoul/Parselmouth) to perform the augmentations. All fields specify the probability of applying the augmentation, except for `pitch_shift_parselmouth`, `pitch_range_parselmouth`. 61 | 62 | ```yaml 63 | augmentations: 64 | "enable": true 65 | "gaussian_noise": 0.5 # min_amplitude=0.001, max_amplitude=0.05 66 | "pitch_shift_naive": 0 # naive pitch shift (using librosa), not used in the paper 67 | "time_stretch": 0 # time stretch, not used in the paper 68 | "gain": 0.5 # min_gain_in_db=-6, max_gain_in_db=0 69 | "shift": 0 # not used in the paper 70 | "parametric_eq": 0 # not used in the paper 71 | "tanh_distortion": 0 # not used in the paper 72 | "time_mask": 0.5 # max_band_part=1/8 73 | "formant_shift_parselmouth": 0 # not used in the paper 74 | "pitch_shift_parselmouth": [1, 1.3] # Pitch shift value on parselmouth 75 | "pitch_range_parselmouth": 1.5 # Pitch range value on parselmouth 76 | "pitch_shift_parselmouth_prob": 0.5 77 | ``` 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/byol.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer_byol.BYOL 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Predictor ------------------ 20 | predictor: 21 | dims: 22 | - 128 23 | - 1024 24 | - 128 25 | use_batchnorm: true 26 | normalize_projections: true 27 | weight_callback: 28 | class_path: singer_identity.callbacks.ma_updates.MAWeightUpdate 29 | init_args: 30 | initial_tau: 0.99 31 | max_epochs: 1000 32 | # ------------------ Optimizer ------------------ 33 | optimizer: 34 | class_path: singer_identity.models.byol.Adam 35 | init_args: 36 | lr: 3e-5 37 | weight_decay: 1.5e-6 38 | scheduler: 39 | class_path: singer_identity.models.byol.LinearWarmupCosineAnnealing 40 | init_args: 41 | warmup_epochs: 10 42 | max_epochs: 1000 43 | 44 | trainer: 45 | # ------------------ Logger ------------------ 46 | logger: 47 | class_path: pytorch_lightning.loggers.TensorBoardLogger 48 | init_args: 49 | save_dir: "logs" 50 | name: "byol" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/common.yaml: -------------------------------------------------------------------------------- 1 | seed_everything: 123 2 | 3 | # ------------------ Datasets ------------------ 4 | data: 5 | class_path: singer_identity.data.siamese_encoders.SiameseEncodersDataModule 6 | init_args: 7 | dataset_dirs: 8 | # - 'PLACE PATH TO DATASET HERE' 9 | # - 'PLACE PATH TO OTHER DATASET HERE IF NEEDED' 10 | 11 | # ------------------ Data loading hyperparameters ------------------ 12 | batch_size: 140 13 | batch_size_val: 140 14 | nr_samples: 176000 # 4s in 44.1kHz 15 | normalize: true 16 | num_workers: 4 17 | batch_sampling_mode: "sample_clips" 18 | eval_frac: 0.2 # Fraction of the dataset to use for validation 19 | verbose: true 20 | 
group_name_is_folder: true 21 | group_by_artist: true 22 | multi_epoch: 1 23 | # ------------------ Augmentations ------------------ 24 | augmentations: 25 | "enable": true 26 | "gaussian_noise": 0.5 27 | "pitch_shift_naive": 0 28 | "time_stretch": 0 29 | "gain": 0.5 30 | "shift": 0 31 | "parametric_eq": 0 32 | "tanh_distortion": 0 33 | "time_mask": 0.5 34 | "formant_shift_parselmouth": 0 35 | "pitch_shift_parselmouth": [1, 1.3] 36 | "pitch_range_parselmouth": 1.5 37 | "pitch_shift_parselmouth_prob": 0.5 38 | 39 | # ------------------ Model ------------------ 40 | model: 41 | class_path: singer_identity.trainer.SSLTrainer # Default trainer class, does not need to change 42 | init_args: 43 | # ------------------ Optimizer ------------------ 44 | optimizer1_init: 45 | class_path: torch.optim.Adam 46 | init_args: 47 | lr: 0.0001 48 | weight_decay: 1e-5 49 | 50 | # ------------------ Feature extractor ------------------ 51 | feature_extractor: 52 | spec_layer: 'melspectogram' 53 | n_fft: 2048 54 | hop_length: 512 55 | 56 | # ------------------ Encoder ------------------ 57 | backbone: 58 | backbone: "efficientnet_b0" 59 | pretrained: true 60 | embedding_dim: 1000 # This is the embedding dimension of the backbone 61 | 62 | # ------------------ Projection ------------------ 63 | projection: 64 | input_dim: 1000 65 | output_dim: 128 # Projection dimension 66 | l2_normalize: true # Whether to normalize the projection vectors 67 | 68 | 69 | 70 | 71 | # ------------------ Training ------------------ 72 | trainer: 73 | max_epochs: 100000 # Maximum number of epochs to train for 74 | max_steps: 1000000000 # Maximum number of steps to train for 75 | accelerator: "gpu" 76 | num_nodes: 1 77 | # ------------------ Logger ------------------ 78 | logger: 79 | class_path: pytorch_lightning.loggers.TensorBoardLogger # Replace with logger of choice 80 | init_args: 81 | save_dir: "logs" 82 | name: "log_name_here" 83 | 84 | # ------------------ Vizualization callbacks ------------------ 85 | callbacks: 86 | 87 | # ------------------ Evaluation callbacks ------------------ 88 | # Evaluation callbacks are used to evaluate the model on the validation set 89 | # and are logged during training. 90 | - class_path: singer_identity.callbacks.evaluation.OrderEvaluation # Rank evaluation 91 | init_args: 92 | log_n_epochs: 5 93 | on_train: true 94 | - class_path: singer_identity.callbacks.evaluation.EEREvaluation # EER 95 | init_args: 96 | log_n_epochs: 5 97 | on_train: false 98 | - class_path: singer_identity.callbacks.evaluation.HypersphereEvaluation # Alignment/uniformity 99 | init_args: 100 | log_n_epochs: 5 101 | on_train: true 102 | 103 | # ------------------ Checkpoint callbacks ------------------ 104 | # Checkpoint callbacks are used to save the model during training. 105 | # Uncomment the ones you want to use. 
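  # (Each `monitor` value below must match the name of a metric that is actually logged
  #  during validation, e.g. "loss/val" or the metrics produced by the evaluation callbacks
  #  above; otherwise the corresponding ModelCheckpoint has nothing to track.)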
106 | - class_path: pytorch_lightning.callbacks.ModelCheckpoint 107 | init_args: 108 | monitor: "loss/val" 109 | mode: "min" 110 | filename: "best-val-loss-{epoch}-{step}" 111 | save_top_k: 1 112 | 113 | - class_path: pytorch_lightning.callbacks.ModelCheckpoint 114 | init_args: 115 | every_n_epochs: 50 116 | save_top_k: -1 117 | filename: "ckpt-{epoch}-{step}" 118 | 119 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 120 | # init_args: 121 | # monitor: "EER evaluation proj/val" 122 | # mode: "min" 123 | # filename: "best-eer-val-{epoch}-{step}" 124 | # save_top_k: 1 125 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 126 | # init_args: 127 | # monitor: "Order evaluation mean proj/val" 128 | # mode: "min" 129 | # filename: "best-rank-val-{epoch}-{step}" 130 | # save_top_k: 1 131 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 132 | # init_args: 133 | # monitor: "Alignment evaluation proj/val" 134 | # mode: "min" 135 | # filename: "best-alignment-val-{epoch}-{step}" 136 | # save_top_k: 1 137 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 138 | # init_args: 139 | # monitor: "Uniformity evaluation proj/val" 140 | # mode: "min" 141 | # filename: "best-uniformity-val-{epoch}-{step}" 142 | # save_top_k: 1 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive-vc.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: true 25 | fact_cov: 100 26 | use_variance_reg: true 27 | fact_var: 25 28 | use_invariance_loss: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | # ------------------ Optimizer ------------------ 33 | optimizer1_init: 34 | class_path: torch.optim.Adam 35 | init_args: 36 | lr: 0.0001 37 | weight_decay: 1e-5 38 | trainer: 39 | # ------------------ Logger ------------------ 40 | logger: 41 | class_path: pytorch_lightning.loggers.TensorBoardLogger 42 | init_args: 43 | save_dir: "logs" 44 | name: "contrastive-vc" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 
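    # (The flags below appear to mirror the arguments of `contrastive_loss` in
    #  singer_identity/losses.py: `temp`, `nr_negative` and `decouple`, where
    #  `decouple: true` drops the positive-pair term from the softmax denominator.)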
19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: false 25 | use_variance_reg: false 26 | use_vicreg_loss: false 27 | use_align_loss: false 28 | use_uniform_loss: false 29 | # ------------------ Optimizer ------------------ 30 | optimizer1_init: 31 | class_path: torch.optim.Adam 32 | init_args: 33 | lr: 0.0001 34 | weight_decay: 1e-5 35 | trainer: 36 | # ------------------ Logger ------------------ 37 | logger: 38 | class_path: pytorch_lightning.loggers.TensorBoardLogger 39 | init_args: 40 | save_dir: "logs" 41 | name: "contrastive" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | 10 | # ------------------ Encoder ------------------ 11 | backbone: 12 | backbone: "efficientnet_b0" 13 | pretrained: true 14 | embedding_dim: 1000 15 | 16 | # ------------------ Projection ------------------ 17 | projection: 18 | input_dim: 1000 19 | output_dim: 128 20 | l2_normalize: true 21 | 22 | # ------------------ Training hyperparameters ------------------ 23 | use_contrastive_loss: true 24 | temp: 0.2 25 | nr_negative: 250 26 | decouple: true 27 | use_covariance_reg: false 28 | use_variance_reg: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | 40 | trainer: 41 | # ------------------ Logger ------------------ 42 | logger: 43 | class_path: pytorch_lightning.loggers.TensorBoardLogger 44 | init_args: 45 | save_dir: "logs" 46 | name: "contrastive" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/uniformity-alignment.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | use_covariance_reg: false 22 | use_variance_reg: false 23 | use_invariance_loss: false 24 | use_align_loss: true 25 | fact_align_loss: 1 26 | use_uniform_loss: true 27 | fact_unif_loss: 1 28 | # ------------------ Optimizer ------------------ 29 | optimizer1_init: 30 | class_path: torch.optim.Adam 31 | init_args: 32 | lr: 0.0001 33 | weight_decay: 1e-5 34 | trainer: 35 | # ------------------ Logger ------------------ 36 | logger: 37 | class_path: pytorch_lightning.loggers.TensorBoardLogger 38 | init_args: 39 | save_dir: "logs" 40 | name: 
"uniformity-alignment" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/vicreg.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | # temp: 0.2 22 | # nr_negative: 250 23 | # decouple: true 24 | use_invariance_loss: true 25 | fact_inv_loss: 25 26 | use_covariance_reg: true 27 | fact_cov: 100 28 | use_variance_reg: true 29 | fact_var: 25 30 | gamma: 1 31 | use_align_loss: false 32 | use_uniform_loss: false 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | trainer: 40 | # ------------------ Logger ------------------ 41 | logger: 42 | class_path: pytorch_lightning.loggers.TensorBoardLogger 43 | init_args: 44 | save_dir: "logs" 45 | name: "vicreg" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/trainer_byol.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | 3 | import torch 4 | import torch.nn as nn 5 | from singer_identity.models.byol import TeacherStudentModel, Optimizer, Scheduler 6 | from singer_identity.model import IdentityEncoder, Projection, SiameseArm, MLP 7 | 8 | 9 | class BYOL(TeacherStudentModel): 10 | def __init__( 11 | self, 12 | # module: nn.Module, 13 | backbone: dict, 14 | projection: dict, 15 | predictor: dict, 16 | weight_callback, 17 | optimizer: Optimizer, 18 | feature_extractor: dict = {}, 19 | loss_fn: nn.Module = torch.nn.MSELoss(), 20 | scheduler: Optional[Scheduler] = None, 21 | normalize_projections: bool = False, 22 | normalize_representations: bool = False, 23 | ): 24 | encoder = IdentityEncoder(feature_extractor=feature_extractor, encoder=backbone) 25 | projection = Projection(**projection) 26 | predictor = MLP(**predictor) 27 | module = SiameseArm( 28 | encoder=encoder, 29 | projector=projection, 30 | predictor=predictor, 31 | normalize_projections=normalize_projections, 32 | normalize_representations=normalize_representations, 33 | ) 34 | 35 | super(BYOL, self).__init__( 36 | module, loss_fn, weight_callback, optimizer, scheduler=scheduler 37 | ) 38 | self.save_hyperparameters(ignore=["module", "loss_fn"]) 39 | 40 | def shared_step(self, batch, step_name: str): 41 | x1 = batch["clip1"] 42 | x2 = batch["clip2"] 43 | 44 | batch_size = x1.shape[0] 45 | 46 | ys, zs, qs = self.student_network(x1) 47 | with torch.no_grad(): 48 | yt, zt, qt = self.teacher_network(x2) 49 | loss_12 = self.loss_fn(qs, zt) 50 | 51 | ys, zs, qs = self.student_network(x2) 52 | with torch.no_grad(): 53 | yt, zt, qt = self.teacher_network(x1) 54 | loss_21 = self.loss_fn(qs, zt) 55 | 56 | loss = (loss_12 + loss_21) / 2 57 | 58 | self.log( 59 | f"loss/{step_name}", 60 | loss, 61 | 
prog_bar=True, 62 | batch_size=batch_size, 63 | ) 64 | 65 | self.record_variables(y1=ys, z1=zs, y2=yt, z2=zt) 66 | 67 | return loss 68 | 69 | def training_step(self, batch, batch_idx): 70 | return self.shared_step(batch, "train") 71 | 72 | def validation_step(self, batch, batch_idx): 73 | return self.shared_step(batch, "val") 74 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from pytorch_lightning.cli import LightningCLI 4 | 5 | 6 | class CLI(LightningCLI): 7 | def add_arguments_to_parser(self, parser): 8 | parser.add_argument("--ckpt_path", default=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | cli = CLI( 13 | model_class=pl.LightningModule, 14 | datamodule_class=pl.LightningDataModule, 15 | subclass_mode_model=True, 16 | subclass_mode_data=True, 17 | save_config_kwargs={"overwrite": True}, 18 | run=False, 19 | ) 20 | 21 | ckpt_path = cli.config["ckpt_path"] 22 | 23 | if ckpt_path is not None: 24 | step = torch.load(ckpt_path, map_location="cpu")["global_step"] 25 | cli.trainer.fit_loop.epoch_loop._batches_that_stepped = step 26 | 27 | cli.trainer.fit(cli.model, cli.datamodule, ckpt_path=ckpt_path) 28 | -------------------------------------------------------------------------------- /models/wavlm/WavLM-Large.pt.txt: -------------------------------------------------------------------------------- 1 | https://github.com/microsoft/unilm/tree/master/wavlm -------------------------------------------------------------------------------- /models/wavlm/__init__.py: -------------------------------------------------------------------------------- 1 | from models.wavlm.WavLM import WavLM, WavLMConfig -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | glob2==0.7 2 | tqdm==4.62.3 3 | librosa==0.10.1 4 | scipy>=1.10.0 5 | tensorboard==2.7.0 6 | webrtcvad==2.0.10 7 | colorlog==6.7.0 8 | hydra-core==1.3.2 9 | hydra_colorlog==1.2.0 10 | omegaconf==2.3.0 11 | pyreaper 12 | protobuf==3.20.0 13 | matplotlib==3.7.1 14 | transformers==4.28.1 15 | # pyreaper @ git+https://github.com/r9y9/pyreaper.git@v0.0.9#egg=pyreaper 16 | huggingface_hub 17 | nnAudio 18 | numpy==1.24 19 | gdown 20 | torchaudio==2.1.2 21 | torch==2.1.2 22 | pytorch_lightning 23 | s3prl 24 | stopit 25 | praat-parselmouth 26 | webrtcvad 27 | asteroid_filterbanks 28 | -------------------------------------------------------------------------------- /resources/freesvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/resources/freesvc.png -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | from models.wavlm import WavLM, WavLMConfig 2 | from models.speaker_encoder.voice_encoder import SpeakerEncoder 3 | from models import SynthesizerTrn 4 | from mel_processing import mel_processing 5 | import utils 6 | import argparse 7 | import glob 8 | import logging 9 | import os 10 | import time 11 | 12 | import librosa 13 | import torch 14 | from scipy.io import wavfile 15 | from scipy.io.wavfile import write 16 | from tqdm import tqdm 17 | 18 | import numpy 
as np 19 | import pyreaper 20 | import torch 21 | 22 | import sys 23 | sys.path.append('..') 24 | 25 | logging.getLogger('numba').setLevel(logging.WARNING) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--hpfile", type=str, default="configs/freevc.yaml", help="path to yaml config file") 32 | parser.add_argument( 33 | "--ptfile", type=str, default="checkpoints/freevc.pth", help="path to pth file") 34 | parser.add_argument("--txt-path", type=str, 35 | default="convert.txt", help="path to txt file") 36 | parser.add_argument("--out-dir", type=str, 37 | default="output/freevc", help="path to output dir") 38 | parser.add_argument("--use-timestamp", default=False, action="store_true") parser.add_argument("--pitch-factor", type=float, default=1.0, help="scaling factor applied to the extracted pitch (1.0 leaves it unchanged)") 39 | args = parser.parse_args() 40 | 41 | os.makedirs(args.out_dir, exist_ok=True) 42 | hps = utils.get_hparams_from_file(args.hpfile) 43 | 44 | print("Loading model...") 45 | net_g = SynthesizerTrn( 46 | hps.data.filter_length // 2 + 1, 47 | hps.train.segment_size // hps.data.hop_length, 48 | **hps.model).cuda() 49 | _ = net_g.eval() 50 | print("Loading checkpoint...") 51 | _ = utils.load_checkpoint(args.ptfile, net_g, None, True) 52 | 53 | print("Loading WavLM for content...") 54 | cmodel = utils.get_cmodel(0) 55 | 56 | if hps.model.use_spk: 57 | print("Loading speaker encoder...") 58 | smodel = SpeakerEncoder( 59 | 'speaker_encoder/ckpt/pretrained_bak_5805000.pt') 60 | 61 | print("Processing text...") 62 | titles, srcs, tgts = [], [], [] 63 | with open(args.txt_path, "r") as f: 64 | for rawline in f.readlines(): 65 | title, src, tgt = rawline.strip().split("|") 66 | titles.append(title) 67 | srcs.append(src) 68 | tgts.append(tgt) 69 | 70 | print("Synthesizing...") 71 | with torch.no_grad(): 72 | for line in tqdm(zip(titles, srcs, tgts)): 73 | title, src, tgt = line 74 | # tgt 75 | wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) 76 | wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) 77 | if hps.model.use_spk: 78 | g_tgt = smodel.embed_utterance(wav_tgt) 79 | g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).cuda() 80 | else: 81 | wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).cuda() 82 | mel_tgt = mel_processing.mel_spectrogram_torch( 83 | wav_tgt, 84 | hps.data.filter_length, 85 | hps.data.n_mel_channels, 86 | hps.data.sampling_rate, 87 | hps.data.hop_length, 88 | hps.data.win_length, 89 | hps.data.mel_fmin, 90 | hps.data.mel_fmax 91 | ) 92 | # src 93 | wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) 94 | wav_src = torch.from_numpy(wav_src).unsqueeze(0).cuda() 95 | # get pitch 96 | sampling_rate, audio = wavfile.read(src) 97 | _, _, _, pitch, _ = pyreaper.reaper(audio, sampling_rate) 98 | pitch = np.clip(pitch, 0, 800) * args.pitch_factor 99 | # interpolate to ensure that pitch and z have the same length 100 | z_len = round(audio.shape[-1] / hps.data.hop_length) 101 | pitch = torch.nn.functional.interpolate(torch.tensor(pitch).unsqueeze(0).unsqueeze( 102 | 0), size=z_len, mode="nearest").squeeze().unsqueeze(0).unsqueeze(0).cuda() 103 | 104 | c = utils.get_content(cmodel, wav_src) 105 | # TODO: explore other interpolation modes 106 | c = torch.nn.functional.interpolate(c, size=z_len, mode="nearest").cuda() 107 | 108 | if hps.model.use_spk: 109 | audio = net_g.infer(c, g=g_tgt) 110 | else: 111 | audio = net_g.infer(c, mel=mel_tgt) 112 | audio = audio[0][0].data.cpu().float().numpy() 113 | if args.use_timestamp: 114 | timestamp = time.strftime("%m-%d_%H-%M", time.localtime()) 115 | write(os.path.join(args.out_dir, "{}.wav".format( 116 | timestamp+"_"+title)), hps.data.sampling_rate, audio) 117 | else: 118 | write(os.path.join(args.out_dir, 119 | f"{title}.wav"), hps.data.sampling_rate, audio) 120 |
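A note on the conversion script above: the WavLM content features, the REAPER pitch contour, and the synthesizer's latent frames all live at different rates, so convert.py resamples both features to z_len = round(num_samples / hop_length) with nearest-neighbour interpolation so that they line up on the same frame grid. A self-contained sketch of that alignment step with dummy tensors and an assumed hop length (in the script itself, c comes from utils.get_content(cmodel, wav_src) and pitch from pyreaper.reaper):

import torch
import torch.nn.functional as F

hop_length = 320                          # assumed value of hps.data.hop_length
num_samples = 53_760                      # example source length in samples
z_len = round(num_samples / hop_length)   # number of latent frames expected by the synthesizer

c = torch.randn(1, 1024, 166)             # stand-in for WavLM content features (B, C, T_c)
pitch = torch.rand(170) * 800             # stand-in for a REAPER pitch contour (T_p,)

# Nearest-neighbour interpolation brings both to exactly z_len frames,
# mirroring the calls in scripts/convert.py.
c_aligned = F.interpolate(c, size=z_len, mode="nearest")
pitch_aligned = F.interpolate(pitch.view(1, 1, -1), size=z_len, mode="nearest")

print(c_aligned.shape, pitch_aligned.shape)   # both now have z_len frames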
-------------------------------------------------------------------------------- /scripts/convert.txt: -------------------------------------------------------------------------------- 1 | title1|DUMMY/p225/p225_001.wav|DUMMY/p226/p226_002.wav 2 | title2|DUMMY/p226/p226_002.wav|DUMMY/p225/p225_001.wav 3 | -------------------------------------------------------------------------------- /scripts/downsample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | from scipy.io import wavfile 7 | from tqdm import tqdm 8 | from glob import glob 9 | 10 | 11 | def process(audio_path): 12 | if os.path.exists(audio_path): 13 | audio, _ = librosa.load(audio_path, sr=args.sample_rate) 14 | audio, _ = librosa.effects.trim(audio, top_db=20) 15 | peak = np.abs(audio).max() 16 | if peak > 1.0: 17 | audio = 0.98 * audio / peak 18 | save_path = audio_path.replace(args.in_dir, args.out_dir) 19 | save_path = save_path.replace('.'+args.in_audio_format, '.'+args.out_audio_format) 20 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 21 | wavfile.write( 22 | save_path, 23 | args.sample_rate, 24 | (audio * np.iinfo(np.int16).max).astype(np.int16) 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("-sr", "--sample-rate", type=int, default=16000, help="sampling rate") 31 | parser.add_argument("-if", "--in-audio-format", type=str, default="wav", help="audio format of input audios") 32 | parser.add_argument("-i", "--in-dir", type=str, default="./data", help="path to source dir") 33 | parser.add_argument("-o", "--out-dir", type=str, default="./", help="path to target dir") 34 | parser.add_argument("-of", "--out-audio-format", type=str, default="wav", help="audio format of output audios") 35 | parser.add_argument("-w", "--num-workers", type=int, default=1, help="number of workers") 36 | args = parser.parse_args() 37 | 38 | filepaths = glob(f'{args.in_dir}/**/*.{args.in_audio_format}', recursive=True) 39 | if args.num_workers == 1: 40 | for filename in tqdm(filepaths): 41 | process(filename) 42 | else: 43 | pool = Pool(processes=args.num_workers) 44 | for _ in tqdm(pool.imap_unordered(process, filepaths)): 45 | pass 46 | -------------------------------------------------------------------------------- /scripts/prepare_nus_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_nus" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the dataset_nus is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the dataset_nus 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd ..
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to extract features 53 | function extract_features() { 54 | python3 scripts/preprocess_spk.py \ 55 | --in-dir $DATASET_DIR_NAME/16k \ 56 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 57 | --num-workers 8 58 | 59 | python3 scripts/preprocess_content.py \ 60 | --in-dir $DATASET_DIR_NAME/16k \ 61 | --out-dir $DATASET_DIR_NAME/ssl_features 62 | 63 | python3 scripts/preprocess_sr.py \ 64 | --in-dir $DATASET_DIR_NAME/16k \ 65 | --wav-dir $DATASET_DIR_NAME/sr \ 66 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 67 | --num-workers 1 68 | 69 | python3 scripts/preprocess_pitch.py \ 70 | --in-dir $DATASET_DIR_NAME/16k \ 71 | --out-dir $DATASET_DIR_NAME/pitch_features \ 72 | --num-workers 1 73 | 74 | } 75 | 76 | echo "STEP 1" 77 | downloadNus 78 | echo "STEP 2" 79 | downsample 80 | echo "STEP 3" 81 | create_splits 82 | echo "STEP 4" 83 | extract_features 84 | echo "DONE" 85 | rm -rf $DATASET_DIR_NAME/raw 86 | echo "" > $DATASET_DIR_NAME/DONE 87 | 88 | set +x 89 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" 90 | echo "NOTE: the audios were not cut in small chunks. You might want to do that before training (see segment_vad.py)." 91 | -------------------------------------------------------------------------------- /scripts/prepare_nus_dataset_vad.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_nus" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd .. 
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k-segmented \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to segment audios 53 | function segment() { 54 | python3 scripts/segment_vad.py \ 55 | --dir $DATASET_DIR_NAME/16k \ 56 | --out-dir ./$DATASET_DIR_NAME/16k-segmented 57 | } 58 | 59 | # Function to extract features 60 | function extract_features() { 61 | python3 scripts/preprocess_spk.py \ 62 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 63 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 64 | --num-workers 8 65 | 66 | python3 scripts/preprocess_content.py \ 67 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 68 | --out-dir $DATASET_DIR_NAME/ssl_features 69 | 70 | python3 scripts/preprocess_sr.py \ 71 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 72 | --wav-dir $DATASET_DIR_NAME/sr \ 73 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 74 | --num-workers 1 75 | 76 | python3 scripts/preprocess_pitch.py \ 77 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 78 | --out-dir $DATASET_DIR_NAME/pitch_features \ 79 | --num-workers 1 80 | 81 | } 82 | 83 | echo "STEP 1" 84 | downloadNus 85 | echo "STEP 2" 86 | downsample 87 | echo "STEP 3" 88 | segment 89 | echo "STEP 4" 90 | create_splits 91 | echo "STEP 5" 92 | extract_features 93 | echo "DONE" 94 | rm -rf $DATASET_DIR_NAME/raw 95 | echo "" > $DATASET_DIR_NAME/DONE 96 | 97 | set +x 98 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'Data'" -------------------------------------------------------------------------------- /scripts/prepare_pop_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the PopBuTFy dataset and prepares it for training. 2 | 3 | echo "This dataset has issues with some audio files." 4 | 5 | DATASET_DIR_NAME="dataset_pop" 6 | mkdir -p $DATASET_DIR_NAME 7 | 8 | # Check if the dataset_pop is already processed 9 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 10 | echo "$DATASET_DIR_NAME already processed" 11 | exit 0 12 | fi 13 | 14 | set -e 15 | set -x 16 | 17 | # Function to download the dataset_pop 18 | function downloadPopBuTFy() { 19 | # Check if download is needed 20 | if [ -f "PopBuTFy.zip" ]; then 21 | echo "Dataset already downloaded" 22 | else 23 | echo "Downloading dataset" 24 | gdown 1WQOTrQDVgBeULUWMtBCAhWmiy2fe3hhh 25 | fi 26 | mv PopBuTFy.zip $DATASET_DIR_NAME/ 27 | cd $DATASET_DIR_NAME/ 28 | unzip PopBuTFy.zip && rm PopBuTFy.zip 29 | cd .. 30 | } 31 | 32 | # Function to create spk dirs 33 | function create_spk_dirs() { 34 | cd $DATASET_DIR_NAME/data/ 35 | set +e 36 | for i in {10..18}; do 37 | mkdir Female${i} 38 | mv "Female${i}#"* Female${i}/ 39 | done 40 | 41 | for i in {1..9}; do 42 | mkdir Female${i} 43 | mv "Female${i}#"* Female${i}/ 44 | done 45 | 46 | for i in {1..6}; do 47 | mkdir Male${i} 48 | mv "Male${i}#"* Male${i}/ 49 | done 50 | set -e 51 | cd ../.. 
52 | } 53 | 54 | # Function to downsample audios 55 | function downsample() { 56 | python3 scripts/downsample.py \ 57 | --in-audio-format mp3 \ 58 | --in-dir $DATASET_DIR_NAME/data \ 59 | --out-dir $DATASET_DIR_NAME/16k \ 60 | --sample-rate 16000 \ 61 | --num-workers 8 62 | } 63 | 64 | # Function to create train and test splits 65 | function create_splits() { 66 | python3 scripts/preprocess_flist.py \ 67 | --source-dir $DATASET_DIR_NAME/16k \ 68 | --train-list $DATASET_DIR_NAME/train.csv \ 69 | --val-list $DATASET_DIR_NAME/val.csv \ 70 | --test-list $DATASET_DIR_NAME/test.csv \ 71 | --seed 1 72 | } 73 | 74 | # Function to extract features 75 | function extract_features() { 76 | python3 scripts/preprocess_spk.py \ 77 | --in-dir $DATASET_DIR_NAME/16k \ 78 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 79 | --num-workers 8 80 | 81 | python3 scripts/preprocess_content.py \ 82 | --in-dir $DATASET_DIR_NAME/16k \ 83 | --out-dir $DATASET_DIR_NAME/ssl_features 84 | 85 | python3 scripts/preprocess_sr.py \ 86 | --in-dir $DATASET_DIR_NAME/16k \ 87 | --wav-dir $DATASET_DIR_NAME/sr \ 88 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 89 | --num-workers 4 90 | 91 | python3 scripts/preprocess_pitch.py \ 92 | --in-dir $DATASET_DIR_NAME/16k \ 93 | --out-dir $DATASET_DIR_NAME/pitch_features \ 94 | --num-workers 1 95 | 96 | } 97 | 98 | echo "STEP 1" 99 | downloadPopBuTFy 100 | echo "STEP 2" 101 | create_spk_dirs 102 | echo "STEP 3" 103 | downsample 104 | echo "STEP 4" 105 | create_splits 106 | echo "STEP 5" 107 | extract_features 108 | echo "DONE" 109 | rm -rf dataset_pop/data 110 | echo "" > $DATASET_DIR_NAME/DONE 111 | 112 | set +x 113 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" -------------------------------------------------------------------------------- /scripts/prepare_vctk_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_vctk" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the dataset_nus is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the dataset_nus 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd .. 
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to extract features 53 | function extract_features() { 54 | #python3 scripts/preprocess_spk.py \ 55 | # --in-dir $DATASET_DIR_NAME/16k \ 56 | # --out-dir $DATASET_DIR_NAME/spk_embeddings \ 57 | # --num-workers 8 58 | 59 | #python3 scripts/preprocess_content.py \ 60 | # --in-dir $DATASET_DIR_NAME/16k \ 61 | # --out-dir $DATASET_DIR_NAME/ssl_features 62 | 63 | # python3 scripts/preprocess_sr.py \ 64 | # --in-dir $DATASET_DIR_NAME/16k \ 65 | # --wav-dir $DATASET_DIR_NAME/sr \ 66 | # --ssl-dir $DATASET_DIR_NAME/ssl_features \ 67 | # --num-workers 1 68 | 69 | python3 scripts/preprocess_pitch.py \ 70 | --in-dir $DATASET_DIR_NAME/16k \ 71 | --out-dir $DATASET_DIR_NAME/pitch_features \ 72 | --num-workers 1 73 | 74 | } 75 | 76 | echo "STEP 1" 77 | #downloadNus 78 | echo "STEP 2" 79 | #downsample 80 | echo "STEP 3" 81 | #create_splits 82 | echo "STEP 4" 83 | extract_features 84 | echo "DONE" 85 | rm -rf $DATASET_DIR_NAME/raw 86 | echo "" > $DATASET_DIR_NAME/DONE 87 | 88 | set +x 89 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" 90 | echo "NOTE: the audios were not cut in small chunks. You might want to do that before training (see segment_vad.py)." 
91 | -------------------------------------------------------------------------------- /scripts/preprocess_content.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from glob import glob 4 | 5 | import librosa 6 | import torch 7 | from tqdm import tqdm 8 | 9 | import sys 10 | sys.path.append(os.path.dirname('..')) 11 | 12 | import utils 13 | from models.wavlm import WavLM, WavLMConfig 14 | 15 | 16 | def extract_and_save_content_features(audio_path, out_dir, sampling_rate=16000): 17 | os.makedirs(os.path.dirname(audio_path), exist_ok=True) 18 | utt_id = os.path.basename(audio_path).rstrip(".wav") 19 | save_filepath = os.path.join(out_dir, f"{utt_id}.pt") 20 | if os.path.isfile(save_filepath): 21 | print("Igored because it is already computed: ", save_filepath) 22 | else: 23 | wav, _ = librosa.load(audio_path, sr=sampling_rate) 24 | wav = torch.from_numpy(wav).unsqueeze(0).cuda() 25 | c = utils.get_content(cmodel, wav) 26 | torch.save(c.cpu(), save_filepath) 27 | 28 | if __name__ == "__main__": 29 | torch.multiprocessing.set_start_method('spawn') 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--sr", type=int, default=16000, help="sampling rate") 32 | parser.add_argument("--in-dir", type=str, default="data", help="path to input dir") 33 | parser.add_argument("--out-dir", type=str, default="data/content_features", help="path to output dir") 34 | parser.add_argument("--checkpoint", type=str, default="./models/wavlm/WavLM-Large.pt", help="path to checkpoint") 35 | args = parser.parse_args() 36 | 37 | os.makedirs(args.out_dir, exist_ok=True) 38 | 39 | print("Loading WavLM for content...") 40 | checkpoint = torch.load(args.checkpoint) 41 | cfg = WavLMConfig(checkpoint['cfg']) 42 | cmodel = WavLM(cfg).cuda() 43 | cmodel.load_state_dict(checkpoint['model']) 44 | cmodel.eval() 45 | print("Loaded WavLM.") 46 | 47 | sub_folder_list = os.listdir(args.in_dir) 48 | sub_folder_list.sort() 49 | for spk in sub_folder_list: 50 | print("Preprocessing speaker {} ...".format(spk)) 51 | in_dir = os.path.join(args.in_dir, spk) 52 | if not os.path.isdir(in_dir): 53 | continue 54 | 55 | filepaths = glob(f'{in_dir}/**/*.wav', recursive=True) 56 | 57 | for filepath in tqdm(filepaths): 58 | spk_out_dir = os.path.join(args.out_dir, spk) 59 | os.makedirs(spk_out_dir, exist_ok=True) 60 | extract_and_save_content_features(filepath, spk_out_dir, sampling_rate=args.sr) 61 | -------------------------------------------------------------------------------- /scripts/preprocess_flist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | from tqdm import tqdm 5 | from random import shuffle 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--source-dir", type=str, default="./dataset/", help="path to source dir") 11 | parser.add_argument("--seed", type=int, default=None, help="random seed") 12 | parser.add_argument("--all-list", type=str, default="./dataset/all.csv", help="path to all list") 13 | parser.add_argument("--train-list", default="", help="path to train list") 14 | parser.add_argument("--val-list", default="", help="path to val list") 15 | parser.add_argument("--test-list", default="", help="path to test list") 16 | args = parser.parse_args() 17 | 18 | if args.seed is not None: 19 | random.seed(args.seed) 20 | 21 | train = [] 22 | val = [] 23 | test = [] 24 | idx = 0 25 | 26 | data = [] 27 | for 
language in os.listdir(args.source_dir): 28 | for speaker in tqdm(os.listdir(os.path.join(args.source_dir, language))): 29 | for root, dirs, files in os.walk(os.path.join(args.source_dir, language, speaker)): 30 | for file in files: 31 | if file.endswith(".wav"): 32 | data.append((os.path.join(root, file), language, speaker)) 33 | 34 | shuffle(data) 35 | 36 | print("Writing", args.all_list) 37 | with open(args.all_list, "w") as f: 38 | for wavpath, language, speaker in tqdm(data): 39 | print(wavpath, language, speaker, sep="|", file=f) 40 | 41 | val += data[:int(len(data) * 0.01)] 42 | test += data[int(len(data) * 0.01):int(len(data) * 0.02)] 43 | train += data[int(len(data) * 0.02):] 44 | 45 | shuffle(train) 46 | shuffle(val) 47 | shuffle(test) 48 | 49 | if args.train_list != "": 50 | print("Writing", args.train_list) 51 | with open(args.train_list, "w") as f: 52 | for wavpath, language, speaker in tqdm(train): 53 | print(wavpath, language, speaker, sep="|", file=f) 54 | 55 | if args.val_list != "": 56 | print("Writing", args.val_list) 57 | with open(args.val_list, "w") as f: 58 | for wavpath, language, speaker in tqdm(val): 59 | print(wavpath, language, speaker, sep="|", file=f) 60 | 61 | if args.test_list != "": 62 | print("Writing", args.test_list) 63 | with open(args.test_list, "w") as f: 64 | for wavpath, language, speaker in tqdm(test): 65 | print(wavpath, language, speaker, sep="|", file=f) 66 | -------------------------------------------------------------------------------- /scripts/preprocess_pitch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | import random 6 | from glob import glob 7 | from tqdm import tqdm 8 | from scipy.io import wavfile 9 | import concurrent.futures 10 | 11 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 12 | from models.f0_predictor import get_f0_predictor 13 | 14 | def extract_pitch(pitch_predictor, input_path, output_path, skip_existing=False): 15 | if skip_existing and os.path.exists(output_path): 16 | return 17 | pitch = pitch_predictor.compute_f0(wavfile.read(input_path)[1]) 18 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 19 | if type(pitch) is tuple: 20 | print(f"Pitch feature computation might have failed for {input_path}") 21 | pitch = pitch[0] 22 | torch.save(torch.tensor(pitch), output_path) 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--in-dir", type=str, default="data/train", help="path to input dir") 27 | parser.add_argument("--pitch-predictor", type=str, default="rmvpe") 28 | parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu") 29 | parser.add_argument("--sampling-rate", type=int, default=24000) 30 | parser.add_argument("--hop-length", type=int, default=320) 31 | parser.add_argument('--num-workers', type=int, default=1) 32 | parser.add_argument("--skip-existing", action="store_true", help="skip existing pitch files") 33 | parser.add_argument("--out-dir", type=str, default="data/pitch_features/train", help="path to output dir") 34 | args = parser.parse_args() 35 | 36 | if args.device == "cuda" and args.num_workers > 1: 37 | print("Warning: Multiprocessing with CUDA is not supported. 
Setting num_workers to 1.") 38 | args.num_workers = 1 39 | 40 | pitch_predictor = get_f0_predictor( 41 | args.pitch_predictor, 42 | sampling_rate=args.sampling_rate, 43 | hop_length=args.hop_length, 44 | device=args.device, 45 | threshold=0.05 46 | ) 47 | 48 | file_paths = glob(f'{args.in_dir}/**/*.wav', recursive=True) 49 | random.shuffle(file_paths) 50 | 51 | if args.num_workers > 1: 52 | with concurrent.futures.ProcessPoolExecutor(args.num_workers) as \ 53 | executor: 54 | futures = [executor.submit(extract_pitch, pitch_predictor, file_path, file_path.replace(args.in_dir, args.out_dir).replace(".wav", "_pitch.pt"), skip_existing=args.skip_existing) for file_path in file_paths] 55 | for f in tqdm(concurrent.futures.as_completed(futures)): 56 | if f.exception() is not None: 57 | print(f.exception()) 58 | else: 59 | for file_path in tqdm(file_paths): 60 | output_path = file_path.replace(args.in_dir, args.out_dir).replace(".wav", "_pitch.pt") 61 | extract_pitch(pitch_predictor, file_path, output_path, args.skip_existing) -------------------------------------------------------------------------------- /scripts/preprocess_spk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from concurrent.futures import ProcessPoolExecutor 5 | from functools import partial 6 | from multiprocessing import cpu_count 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from tqdm import tqdm 11 | 12 | import sys 13 | sys.path.append(os.path.dirname('..')) 14 | 15 | from models.speaker_encoder.voice_encoder import SpeakerEncoder 16 | from models.speaker_encoder.audio import preprocess_wav 17 | 18 | 19 | def build_from_path(in_dir, out_dir, weights_fpath, num_workers=1): 20 | executor = ProcessPoolExecutor(max_workers=num_workers) 21 | futures = [] 22 | wavfile_paths = glob.glob(in_dir + '/**/*.wav', recursive=True) 23 | wavfile_paths = sorted(wavfile_paths) 24 | print("Number of wav files: ", len(wavfile_paths)) 25 | if num_workers > 1: 26 | for wav_path in wavfile_paths: 27 | futures.append(executor.submit( 28 | partial(_compute_spkEmbed, out_dir, wav_path, weights_fpath))) 29 | return [future.result() for future in tqdm(futures)] 30 | else: 31 | for wav_path in wavfile_paths: 32 | _compute_spkEmbed(out_dir, wav_path, weights_fpath) 33 | 34 | def _compute_spkEmbed(out_dir, wav_path, weights_fpath): 35 | utt_id = os.path.splitext(os.path.basename(wav_path))[0] 36 | fname_save = os.path.join(out_dir, f"{utt_id}.npy") 37 | if os.path.isfile(fname_save): 38 | print("Ignored because it is already computed: ", fname_save) 39 | return os.path.basename(fname_save) 40 | fpath = Path(wav_path) 41 | wav = preprocess_wav(fpath) 42 | 43 | encoder = SpeakerEncoder(weights_fpath) 44 | embed = encoder.embed_utterance(wav) 45 | np.save(fname_save, embed, allow_pickle=False) 46 | return os.path.basename(fname_save) 47 | 48 | 49 | def preprocess(in_dir, out_dir, spk, weights_fpath, num_workers): 50 | out_dir = os.path.join(out_dir, spk) 51 | os.makedirs(out_dir, exist_ok=True) 52 | metadata = build_from_path(in_dir, out_dir, weights_fpath, num_workers) 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--in-dir', type=str, 58 | default='dataset') 59 | parser.add_argument('--num-workers', type=int, default=8) 60 | parser.add_argument('--out-dir', type=str, 61 | default='dataset/spk_embeddings') 62 | parser.add_argument('--spk-encoder-ckpt', type=str, 63 |
default='models/speaker_encoder/ckpt/pretrained_bak_5805000.pt') 64 | 65 | args = parser.parse_args() 66 | 67 | sub_folder_list = os.listdir(args.in_dir) 68 | sub_folder_list.sort() 69 | 70 | args.num_workers = args.num_workers if args.num_workers is not None else cpu_count() 71 | print("Number of workers: ", args.num_workers) 72 | ckpt_step = os.path.basename(args.spk_encoder_ckpt).split('.')[0].split('_')[-1] 73 | spk_embed_out_dir = args.out_dir 74 | print("[INFO] spk_embed_out_dir: ", spk_embed_out_dir) 75 | os.makedirs(spk_embed_out_dir, exist_ok=True) 76 | 77 | for spk in sub_folder_list: 78 | print("Preprocessing {} ...".format(spk)) 79 | in_dir = os.path.join(args.in_dir, spk) 80 | if not os.path.isdir(in_dir): 81 | continue 82 | preprocess(in_dir, spk_embed_out_dir, spk, 83 | args.spk_encoder_ckpt, args.num_workers) 84 | 85 | print("DONE!") 86 | sys.exit(0) 87 | -------------------------------------------------------------------------------- /scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | EXPERIMENT_PATH="/raid/alefiury/svc/free-svc/logs/config-online-language-emb.yaml/2024-02-23/10-11-19" 3 | INPUT_BASE_DIR="/raid/lucasgris/free-svc" 4 | 5 | HPFILE=$EXPERIMENT_PATH"/.hydra/config.yaml" 6 | PTFILE=$EXPERIMENT_PATH"/G_00012_0200000.pth" 7 | METADATA_PATH="/raid/lucasgris/free-svc/data/in_domain_transcriptions_weighted_spks.csv" 8 | IGNORE_METADATA_HEADER=true 9 | SPK_EMB_BASE_DIR="/raid/lucasgris/free-svc/data/spk_embeddings" 10 | PITCH_PREDICTOR="rmvpe" 11 | OUT_DIR=$EXPERIMENT_PATH"/audios" 12 | USE_TIMESTAMP=false 13 | CONCAT_AUDIO=false 14 | PITCH_FACTOR=0.9544 15 | 16 | python3 scripts/inference.py \ 17 | --hpfile=$HPFILE \ 18 | --ptfile=$PTFILE \ 19 | --input-base-dir=$INPUT_BASE_DIR \ 20 | --metadata-path=$METADATA_PATH \ 21 | --ignore-metadata-header=$IGNORE_METADATA_HEADER \ 22 | --spk-emb-base-dir=$SPK_EMB_BASE_DIR \ 23 | --pitch-predictor=$PITCH_PREDICTOR \ 24 | --out-dir=$OUT_DIR \ 25 | --pitch-factor=$PITCH_FACTOR -------------------------------------------------------------------------------- /scripts/run_inference_parallel.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | EXPERIMENT_PATH="/raid/alefiury/svc/free-svc/logs/config-online-language-emb.yaml/2024-02-23/10-11-19" 3 | INPUT_BASE_DIR="/raid/lucasgris/free-svc" 4 | 5 | HPFILE=$EXPERIMENT_PATH"/.hydra/config.yaml" 6 | PTFILE=$EXPERIMENT_PATH"/G_00012_0200000.pth" 7 | METADATA_PATH="/raid/lucasgris/free-svc/data/in_domain_transcriptions_weighted_spks.csv" 8 | IGNORE_METADATA_HEADER=true 9 | SPK_EMB_BASE_DIR="/raid/lucasgris/free-svc/data/spk_embeddings" 10 | PITCH_PREDICTOR="rmvpe" 11 | OUT_DIR=$EXPERIMENT_PATH"/audios" 12 | USE_TIMESTAMP=false 13 | CONCAT_AUDIO=false 14 | PITCH_FACTOR=0.9544 15 | 16 | NUM_WORKERS=4 17 | 18 | CUDA_LAUNCH_BLOCKING=1 python3 scripts/inference_parallel.py \ 19 | --hpfile=$HPFILE \ 20 | --ptfile=$PTFILE \ 21 | --input-base-dir=$INPUT_BASE_DIR \ 22 | --metadata-path=$METADATA_PATH \ 23 | --ignore-metadata-header=$IGNORE_METADATA_HEADER \ 24 | --spk-emb-base-dir=$SPK_EMB_BASE_DIR \ 25 | --pitch-predictor=$PITCH_PREDICTOR \ 26 | --out-dir=$OUT_DIR \ 27 | --pitch-factor=$PITCH_FACTOR \ 28 | --num-workers=$NUM_WORKERS --------------------------------------------------------------------------------
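A few illustrative sketches follow for the self-supervised objectives configured earlier in singer_identity/train_configs. The contrastive*.yaml files expose use_contrastive_loss, temp, nr_negative and decouple; as a rough, runnable sketch of how such flags typically map onto a decoupled InfoNCE loss (the hardest-negative subsampling shown here is an assumption — the repo's own implementation lives in singer_identity/losses.py and may differ):

import torch
import torch.nn.functional as F

def decoupled_nt_xent(z1, z2, temp=0.2, nr_negative=250, decouple=True):
    # z1, z2: (B, D) projections of two augmented views of the same singers.
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    sim = z1 @ z2.t() / temp                     # (B, B) scaled cosine similarities
    pos = sim.diag()                             # positives: same clip, two views
    neg = sim.masked_fill(torch.eye(len(z1), dtype=torch.bool), float("-inf"))
    k = min(nr_negative, neg.size(1) - 1)        # keep at most nr_negative negatives per anchor
    neg = neg.topk(k, dim=1).values
    if decouple:
        denom = torch.logsumexp(neg, dim=1)      # decoupled: positive excluded from the denominator
    else:
        denom = torch.logsumexp(torch.cat([pos[:, None], neg], dim=1), dim=1)
    return (denom - pos).mean()

print(float(decoupled_nt_xent(torch.randn(8, 128), torch.randn(8, 128))))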
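Similarly, vicreg.yaml toggles use_invariance_loss, use_variance_reg and use_covariance_reg with weights fact_inv_loss: 25, fact_var: 25, fact_cov: 100 and gamma: 1. A minimal sketch of the standard VICReg terms those names refer to, using the config's weights as defaults (the exact formulation in singer_identity/losses.py may differ in details):

import torch
import torch.nn.functional as F

def vicreg_terms(z1, z2, fact_inv=25.0, fact_var=25.0, fact_cov=100.0, gamma=1.0):
    b, d = z1.shape
    inv = F.mse_loss(z1, z2)                      # invariance between the two views

    def variance(z):
        std = torch.sqrt(z.var(dim=0) + 1e-4)
        return torch.relu(gamma - std).mean()     # hinge keeps each dimension's std above gamma

    def covariance(z):
        z = z - z.mean(dim=0)
        cov = (z.t() @ z) / (b - 1)
        off_diag = cov - torch.diag(torch.diag(cov))
        return off_diag.pow(2).sum() / d          # penalise off-diagonal covariance

    var = variance(z1) + variance(z2)
    cov = covariance(z1) + covariance(z2)
    return fact_inv * inv + fact_var * var + fact_cov * cov

print(float(vicreg_terms(torch.randn(16, 128), torch.randn(16, 128))))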
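uniformity-alignment.yaml and the HypersphereEvaluation callback refer to the alignment and uniformity measures of Wang & Isola; a compact sketch, assuming L2-normalised projections and the unit weights fact_align_loss: 1 / fact_unif_loss: 1 from the config:

import torch

def align_loss(x, y, alpha=2):
    # expects L2-normalised embeddings of positive pairs
    return (x - y).norm(p=2, dim=1).pow(alpha).mean()

def uniform_loss(x, t=2):
    # log of the average pairwise Gaussian potential on the hypersphere
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()

x = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)
y = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)
loss = 1.0 * align_loss(x, y) + 1.0 * uniform_loss(x)   # fact_align_loss / fact_unif_loss
print(float(loss))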
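Finally, the EEREvaluation callback configured in the training config above logs an equal error rate over verification pairs (see metadata/*/speaker_pairs.txt). A small reference sketch of that metric on toy similarity scores (the callback's actual implementation may compute it differently):

import numpy as np

def equal_error_rate(scores, labels):
    # scores: similarity per pair; labels: 1 for same-singer pairs, 0 for different-singer pairs.
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels, dtype=int)
    pos, neg = scores[labels == 1], scores[labels == 0]
    eer, best_gap = 1.0, np.inf
    for thr in np.unique(scores):
        far = float(np.mean(neg >= thr))   # false acceptance rate
        frr = float(np.mean(pos < thr))    # false rejection rate
        if abs(far - frr) < best_gap:      # EER is where the two rates cross
            best_gap, eer = abs(far - frr), (far + frr) / 2
    return eer

scores = [0.9, 0.8, 0.75, 0.4, 0.3, 0.2]
labels = [1, 1, 0, 1, 0, 0]
print(equal_error_rate(scores, labels))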