├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── configs ├── common.yaml ├── config-online-language-emb.yaml ├── config-online-spin-language-emb.yaml ├── config-online-spin.yaml ├── config-online.yaml └── config.yaml ├── data_utils.py ├── features.py ├── losses.py ├── mel_processing.py ├── models ├── __init__.py ├── clova │ ├── SpeakerNet.py │ └── models │ │ ├── RawNet3.py │ │ ├── RawNetBasicBlock.py │ │ ├── ResNetBlocks.py │ │ ├── ResNetSE34L.py │ │ ├── ResNetSE34V2.py │ │ ├── VGGVox.py │ │ ├── byol.py │ │ ├── ssl_singer_identity │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── convert_checkpoint.py │ │ ├── environment.yml │ │ ├── eval.py │ │ ├── metadata │ │ │ ├── img │ │ │ │ ├── byol.png │ │ │ │ ├── full_diagram.png │ │ │ │ ├── isolated.png │ │ │ │ ├── pipeline.png │ │ │ │ └── techniques_.png │ │ │ ├── m4singer_renamed_split_4s │ │ │ │ └── speaker_pairs.txt │ │ │ └── vocalset_renamed_split_4s │ │ │ │ └── speaker_pairs.txt │ │ ├── preprocess │ │ │ ├── create_speaker_pairs.py │ │ │ └── preprocess_dataset.py │ │ ├── singer_identity │ │ │ ├── __init__.py │ │ │ ├── callbacks │ │ │ │ ├── evaluation.py │ │ │ │ └── ma_updates.py │ │ │ ├── losses.py │ │ │ ├── model.py │ │ │ ├── models │ │ │ │ ├── byol.py │ │ │ │ └── network_components.py │ │ │ ├── train_configs │ │ │ │ ├── README.md │ │ │ │ ├── byol.yaml │ │ │ │ ├── common.yaml │ │ │ │ ├── contrastive-vc.yaml │ │ │ │ ├── contrastive.yaml │ │ │ │ ├── contrastive_test.yaml │ │ │ │ ├── uniformity-alignment.yaml │ │ │ │ └── vicreg.yaml │ │ │ ├── trainer.py │ │ │ ├── trainer_byol.py │ │ │ └── utils │ │ │ │ ├── core.py │ │ │ │ ├── data_utils.py │ │ │ │ └── fetch_pretrained.py │ │ └── train.py │ │ └── weights │ │ └── RawNet3 │ │ ├── .gitattributes │ │ └── README.md ├── commons.py ├── content_extractors.py ├── f0_predictor │ ├── CrepeF0Predictor.py │ ├── DioF0Predictor.py │ ├── F0Predictor.py │ ├── FCPEF0Predictor.py │ ├── HarvestF0Predictor.py │ ├── PMF0Predictor.py │ ├── RMVPEF0Predictor.py │ ├── __init__.py │ ├── crepe.py │ ├── fcpe │ │ ├── __init__.py │ │ ├── model.py │ │ ├── nvSTFT.py │ │ └── pcmer.py │ └── rmvpe │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── deepunet.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── seq.py │ │ ├── spec.py │ │ └── utils.py ├── hifigan │ ├── __init__.py │ ├── config.json │ ├── generator_v1.txt │ └── models.py ├── models.py ├── modules.py ├── so_vits_svc.py ├── speaker_encoder │ ├── __init__.py │ ├── audio.py │ ├── compute_embed.py │ ├── config.py │ ├── hparams.py │ ├── inference.py │ ├── model.py │ ├── params_data.py │ ├── params_model.py │ ├── preprocess.py │ ├── train.py │ ├── visualizations.py │ └── voice_encoder.py ├── speaker_encoders.py ├── spin │ ├── __init__.py │ ├── spin.yaml │ └── src │ │ ├── data │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── dataset.py │ │ ├── librispeech.py │ │ └── sampler.py │ │ ├── model │ │ ├── __init__.py │ │ ├── base.py │ │ └── spin.py │ │ ├── nn │ │ ├── __init__.py │ │ ├── dnn.py │ │ ├── hubert.py │ │ ├── swav_vq_dis.py │ │ └── wavlm.py │ │ ├── task │ │ ├── __init__.py │ │ └── train_spin.py │ │ └── util │ │ ├── __init__.py │ │ ├── log.py │ │ ├── model_utils.py │ │ ├── padding.py │ │ ├── pnmi.py │ │ └── scheduler.py ├── ssl_singer_identity │ ├── LICENSE │ ├── README.md │ ├── environment.yml │ ├── eval.py │ ├── metadata │ │ ├── img │ │ │ ├── byol.png │ │ │ └── pipeline.png │ │ ├── m4singer_renamed_split_4s │ │ │ └── speaker_pairs.txt │ │ └── vocalset_renamed_split_4s │ │ │ └── speaker_pairs.txt │ ├── preprocess │ │ ├── create_speaker_pairs.py │ │ └── 
preprocess_dataset.py │ ├── singer_identity │ │ ├── __init__.py │ │ ├── callbacks │ │ │ ├── evaluation.py │ │ │ └── ma_updates.py │ │ ├── losses.py │ │ ├── model.py │ │ ├── models │ │ │ ├── byol.py │ │ │ └── network_components.py │ │ ├── train_configs │ │ │ ├── README.md │ │ │ ├── byol.yaml │ │ │ ├── common.yaml │ │ │ ├── contrastive-vc.yaml │ │ │ ├── contrastive.yaml │ │ │ ├── contrastive_test.yaml │ │ │ ├── uniformity-alignment.yaml │ │ │ └── vicreg.yaml │ │ ├── trainer.py │ │ ├── trainer_byol.py │ │ └── utils │ │ │ ├── core.py │ │ │ ├── data_utils.py │ │ │ └── fetch_pretrained.py │ └── train.py └── wavlm │ ├── WavLM-Large.pt.txt │ ├── WavLM.py │ ├── __init__.py │ └── modules.py ├── requirements.txt ├── resources └── freesvc.png ├── scripts ├── convert.py ├── convert.txt ├── convert_dir_vad.py ├── downsample.py ├── inference.py ├── inference_parallel.py ├── inference_sample.py ├── prepare_nus_dataset.sh ├── prepare_nus_dataset_vad.sh ├── prepare_pop_dataset.sh ├── prepare_vctk_dataset.sh ├── preprocess_content.py ├── preprocess_flist.py ├── preprocess_pitch.py ├── preprocess_spk.py ├── preprocess_sr.py ├── run_inference.sh ├── run_inference_parallel.sh └── segment_vad.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Checkpoints 2 | **/ckpt/ 3 | 4 | # Dataset 5 | dataset 6 | 7 | # Model weights 8 | WavLM-Large.pt 9 | generator_v1 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 
105 | #Pipfile.lock 106 | 107 | # poetry 108 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 109 | # This is especially recommended for binary packages to ensure reproducibility, and is more 110 | # commonly ignored for libraries. 111 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 112 | #poetry.lock 113 | 114 | # pdm 115 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 116 | #pdm.lock 117 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 118 | # in version control. 119 | # https://pdm.fming.dev/#use-with-ide 120 | .pdm.toml 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | 172 | /data/ 173 | logs/ 174 | spin.ckpt -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime 2 | 3 | FROM nvcr.io/nvidia/pytorch:23.12-py3 4 | RUN apt update && \ 5 | apt -y install git libsndfile1-dev ffmpeg 6 | 7 | # RUN python3 -m pip install --upgrade pip 8 | 9 | # RUN python3 -m pip install torchaudio==0.13.1 -f https://download.pytorch.org/whl/cu116 10 | 11 | COPY requirements.txt . 12 | RUN python3 -m pip install -r requirements.txt 13 | 14 | # Install fairseq (not necessary now) 15 | # RUN git clone https://github.com/facebookresearch/fairseq.git && \ 16 | # cd fairseq && \ 17 | # git checkout 05255f9 && \ 18 | # python3 setup.py build_ext --inplace && \ 19 | # python3 -m pip install -e . && \ 20 | # python3 setup.py build develop 21 | 22 | # RUN python3 -m pip install numpy --upgrade && python3 -m pip install numba 23 | 24 | # Setup working directory 25 | ARG WORKSPACE=/workspace 26 | RUN mkdir -p /${WORKSPACE} 27 | WORKDIR ${WORKSPACE} 28 | COPY . 
${WORKSPACE}/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jingyi Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/common.yaml: -------------------------------------------------------------------------------- 1 | path: ./logs/${hydra.job.config_name}/${now:%Y-%m-%d}/${now:%H-%M-%S} 2 | 3 | log_level: INFO 4 | seed: 1 5 | tb_log_dir: tensorboard 6 | tqdm: true 7 | 8 | hydra: 9 | run: 10 | dir: ${path} 11 | job_logging: 12 | formatters: 13 | colorlog: 14 | format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s:%(lineno)s:%(funcName)s()%(reset)s][%(log_color)s%(levelname)s%(reset)s] 15 | - %(message)s' 16 | handlers: 17 | file: 18 | filename: ${hydra.run.dir}/${hydra.job.name}_${now:%Y-%m-%d}_${now:%H-%M-%S}.log 19 | 20 | defaults: 21 | - override hydra/job_logging: colorlog 22 | - override hydra/hydra_logging: colorlog -------------------------------------------------------------------------------- /configs/config-online-language-emb.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_lang_emb: true 7 | num_langs: 11 8 | lang_dim: 192 # same size as hidden_channels to facilitate the addition of the conditioning 9 | lang2id: 10 | chinese: 0 11 | dutch: 1 12 | english: 2 13 | french: 3 14 | german: 4 15 | italian: 5 16 | japanese: 6 17 | other: 7 18 | polish: 8 19 | portuguese: 9 20 | spanish: 10 21 | use_spk_emb: false 22 | spk_embeddings_dir: null # compute on forward (model) 23 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 24 | content_encoder_type: null # compute on forward (model) | hubert 25 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 26 | 27 | model: 28 | use_spk_emb: true 29 | spk_encoder_type: ECAPA2SpeakerEncoder16k 30 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 31 | content_encoder_type: hubert # or wavlm 32 | content_encoder_ckpt: lengyue233/content-vec-best # or models/wavlm/WavLM-Large.pt 33 | -------------------------------------------------------------------------------- /configs/config-online-spin-language-emb.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_lang_emb: true 7 | num_langs: 11 8 | lang_dim: 192 # same size as hidden_channels to facilitate the addition 9 | lang2id: 10 | chinese: 0 11 | dutch: 1 12 | english: 2 13 | french: 3 14 | german: 4 15 | italian: 5 16 | japanese: 6 17 | other: 7 18 | polish: 8 19 | portuguese: 9 20 | spanish: 10 21 | use_spk_emb: false 22 | spk_embeddings_dir: null # compute on forward (model) 23 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 24 | content_encoder_type: null # compute on forward (model) | hubert 25 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 26 | 27 | model: 28 | use_spk_emb: true 29 | spk_encoder_type: ECAPA2SpeakerEncoder16k 30 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 31 | content_encoder_type: spin # hubert | wavlm | spin 32 | content_encoder_config: models/spin/spin.yaml # path to the config file for the content encoder 33 | content_encoder_ckpt: models/spin/spin.ckpt # or models/wavlm/WavLM-Large.pt 34 | -------------------------------------------------------------------------------- /configs/config-online-spin.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_spk_emb: false 7 | spk_embeddings_dir: null # compute on forward (model) 8 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 9 | content_encoder_type: null # compute on forward (model) | hubert 10 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 11 | 12 | model: 13 | use_spk_emb: true 14 | spk_encoder_type: ECAPA2SpeakerEncoder16k 15 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 16 | content_encoder_type: spin # hubert | wavlm | spin 17 | content_encoder_config: models/spin/spin.yaml # path to the config file for the content encoder 18 | content_encoder_ckpt: models/spin/spin.ckpt # or models/wavlm/WavLM-Large.pt 19 | -------------------------------------------------------------------------------- /configs/config-online.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | - config 4 | 5 | data: 6 | use_spk_emb: false 7 | spk_embeddings_dir: null # compute on forward (model) 8 | spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k 9 | content_encoder_type: null # compute on forward (model) | hubert 10 | content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best 11 | 12 | model: 13 | use_spk_emb: true 14 | spk_encoder_type: ECAPA2SpeakerEncoder16k 15 | spk_encoder_ckpt: null # Not used for ECAPA2SpeakerEncoder16k 16 | content_encoder_type: hubert # or wavlm 17 | content_encoder_ckpt: lengyue233/content-vec-best # or models/wavlm/WavLM-Large.pt 18 | -------------------------------------------------------------------------------- /configs/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - common 3 | 4 | train: 5 | batch_size: 128 6 | betas: [0.8, 0.99] 7 | c_kl: 1.0 8 | c_mel: 45 9 | distributed: false # BUG: multi-gpu is not working 10 | use_multiprocessing: false # BUG: multi-gpu is not working 11 | epochs: 20 12 | eps: 1e-9 13 | fp16_run: false 14 | init_lr_ratio: 1 15 | raise_error: false 16 | learning_rate: 2e-4 17 | log_interval: 
10 18 | log_level: ${log_level} 19 | lr_decay: 0.98 20 | max_speclen: 128 21 | port: 8005 22 | resume_training: false # set to false to finetune from a model 23 | seed: 1234 24 | segment_size: 8960 25 | use_sr: false 26 | valid_epoch_interval: 1 27 | valid_steps_interval: 1000 28 | save_epoch_interval: 10 29 | save_steps_interval: 1000 30 | warmup_epochs: 0 31 | # weighted_batch_speaker_sampling : false 32 | # weighted_batch_lang_sampling : false 33 | weighted_batch_speaker_sampling : 0.5 34 | weighted_batch_lang_sampling : 0.5 35 | 36 | data: 37 | dataset_dir: /raid/lucasgris/free-svc/data 38 | filter_length: 1280 39 | hop_length: 320 40 | max_wav_value: 32768.0 41 | mel_fmax: null 42 | mel_fmin: 0.0 43 | n_mel_channels: 80 44 | num_workers: 64 45 | # For pitch extraction, set the pitch_predictor (will compute in dataloader) or pitch_features_dir (will load from disk) 46 | pitch_predictor: rmvpe # pm | crepe | harvest | dio | rmvpe | fcpe 47 | pitch_features_dir: ${data.dataset_dir}/pitch_features/ 48 | sampling_rate: 24000 49 | spectrogram_dir: null #${data.dataset_dir}/spectrograms # it is recommended NOT to use if you have small disk space 50 | # For speaker embedding extraction, set the use_spk_emb to True and spk_embeddings_dir (will load from disk) or configure the model to compute it on forward 51 | use_spk_emb: true 52 | spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings 53 | # SR augmentation is deprecated, set use_sr to False 54 | sr_min_max: [68, 92] 55 | # For content feature extraction, set the content_feature_dir (will load from disk) or configure the model to compute it on forward 56 | content_feature_dir: null 57 | training_files: data/train.csv 58 | validation_files: data/valid.csv 59 | win_length: 1280 60 | 61 | model: 62 | save_dir: null 63 | filter_channels: 768 64 | finetune_from_model: 65 | discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth 66 | generator: /raid/lucasgris/free-svc/freevc-24.pth 67 | hidden_channels: 192 68 | inter_channels: 192 69 | kernel_size: 3 70 | n_heads: 2 71 | n_layers_q: 3 72 | n_layers: 6 73 | p_dropout: 0.1 74 | resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] 75 | resblock_kernel_sizes: [3,7,11] 76 | resblock: 1 77 | c_dim: 768 78 | upsample_initial_channel: 512 79 | upsample_kernel_sizes: [16,16,4,4] 80 | upsample_rates: [10,8,2,2] 81 | use_spectral_norm: false 82 | freeze_external_spk: true 83 | device: cuda 84 | # For online speaker embedding extraction, set the use_spk_emb to True and spk_encoder_type 85 | use_spk_emb: false 86 | gin_channels: null # gin_channels = spk_encoder.embedding_dim 87 | spk_encoder_type: null # ECAPA2SpeakerEncoder16k | 88 | # For content feature extraction, set the content_encoder_type and content_encoder_ckpt 89 | content_encoder_type: null # load from disk (data) - hubert | wavlm 90 | content_encoder_ckpt: null # load from disk (data) - [path] | models/wavlm/WavLM-Large.pt | lengyue233/content-vec-best 91 | post_content_encoder_type: vits-encoder-with-uv-emb # or freevc-bottleneck 92 | coarse_f0: true 93 | cond_f0_on_flow: false 94 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def feature_loss(fmap_r, fmap_g): 6 | loss = 0 7 | for dr, dg in zip(fmap_r, fmap_g): 8 | for rl, gl in zip(dr, dg): 9 | rl = rl.float().detach() 10 | gl = gl.float() 11 | loss += torch.mean(torch.abs(rl - gl)) 12 | 13 | 
return loss * 2 14 | 15 | 16 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 17 | loss = 0 18 | r_losses = [] 19 | g_losses = [] 20 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 21 | dr = dr.float() 22 | dg = dg.float() 23 | r_loss = torch.mean((1-dr)**2) 24 | g_loss = torch.mean(dg**2) 25 | loss += (r_loss + g_loss) 26 | r_losses.append(r_loss.item()) 27 | g_losses.append(g_loss.item()) 28 | 29 | return loss, r_losses, g_losses 30 | 31 | 32 | def generator_loss(disc_outputs): 33 | loss = 0 34 | gen_losses = [] 35 | for dg in disc_outputs: 36 | dg = dg.float() 37 | l = torch.mean((1-dg)**2) 38 | gen_losses.append(l) 39 | loss += l 40 | 41 | return loss, gen_losses 42 | 43 | 44 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 45 | """ 46 | z_p, logs_q: [b, h, t_t] 47 | m_p, logs_p: [b, h, t_t] 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | # print(logs_p) 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | l = kl / torch.sum(z_mask) 59 | return l 60 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | output = dynamic_range_compression_torch(magnitudes) 31 | return output 32 | 33 | 34 | def spectral_de_normalize_torch(magnitudes): 35 | output = dynamic_range_decompression_torch(magnitudes) 36 | return output 37 | 38 | 39 | class MelProcessing: 40 | 41 | def __init__(self, mel_basis={}, hann_window={}): 42 | self.mel_basis = mel_basis 43 | self.hann_window = hann_window 44 | 45 | # TODO: sample rate is not used 46 | def spectrogram_torch(self, y, n_fft, sampling_rate, hop_size, win_size, center=False): 47 | if torch.min(y) < -1.: 48 | print('min value is ', torch.min(y)) 49 | if torch.max(y) > 1.: 50 | print('max value is ', torch.max(y)) 51 | 52 | dtype_device = str(y.dtype) + '_' + str(y.device) 53 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 54 | if wnsize_dtype_device not in self.hann_window: 55 | self.hann_window[wnsize_dtype_device] = torch.hann_window( 56 | win_size).to(dtype=y.dtype, device=y.device) 57 | 58 | y = torch.nn.functional.pad(y.unsqueeze( 59 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 63 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 64 | 65 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 66 | return spec 67 | 68 | def spec_to_mel_torch(self, spec, n_fft, num_mels, sampling_rate, fmin, fmax): 69 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 70 | fmax_dtype_device = 
str(fmax) + '_' + dtype_device 71 | if fmax_dtype_device not in self.mel_basis: 72 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 73 | n_mels=num_mels, fmin=fmin, fmax=fmax) 74 | self.mel_basis[fmax_dtype_device] = torch.from_numpy( 75 | mel).to(dtype=spec.dtype, device=spec.device) 76 | spec = torch.matmul(self.mel_basis[fmax_dtype_device], spec) 77 | spec = spectral_normalize_torch(spec) 78 | return spec 79 | 80 | def mel_spectrogram_torch(self, y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 81 | if torch.min(y) < -1.: 82 | logger.debug('min value is ', torch.min(y)) 83 | if torch.max(y) > 1.: 84 | logger.debug('max value is ', torch.max(y)) 85 | 86 | dtype_device = str(y.dtype) + '_' + str(y.device) 87 | fmax_dtype_device = str(fmax) + '_' + dtype_device 88 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 89 | if fmax_dtype_device not in self.mel_basis: 90 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, 91 | n_mels=num_mels, fmin=fmin, fmax=fmax) 92 | self.mel_basis[fmax_dtype_device] = torch.from_numpy( 93 | mel).to(dtype=y.dtype, device=y.device) 94 | if wnsize_dtype_device not in self.hann_window: 95 | self.hann_window[wnsize_dtype_device] = torch.hann_window( 96 | win_size).to(dtype=y.dtype, device=y.device) 97 | 98 | y = torch.nn.functional.pad(y.unsqueeze( 99 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 100 | y = y.squeeze(1) 101 | 102 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=self.hann_window[wnsize_dtype_device], 103 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 104 | 105 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 106 | 107 | spec = torch.matmul(self.mel_basis[fmax_dtype_device], spec) 108 | spec = spectral_normalize_torch(spec) 109 | 110 | return spec 111 | 112 | mel_processing = MelProcessing() -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import SynthesizerTrn 2 | from .models import MultiPeriodDiscriminator -------------------------------------------------------------------------------- /models/clova/SpeakerNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import importlib 4 | 5 | 6 | class WrappedModel(nn.Module): 7 | 8 | ## The purpose of this wrapper is to make the model structure consistent between single and multi-GPU 9 | 10 | def __init__(self, model): 11 | super(WrappedModel, self).__init__() 12 | self.module = model 13 | 14 | def forward(self, x, label=None): 15 | return self.module(x, label) 16 | 17 | 18 | class SpeakerNet(nn.Module): 19 | def __init__(self, model, **kwargs): 20 | super(SpeakerNet, self).__init__() 21 | 22 | if type(model) == str: 23 | SpeakerNetModel = importlib.import_module(".models." 
+ model).__getattribute__("MainModel") 24 | else: 25 | SpeakerNetModel = model 26 | self.model = SpeakerNetModel(**kwargs) 27 | 28 | def forward(self, data, label=None): 29 | 30 | data = data.reshape(-1, data.size()[-1]).cuda() 31 | outp = self.model.forward(data) 32 | return outp 33 | 34 | def loadParameters(self, path): 35 | print("Loading pretrained model from %s" % (path)) 36 | pretrained_dict = torch.load(path) 37 | model_dict = self.model.state_dict() 38 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 39 | model_dict.update(pretrained_dict) 40 | self.model.load_state_dict(model_dict) 41 | print("Pretrained model is loaded.") 42 | -------------------------------------------------------------------------------- /models/clova/models/RawNet3.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | from asteroid_filterbanks import Encoder, ParamSincFB 6 | 7 | from .RawNetBasicBlock import Bottle2neck, PreEmphasis 8 | 9 | 10 | class RawNet3(nn.Module): 11 | def __init__(self, block, model_scale, context, summed, C=1024, **kwargs): 12 | super().__init__() 13 | 14 | nOut = kwargs["nOut"] 15 | 16 | self.context = context 17 | self.encoder_type = kwargs["encoder_type"] 18 | self.log_sinc = kwargs["log_sinc"] 19 | self.norm_sinc = kwargs["norm_sinc"] 20 | self.out_bn = kwargs["out_bn"] 21 | self.summed = summed 22 | 23 | self.preprocess = nn.Sequential( 24 | PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True) 25 | ) 26 | self.conv1 = Encoder( 27 | ParamSincFB( 28 | C // 4, 29 | 251, 30 | stride=kwargs["sinc_stride"], 31 | ) 32 | ) 33 | self.relu = nn.ReLU() 34 | self.bn1 = nn.BatchNorm1d(C // 4) 35 | 36 | self.layer1 = block( 37 | C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5 38 | ) 39 | self.layer2 = block( 40 | C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3 41 | ) 42 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale) 43 | self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1) 44 | 45 | if self.context: 46 | attn_input = 1536 * 3 47 | else: 48 | attn_input = 1536 49 | print("self.encoder_type", self.encoder_type) 50 | if self.encoder_type == "ECA": 51 | attn_output = 1536 52 | elif self.encoder_type == "ASP": 53 | attn_output = 1 54 | else: 55 | raise ValueError("Undefined encoder") 56 | 57 | self.attention = nn.Sequential( 58 | nn.Conv1d(attn_input, 128, kernel_size=1), 59 | nn.ReLU(), 60 | nn.BatchNorm1d(128), 61 | nn.Conv1d(128, attn_output, kernel_size=1), 62 | nn.Softmax(dim=2), 63 | ) 64 | 65 | self.bn5 = nn.BatchNorm1d(3072) 66 | 67 | self.fc6 = nn.Linear(3072, nOut) 68 | self.bn6 = nn.BatchNorm1d(nOut) 69 | 70 | self.mp3 = nn.MaxPool1d(3) 71 | 72 | def forward(self, x): 73 | """ 74 | :param x: input mini-batch (bs, samp) 75 | """ 76 | 77 | with torch.cuda.amp.autocast(enabled=False): 78 | x = self.preprocess(x) 79 | x = torch.abs(self.conv1(x)) 80 | if self.log_sinc: 81 | x = torch.log(x + 1e-6) 82 | if self.norm_sinc == "mean": 83 | x = x - torch.mean(x, dim=-1, keepdim=True) 84 | elif self.norm_sinc == "mean_std": 85 | m = torch.mean(x, dim=-1, keepdim=True) 86 | s = torch.std(x, dim=-1, keepdim=True) 87 | s[s < 0.001] = 0.001 88 | x = (x - m) / s 89 | 90 | if self.summed: 91 | x1 = self.layer1(x) 92 | x2 = self.layer2(x1) 93 | x3 = self.layer3(self.mp3(x1) + x2) 94 | else: 95 | x1 = self.layer1(x) 96 | x2 = self.layer2(x1) 97 | x3 = self.layer3(x2) 98 | 99 | x = 
self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1)) 100 | x = self.relu(x) 101 | 102 | t = x.size()[-1] 103 | 104 | if self.context: 105 | global_x = torch.cat( 106 | ( 107 | x, 108 | torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), 109 | torch.sqrt( 110 | torch.var(x, dim=2, keepdim=True).clamp( 111 | min=1e-4, max=1e4 112 | ) 113 | ).repeat(1, 1, t), 114 | ), 115 | dim=1, 116 | ) 117 | else: 118 | global_x = x 119 | 120 | w = self.attention(global_x) 121 | 122 | mu = torch.sum(x * w, dim=2) 123 | sg = torch.sqrt( 124 | (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) 125 | ) 126 | 127 | x = torch.cat((mu, sg), 1) 128 | 129 | x = self.bn5(x) 130 | 131 | x = self.fc6(x) 132 | 133 | if self.out_bn: 134 | x = self.bn6(x) 135 | 136 | return x 137 | 138 | 139 | def MainModel(**kwargs): 140 | 141 | model = RawNet3( 142 | Bottle2neck, model_scale=8, context=True, summed=True, out_bn=False, log_sinc=True, norm_sinc="mean", grad_mult=1, **kwargs 143 | ) 144 | return model 145 | -------------------------------------------------------------------------------- /models/clova/models/RawNetBasicBlock.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class PreEmphasis(torch.nn.Module): 9 | def __init__(self, coef: float = 0.97) -> None: 10 | super().__init__() 11 | self.coef = coef 12 | # make kernel 13 | # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped. 14 | self.register_buffer( 15 | "flipped_filter", 16 | torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0), 17 | ) 18 | 19 | def forward(self, input: torch.tensor) -> torch.tensor: 20 | assert ( 21 | len(input.size()) == 2 22 | ), "The number of dimensions of input tensor must be 2!" 23 | # reflect padding to match lengths of in/out 24 | input = input.unsqueeze(1) 25 | input = F.pad(input, (1, 0), "reflect") 26 | return F.conv1d(input, self.flipped_filter) 27 | 28 | 29 | class AFMS(nn.Module): 30 | """ 31 | Alpha-Feature map scaling, added to the output of each residual block[1,2]. 
32 | 33 | Reference: 34 | [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf 35 | [2] AMFS : https://www.koreascience.or.kr/article/JAKO202029757857763.page 36 | """ 37 | 38 | def __init__(self, nb_dim: int) -> None: 39 | super().__init__() 40 | self.alpha = nn.Parameter(torch.ones((nb_dim, 1))) 41 | self.fc = nn.Linear(nb_dim, nb_dim) 42 | self.sig = nn.Sigmoid() 43 | 44 | def forward(self, x): 45 | y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1) 46 | y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1) 47 | 48 | x = x + self.alpha 49 | x = x * y 50 | return x 51 | 52 | 53 | class Bottle2neck(nn.Module): 54 | def __init__( 55 | self, 56 | inplanes, 57 | planes, 58 | kernel_size=None, 59 | dilation=None, 60 | scale=4, 61 | pool=False, 62 | ): 63 | 64 | super().__init__() 65 | 66 | width = int(math.floor(planes / scale)) 67 | 68 | self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1) 69 | self.bn1 = nn.BatchNorm1d(width * scale) 70 | 71 | self.nums = scale - 1 72 | 73 | convs = [] 74 | bns = [] 75 | 76 | num_pad = math.floor(kernel_size / 2) * dilation 77 | 78 | for i in range(self.nums): 79 | convs.append( 80 | nn.Conv1d( 81 | width, 82 | width, 83 | kernel_size=kernel_size, 84 | dilation=dilation, 85 | padding=num_pad, 86 | ) 87 | ) 88 | bns.append(nn.BatchNorm1d(width)) 89 | 90 | self.convs = nn.ModuleList(convs) 91 | self.bns = nn.ModuleList(bns) 92 | 93 | self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1) 94 | self.bn3 = nn.BatchNorm1d(planes) 95 | 96 | self.relu = nn.ReLU() 97 | 98 | self.width = width 99 | 100 | self.mp = nn.MaxPool1d(pool) if pool else False 101 | self.afms = AFMS(planes) 102 | 103 | if inplanes != planes: # if change in number of filters 104 | self.residual = nn.Sequential( 105 | nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False) 106 | ) 107 | else: 108 | self.residual = nn.Identity() 109 | 110 | def forward(self, x): 111 | residual = self.residual(x) 112 | 113 | out = self.conv1(x) 114 | out = self.relu(out) 115 | out = self.bn1(out) 116 | 117 | spx = torch.split(out, self.width, 1) 118 | for i in range(self.nums): 119 | if i == 0: 120 | sp = spx[i] 121 | else: 122 | sp = sp + spx[i] 123 | sp = self.convs[i](sp) 124 | sp = self.relu(sp) 125 | sp = self.bns[i](sp) 126 | if i == 0: 127 | out = sp 128 | else: 129 | out = torch.cat((out, sp), 1) 130 | 131 | out = torch.cat((out, spx[self.nums]), 1) 132 | 133 | out = self.conv3(out) 134 | out = self.relu(out) 135 | out = self.bn3(out) 136 | 137 | out += residual 138 | if self.mp: 139 | out = self.mp(out) 140 | out = self.afms(out) 141 | 142 | return out 143 | -------------------------------------------------------------------------------- /models/clova/models/ResNetBlocks.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | class SEBasicBlock(nn.Module): 8 | expansion = 1 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 11 | super(SEBasicBlock, self).__init__() 12 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) 15 | self.bn2 = nn.BatchNorm2d(planes) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.se = SELayer(planes, reduction) 18 | self.downsample = downsample 19 | self.stride = stride 20 | 21 | def forward(self, x): 22 | residual = x 23 | 24 | out = self.conv1(x) 25 | out = self.relu(out) 26 | out = self.bn1(out) 27 | 28 | out = self.conv2(out) 29 | out = self.bn2(out) 30 | out = self.se(out) 31 | 32 | if self.downsample is not None: 33 | residual = self.downsample(x) 34 | 35 | out += residual 36 | out = self.relu(out) 37 | return out 38 | 39 | 40 | class SEBottleneck(nn.Module): 41 | expansion = 4 42 | 43 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 44 | super(SEBottleneck, self).__init__() 45 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 46 | self.bn1 = nn.BatchNorm2d(planes) 47 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 48 | padding=1, bias=False) 49 | self.bn2 = nn.BatchNorm2d(planes) 50 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 51 | self.bn3 = nn.BatchNorm2d(planes * 4) 52 | self.relu = nn.ReLU(inplace=True) 53 | self.se = SELayer(planes * 4, reduction) 54 | self.downsample = downsample 55 | self.stride = stride 56 | 57 | def forward(self, x): 58 | residual = x 59 | 60 | out = self.conv1(x) 61 | out = self.bn1(out) 62 | out = self.relu(out) 63 | 64 | out = self.conv2(out) 65 | out = self.bn2(out) 66 | out = self.relu(out) 67 | 68 | out = self.conv3(out) 69 | out = self.bn3(out) 70 | out = self.se(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | out = self.relu(out) 77 | 78 | return out 79 | 80 | 81 | class SELayer(nn.Module): 82 | def __init__(self, channel, reduction=8): 83 | super(SELayer, self).__init__() 84 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 85 | self.fc = nn.Sequential( 86 | nn.Linear(channel, channel // reduction), 87 | nn.ReLU(inplace=True), 88 | nn.Linear(channel // reduction, channel), 89 | nn.Sigmoid() 90 | ) 91 | 92 | def forward(self, x): 93 | b, c, _, _ = x.size() 94 | y = self.avg_pool(x).view(b, c) 95 | y = self.fc(y).view(b, c, 1, 1) 96 | return x * y -------------------------------------------------------------------------------- /models/clova/models/ResNetSE34L.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | from models.ResNetBlocks import * 10 | 11 | class ResNetSE(nn.Module): 12 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 13 | super(ResNetSE, self).__init__() 14 | 15 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 16 | 17 | self.inplanes = num_filters[0] 18 | self.encoder_type = encoder_type 19 | self.n_mels = n_mels 20 | self.log_input = log_input 21 | 22 | self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=7, stride=(2, 1), padding=3, 23 | bias=False) 24 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 25 | self.relu = nn.ReLU(inplace=True) 26 | 27 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 28 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 29 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 30 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) 31 | 32 | self.instancenorm = nn.InstanceNorm1d(n_mels) 33 | self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, window_fn=torch.hamming_window, n_mels=n_mels) 34 | 35 | if self.encoder_type == "SAP": 36 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 37 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 38 | out_dim = num_filters[3] * block.expansion 39 | elif self.encoder_type == "ASP": 40 | self.sap_linear = nn.Linear(num_filters[3] * block.expansion, num_filters[3] * block.expansion) 41 | self.attention = self.new_parameter(num_filters[3] * block.expansion, 1) 42 | out_dim = num_filters[3] * block.expansion * 2 43 | else: 44 | raise ValueError('Undefined encoder') 45 | 46 | self.fc = nn.Linear(out_dim, nOut) 47 | 48 | for m in self.modules(): 49 | if isinstance(m, nn.Conv2d): 50 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 51 | elif isinstance(m, nn.BatchNorm2d): 52 | nn.init.constant_(m.weight, 1) 53 | nn.init.constant_(m.bias, 0) 54 | 55 | def _make_layer(self, block, planes, blocks, stride=1): 56 | downsample = None 57 | if stride != 1 or self.inplanes != planes * block.expansion: 58 | downsample = nn.Sequential( 59 | nn.Conv2d(self.inplanes, planes * block.expansion, 60 | kernel_size=1, stride=stride, bias=False), 61 | nn.BatchNorm2d(planes * block.expansion), 62 | ) 63 | 64 | layers = [] 65 | layers.append(block(self.inplanes, planes, stride, downsample)) 66 | self.inplanes = planes * block.expansion 67 | for i in range(1, blocks): 68 | layers.append(block(self.inplanes, planes)) 69 | 70 | return nn.Sequential(*layers) 71 | 72 | def new_parameter(self, *size): 73 | out = nn.Parameter(torch.FloatTensor(*size)) 74 | nn.init.xavier_normal_(out) 75 | return out 76 | 77 | def forward(self, x): 78 | 79 | with torch.no_grad(): 80 | with torch.cuda.amp.autocast(enabled=False): 81 | x = self.torchfb(x)+1e-6 82 | if self.log_input: x = x.log() 83 | x = self.instancenorm(x).unsqueeze(1).detach() 84 | 85 | x = self.conv1(x) 86 | x = self.bn1(x) 87 | x = self.relu(x) 88 | 89 | x = self.layer1(x) 90 | x = self.layer2(x) 91 | x = self.layer3(x) 92 | x = self.layer4(x) 93 | 94 | x = torch.mean(x, dim=2, keepdim=True) 95 | 96 | if self.encoder_type == "SAP": 97 | x = x.permute(0,3,1,2).squeeze(-1) 98 | h = 
torch.tanh(self.sap_linear(x)) 99 | w = torch.matmul(h, self.attention).squeeze(dim=2) 100 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 101 | x = torch.sum(x * w, dim=1) 102 | elif self.encoder_type == "ASP": 103 | x = x.permute(0,3,1,2).squeeze(-1) 104 | h = torch.tanh(self.sap_linear(x)) 105 | w = torch.matmul(h, self.attention).squeeze(dim=2) 106 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 107 | mu = torch.sum(x * w, dim=1) 108 | rh = torch.sqrt( ( torch.sum((x**2) * w, dim=1) - mu**2 ).clamp(min=1e-5) ) 109 | x = torch.cat((mu,rh),1) 110 | 111 | x = x.view(x.size()[0], -1) 112 | x = self.fc(x) 113 | 114 | return x 115 | 116 | 117 | def MainModel(nOut=256, **kwargs): 118 | # Number of filters 119 | num_filters = [16, 32, 64, 128] 120 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, nOut, **kwargs) 121 | return model 122 | -------------------------------------------------------------------------------- /models/clova/models/ResNetSE34V2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | from models.ResNetBlocks import * 10 | from utils import PreEmphasis 11 | 12 | class ResNetSE(nn.Module): 13 | def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs): 14 | super(ResNetSE, self).__init__() 15 | 16 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 17 | 18 | self.inplanes = num_filters[0] 19 | self.encoder_type = encoder_type 20 | self.n_mels = n_mels 21 | self.log_input = log_input 22 | 23 | self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=3, stride=1, padding=1) 24 | self.relu = nn.ReLU(inplace=True) 25 | self.bn1 = nn.BatchNorm2d(num_filters[0]) 26 | 27 | 28 | self.layer1 = self._make_layer(block, num_filters[0], layers[0]) 29 | self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) 30 | self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) 31 | self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2)) 32 | 33 | self.instancenorm = nn.InstanceNorm1d(n_mels) 34 | self.torchfb = torch.nn.Sequential( 35 | PreEmphasis(), 36 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, window_fn=torch.hamming_window, n_mels=n_mels) 37 | ) 38 | 39 | outmap_size = int(self.n_mels/8) 40 | 41 | self.attention = nn.Sequential( 42 | nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), 43 | nn.ReLU(), 44 | nn.BatchNorm1d(128), 45 | nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), 46 | nn.Softmax(dim=2), 47 | ) 48 | 49 | if self.encoder_type == "SAP": 50 | out_dim = num_filters[3] * outmap_size 51 | elif self.encoder_type == "ASP": 52 | out_dim = num_filters[3] * outmap_size * 2 53 | else: 54 | raise ValueError('Undefined encoder') 55 | 56 | self.fc = nn.Linear(out_dim, nOut) 57 | 58 | for m in self.modules(): 59 | if isinstance(m, nn.Conv2d): 60 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 61 | elif isinstance(m, nn.BatchNorm2d): 62 | nn.init.constant_(m.weight, 1) 63 | nn.init.constant_(m.bias, 0) 64 | 65 | def _make_layer(self, block, planes, blocks, stride=1): 66 | downsample = None 67 | if stride != 1 or self.inplanes != planes * block.expansion: 68 | downsample = nn.Sequential( 69 | 
nn.Conv2d(self.inplanes, planes * block.expansion, 70 | kernel_size=1, stride=stride, bias=False), 71 | nn.BatchNorm2d(planes * block.expansion), 72 | ) 73 | 74 | layers = [] 75 | layers.append(block(self.inplanes, planes, stride, downsample)) 76 | self.inplanes = planes * block.expansion 77 | for i in range(1, blocks): 78 | layers.append(block(self.inplanes, planes)) 79 | 80 | return nn.Sequential(*layers) 81 | 82 | def new_parameter(self, *size): 83 | out = nn.Parameter(torch.FloatTensor(*size)) 84 | nn.init.xavier_normal_(out) 85 | return out 86 | 87 | def forward(self, x): 88 | 89 | with torch.no_grad(): 90 | with torch.cuda.amp.autocast(enabled=False): 91 | x = self.torchfb(x)+1e-6 92 | if self.log_input: x = x.log() 93 | x = self.instancenorm(x).unsqueeze(1) 94 | 95 | x = self.conv1(x) 96 | x = self.relu(x) 97 | x = self.bn1(x) 98 | 99 | x = self.layer1(x) 100 | x = self.layer2(x) 101 | x = self.layer3(x) 102 | x = self.layer4(x) 103 | 104 | x = x.reshape(x.size()[0],-1,x.size()[-1]) 105 | 106 | w = self.attention(x) 107 | 108 | if self.encoder_type == "SAP": 109 | x = torch.sum(x * w, dim=2) 110 | elif self.encoder_type == "ASP": 111 | mu = torch.sum(x * w, dim=2) 112 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-5) ) 113 | x = torch.cat((mu,sg),1) 114 | 115 | x = x.view(x.size()[0], -1) 116 | x = self.fc(x) 117 | 118 | return x 119 | 120 | 121 | def MainModel(nOut=256, **kwargs): 122 | # Number of filters 123 | num_filters = [32, 64, 128, 256] 124 | model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, nOut, **kwargs) 125 | return model 126 | 127 | -------------------------------------------------------------------------------- /models/clova/models/VGGVox.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torchaudio 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import Parameter 9 | 10 | class MainModel(nn.Module): 11 | def __init__(self, nOut = 1024, encoder_type='SAP', log_input=True, **kwargs): 12 | super(MainModel, self).__init__(); 13 | 14 | print('Embedding size is %d, encoder %s.'%(nOut, encoder_type)) 15 | 16 | self.encoder_type = encoder_type 17 | self.log_input = log_input 18 | 19 | self.netcnn = nn.Sequential( 20 | nn.Conv2d(1, 96, kernel_size=(5,7), stride=(1,2), padding=(2,2)), 21 | nn.BatchNorm2d(96), 22 | nn.ReLU(inplace=True), 23 | nn.MaxPool2d(kernel_size=(1,3), stride=(1,2)), 24 | 25 | nn.Conv2d(96, 256, kernel_size=(5,5), stride=(2,2), padding=(1,1)), 26 | nn.BatchNorm2d(256), 27 | nn.ReLU(inplace=True), 28 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), 29 | 30 | nn.Conv2d(256, 384, kernel_size=(3,3), padding=(1,1)), 31 | nn.BatchNorm2d(384), 32 | nn.ReLU(inplace=True), 33 | 34 | nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), 35 | nn.BatchNorm2d(256), 36 | nn.ReLU(inplace=True), 37 | 38 | nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), 39 | nn.BatchNorm2d(256), 40 | nn.ReLU(inplace=True), 41 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), 42 | 43 | nn.Conv2d(256, 512, kernel_size=(4,1), padding=(0,0)), 44 | nn.BatchNorm2d(512), 45 | nn.ReLU(inplace=True), 46 | 47 | ); 48 | 49 | if self.encoder_type == "MAX": 50 | self.encoder = nn.AdaptiveMaxPool2d((1,1)) 51 | out_dim = 512 52 | elif self.encoder_type == "TAP": 53 | self.encoder = nn.AdaptiveAvgPool2d((1,1)) 54 | out_dim = 512 55 | elif self.encoder_type == "SAP": 56 | self.sap_linear = nn.Linear(512, 512) 57 | self.attention = self.new_parameter(512, 1) 58 | out_dim = 512 59 | else: 60 | raise ValueError('Undefined encoder') 61 | 62 | self.fc = nn.Linear(out_dim, nOut) 63 | 64 | self.instancenorm = nn.InstanceNorm1d(40) 65 | self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=40) 66 | 67 | def new_parameter(self, *size): 68 | out = nn.Parameter(torch.FloatTensor(*size)) 69 | nn.init.xavier_normal_(out) 70 | return out 71 | 72 | def forward(self, x): 73 | 74 | with torch.no_grad(): 75 | with torch.cuda.amp.autocast(enabled=False): 76 | x = self.torchfb(x)+1e-6 77 | if self.log_input: x = x.log() 78 | x = self.instancenorm(x).unsqueeze(1) 79 | 80 | x = self.netcnn(x); 81 | 82 | if self.encoder_type == "MAX" or self.encoder_type == "TAP": 83 | x = self.encoder(x) 84 | x = x.view((x.size()[0], -1)) 85 | 86 | elif self.encoder_type == "SAP": 87 | x = x.permute(0, 2, 1, 3) 88 | x = x.squeeze(dim=1).permute(0, 2, 1) # batch * L * D 89 | h = torch.tanh(self.sap_linear(x)) 90 | w = torch.matmul(h, self.attention).squeeze(dim=2) 91 | w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1) 92 | x = torch.sum(x * w, dim=1) 93 | 94 | x = self.fc(x); 95 | 96 | return x; 97 | 98 | -------------------------------------------------------------------------------- /models/clova/models/byol.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from .ssl_singer_identity.singer_identity import load_model 4 | 5 | 6 | def MainModel(**kwargs): 7 | 8 | model = load_model("byol", torchscript=True) 9 | model.train() 10 | return model 11 | -------------------------------------------------------------------------------- 
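Note: the CLOVA speaker encoders above (RawNet3, ResNetSE34L, ResNetSE34V2, VGGVox, and the byol wrapper) all expose a MainModel factory that maps a batch of raw waveforms to fixed-size speaker embeddings; SpeakerNet.py simply instantiates one of them by name and forwards reshaped audio through it. The snippet below is a minimal usage sketch, not part of the repository: the hyper-parameter values for nOut, encoder_type and sinc_stride are illustrative assumptions only, and it presumes asteroid_filterbanks is installed and the repository root is on PYTHONPATH.

import torch
from models.clova.models.RawNet3 import MainModel

# Illustrative hyper-parameters (assumed, not taken from the repo configs).
# MainModel fixes log_sinc/norm_sinc/out_bn itself; the caller supplies the
# embedding size (nOut), the pooling head ("ECA" or "ASP") and the sinc stride.
encoder = MainModel(nOut=256, encoder_type="ECA", sinc_stride=10)
encoder.eval()

wav = torch.randn(2, 16000)      # batch of two 1-second, 16 kHz waveforms
with torch.no_grad():
    emb = encoder(wav)           # -> torch.Size([2, 256]) speaker embeddings

During training, SpeakerNet wraps a module like this, moves each batch to the GPU before the forward pass, and restores pretrained weights through loadParameters().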
/models/clova/models/ssl_singer_identity/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bernardo Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/byol.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/full_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/full_diagram.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/isolated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/isolated.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/pipeline.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/metadata/img/techniques_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/clova/models/ssl_singer_identity/metadata/img/techniques_.png -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import losses 2 | 3 | from .model import load_model 4 | # from . import model 5 | # from . import trainer 6 | # from . import utils 7 | # from .data import siamese_encoders 8 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/callbacks/ma_updates.py: -------------------------------------------------------------------------------- 1 | from math import cos, pi 2 | from typing import Optional, Sequence 3 | 4 | import torch 5 | from pytorch_lightning import Callback, LightningModule, Trainer 6 | 7 | 8 | class MAWeightUpdate(Callback): 9 | """Weight update rule from BYOL. 10 | Your model should have: 11 | - ``self.online_network`` 12 | - ``self.target_network`` 13 | Updates the target_network params using an exponential moving average update rule weighted by tau. 14 | BYOL claims this keeps the online_network from collapsing. 15 | .. note:: Automatically increases tau from ``initial_tau`` to 1.0 with every training step 16 | Example:: 17 | # model must have 2 attributes 18 | model = Model() 19 | model.online_network = ... 20 | model.target_network = ... 
21 | trainer = Trainer(callbacks=[MAWeightUpdate()]) 22 | """ 23 | 24 | def __init__(self, initial_tau: float = 0.996, max_epochs=100, should_update: bool = True): 25 | """ 26 | Args: 27 | initial_tau: starting tau. Auto-updates with every training step 28 | """ 29 | super().__init__() 30 | self.initial_tau = initial_tau 31 | self.max_epochs = max_epochs 32 | self.should_update = should_update 33 | 34 | self.current_tau = initial_tau 35 | 36 | def on_train_batch_end( 37 | self, 38 | trainer: Trainer, 39 | pl_module: LightningModule, 40 | outputs: Sequence, 41 | batch: Sequence, 42 | batch_idx: int, 43 | unused: Optional[int] = 0 44 | ) -> None: 45 | # get networks 46 | student_network = pl_module.student_network 47 | teacher_network = pl_module.teacher_network 48 | 49 | # update weights 50 | self.update_weights(student_network, teacher_network) 51 | 52 | # log tau 53 | pl_module.log("hparams/MA rate", self.current_tau, prog_bar=False, logger=True) 54 | 55 | # update tau after 56 | if self.should_update: 57 | self.current_tau = self.update_tau(pl_module, trainer) 58 | 59 | def update_tau(self, pl_module: LightningModule, trainer: Trainer) -> float: 60 | max_steps = len(trainer.train_dataloader) * self.max_epochs 61 | tau = 1 - (1 - self.initial_tau) * (cos(pi * pl_module.global_step / max_steps) + 1) / 2 62 | return tau 63 | 64 | def update_weights( 65 | self, 66 | student_network: torch.nn.Module, 67 | teacher_network: torch.nn.Module 68 | ) -> None: 69 | # apply MA weight update 70 | for (name, student_p), (_, teacher_p) in zip( 71 | student_network.named_parameters(), 72 | teacher_network.named_parameters(), 73 | ): 74 | teacher_p.data = self.current_tau * teacher_p.data + (1 - self.current_tau) * student_p.data 75 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from singer_identity.utils.core import similarity, roll 4 | 5 | 6 | def std_batch(x, var=1, eps=1e-8): 7 | std = torch.sqrt(x.var(dim=0) + eps) 8 | return torch.mean(F.relu(var - std)) 9 | 10 | 11 | def variance_hinge_reg(x, y, var=1): 12 | # From https://github.com/facebookresearch/vicreg 13 | std_x = std_batch(x, var=var) 14 | std_y = std_batch(y, var=var) 15 | std_loss = std_x / 2 + std_y / 2 16 | return std_loss 17 | 18 | 19 | def covariance(x): 20 | # In official implementation they do mean over batch (to verify) 21 | # mean = x.mean(1, keepdims=True) 22 | mean = x.mean(dim=0) 23 | x = x - mean 24 | cov = torch.matmul(x.transpose(0, 1), x) / (x.shape[0] - 1) 25 | # cov = (x.T @ x) / (x.shape[0] - 1) 26 | return cov 27 | 28 | 29 | def covariance_reg(x, y): 30 | eye = torch.eye(x.shape[1]).to(x.device) 31 | cov_x = covariance(x) 32 | cov_y = covariance(y) 33 | assert cov_x.shape[0] == cov_x.shape[1] 34 | assert cov_y.shape[0] == cov_y.shape[1] 35 | cov_reg = (cov_x * (1 - eye)).pow(2).sum() / x.shape[1] + (cov_y * (1 - eye)).pow( 36 | 2 37 | ).sum() / x.shape[1] 38 | return cov_reg 39 | 40 | 41 | def invariance_loss(x, y): 42 | return F.mse_loss(x, y) 43 | 44 | 45 | def vicreg_loss(x, y, gamma=1, fact_inv_loss=1, fact_var=1, fact_cov=1): 46 | # Adapted from https://github.com/facebookresearch/vicreg 47 | repr_loss = invariance_loss(x, y) 48 | std_loss = variance_hinge_reg(x, y, var=gamma) 49 | cov_loss = covariance_reg(x, y) 50 | loss = fact_inv_loss * repr_loss + fact_var * std_loss + 
fact_cov * cov_loss 51 | return loss 52 | 53 | 54 | def compute_norms(*args): 55 | norms = [] 56 | for arg in args: 57 | norms.append(torch.sqrt((arg**2).sum(1))) 58 | return norms 59 | 60 | 61 | def align_loss(x, y, alpha=2): 62 | # From https://github.com/SsnL/align_uniform 63 | return (x - y).norm(p=2, dim=1).pow(alpha).mean() 64 | 65 | 66 | def uniform_loss(x, t=2): 67 | # From https://github.com/SsnL/align_uniform 68 | return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log() 69 | 70 | 71 | def contrastive_loss(z1, z2, temp=0.2, nr_negative=1, decouple=False): 72 | cost_pos = similarity(z1, z2, temp) # Positive samples 73 | cost_neg = [] 74 | 75 | n_rolls = min(z1.shape[0] - 1, nr_negative) # Number of negative samples 76 | curr_neg_z = z2 77 | 78 | for i in range(n_rolls): 79 | curr_neg_z = roll(curr_neg_z) # Shifts batch 80 | cost_neg.append(similarity(z1, curr_neg_z, temp)) # Negative sim. 81 | 82 | if not decouple: 83 | cost_neg.append(cost_pos) # Adds positive similarity in denominator 84 | 85 | cost_neg = torch.stack(cost_neg).transpose(1, 0) 86 | cost = (-cost_pos + torch.logsumexp(cost_neg, 1)).mean() 87 | # TODO: implement similarities with less operations, but this works 88 | ratio = torch.mean(cost_neg) / ( 89 | torch.mean(cost_pos) + torch.tensor(1e-6).type_as(z1) 90 | ) 91 | return cost, ratio.item() 92 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/models/network_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Union, Callable, List, Optional 4 | from torchvision.models import efficientnet_b0, efficientnet_b4 5 | import torchvision.transforms as vt 6 | 7 | 8 | def get_vision_backbone( 9 | vismod="efficientnet_b0", num_classes=1000, pretrained=False, **kwargs 10 | ): 11 | if vismod == "efficientnet_b0": 12 | return efficientnet_b0(pretrained=pretrained, num_classes=num_classes, **kwargs) 13 | elif vismod == "efficientnet_b4": 14 | return efficientnet_b4(pretrained=pretrained, num_classes=num_classes, **kwargs) 15 | 16 | else: 17 | raise NotImplementedError 18 | 19 | 20 | class Grey2Rgb(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.normalize = vt.Normalize( 24 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 25 | ) 26 | 27 | def forward(self, data): 28 | batch_size, freq_bins, times = data.shape 29 | data /= data.max() 30 | data = data.unsqueeze(1).expand(batch_size, 3, freq_bins, times) 31 | data = self.normalize(data) 32 | return data 33 | 34 | 35 | class LogScale(nn.Module): 36 | def forward(self, data): 37 | # eps = 1e-8 38 | eps = torch.tensor(1e-8, device=data.device) 39 | return torch.log(data + eps) 40 | 41 | 42 | class Aggregator(nn.Module): 43 | """Aggregates (in time) a list of features""" 44 | 45 | def __init__(self): 46 | super().__init__() 47 | self.aggregation = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(1)) 48 | 49 | def forward(self, features): 50 | """ 51 | Returns: 52 | outputs_feature: torch.Tensor of shape(B x C x t) 53 | """ 54 | if isinstance(features, list): 55 | output_feature = [self.aggregation(feature) for feature in features] 56 | else: 57 | output_feature = self.aggregation(features) 58 | return output_feature 59 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/README.md: 
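As a usage note for the loss functions defined in `losses.py` above: the VICReg, decoupled-contrastive, and uniformity-alignment objectives can be exercised standalone on dummy embeddings. The sketch below is illustrative only; it assumes the `ssl_singer_identity` package (and its `singer_identity.utils.core` helpers, not shown here) is importable as `singer_identity`, and the weighting factors simply mirror the values that appear later in `vicreg.yaml` and `contrastive.yaml`.

```python
# Hedged usage sketch for the loss functions defined in losses.py above.
# Assumes the ssl_singer_identity package is importable as `singer_identity`
# (as in the file's own imports); shapes and values are illustrative only.
import torch
from singer_identity.losses import vicreg_loss, contrastive_loss, align_loss, uniform_loss

batch, dim = 8, 128
z1 = torch.nn.functional.normalize(torch.randn(batch, dim), dim=1)  # projections of view 1
z2 = torch.nn.functional.normalize(torch.randn(batch, dim), dim=1)  # projections of view 2

# VICReg: invariance + variance hinge + covariance terms, weighted as in vicreg.yaml
loss_vicreg = vicreg_loss(z1, z2, gamma=1, fact_inv_loss=25, fact_var=25, fact_cov=100)

# Decoupled contrastive loss (positive pair z1[i]/z2[i], negatives from a rolled batch),
# matching the contrastive.yaml hyperparameters; also returns a neg/pos similarity ratio
loss_nce, neg_pos_ratio = contrastive_loss(z1, z2, temp=0.2, nr_negative=250, decouple=True)

# Alignment + uniformity objective used by uniformity-alignment.yaml
loss_ua = align_loss(z1, z2, alpha=2) + uniform_loss(z1, t=2)

print(loss_vicreg.item(), loss_nce.item(), neg_pos_ratio, loss_ua.item())
```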
-------------------------------------------------------------------------------- 1 | # Configuration File for Training 2 | 3 | You can use a configuration file to train a model using the `train.py` script. Here we provide a description of how to set up the config file. The common options are described in the [common config](common.yaml) file. 4 | 5 | 6 | ```bash 7 | python train.py --config path/to/common.yaml --config path/to/model_config.yaml 8 | ``` 9 | The model-specific options are described below. In the example above, `model_config.yaml` will overwrite the options in `common.yaml` when options are repeated. For more details, check the [Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html#lightning-cli) docs. 10 | 11 | ## 1. Model-specific options 12 | To use the contrastive, VICReg, or uniformity-alignment losses, simply change the loss arguments in the config file. Below is an example for the contrastive loss: 13 | 14 | ```yaml 15 | use_contrastive_loss: true # use contrastive loss 16 | temp: 0.2 # temperature for contrastive loss 17 | nr_negative: 250 # number of negative samples for contrastive loss 18 | decouple: true # use decoupled contrastive loss instead of the regular NT-Xent loss 19 | use_covariance_reg: false # use covariance regularization 20 | use_variance_reg: false # use variance regularization 21 | use_vicreg_loss: false # use vicreg loss 22 | use_align_loss: false # use alignment loss 23 | use_uniform_loss: false # use uniformity loss 24 | ``` 25 | The individual weights for the losses can be specified as well. BYOL training has its own dedicated trainer class and needs to be specified as shown in `byol.yaml`. 26 | 27 | We provide the following configs for the models used in the paper: 28 | 29 | - `byol.yaml` 30 | - `contrastive.yaml` 31 | - `contrastive-vc.yaml` 32 | - `uniformity-alignment.yaml` 33 | - `vicreg.yaml` 34 | 35 | 36 | ## 2. Data Options 37 | In the config file used to launch training (`common.yaml` in this example), specify the datasets to use as follows: 38 | 39 | ```yaml 40 | data: 41 | class_path: singer_id.data.siamese_encoders.SiameseEncodersDataModule # the default dataloader class 42 | init_args: 43 | dataset_dirs: 44 | - '/Path/to/dataset1/dataset1_name' 45 | - '/Path/to/dataset2/dataset2_name' 46 | batch_size: # batch size for training 47 | batch_size_val: # batch size for validation 48 | nr_samples: # number of samples to use for training (default: 176000, i.e. 4 seconds of audio at 44.1 kHz) 49 | normalize: # normalize the audio when loading 50 | num_workers: # number of workers for the dataloader 51 | batch_sampling_mode: # "sample_clips" or "sample groups". Use "sample_clips" for self-supervised COLA loading 52 | eval_frac: # fraction of the dataset to use for validation 53 | group_name_is_folder: 54 | group_by_artist: 55 | multi_epoch: # number of epochs to repeat the dataset to simulate a larger dataset 56 | ``` 57 | 58 | ## 3. Augmentation Options 59 | 60 | The following augmentations are available. We use [Audiomentations](https://github.com/iver56/audiomentations) and [Parselmouth](https://github.com/YannickJadoul/Parselmouth) to perform the augmentations. All fields specify the probability of applying the corresponding augmentation, except for `pitch_shift_parselmouth` and `pitch_range_parselmouth`, which set the shift and range values; see the sketch below, followed by the full list of fields.
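Before the full YAML block, here is a minimal, library-agnostic sketch of how such a probability-keyed config could be consumed. This is not the project's actual augmentation code (which uses Audiomentations and Parselmouth); the transform helpers are hypothetical placeholders, with their ranges taken from the comments in the YAML below.

```python
# Hypothetical sketch of consuming a probability-keyed augmentation config like the
# YAML block below. The real pipeline uses Audiomentations/Parselmouth; the transform
# functions here are placeholders for illustration only.
import random
import numpy as np

def apply_gain(wav: np.ndarray) -> np.ndarray:            # placeholder transform
    return wav * 10 ** (random.uniform(-6, 0) / 20)       # random gain in [-6, 0] dB

def add_gaussian_noise(wav: np.ndarray) -> np.ndarray:    # placeholder transform
    return wav + np.random.normal(0, random.uniform(0.001, 0.05), size=wav.shape)

TRANSFORMS = {"gain": apply_gain, "gaussian_noise": add_gaussian_noise}

def augment(wav: np.ndarray, cfg: dict) -> np.ndarray:
    """Apply each configured transform with its probability, as described above."""
    if not cfg.get("enable", False):
        return wav
    for name, fn in TRANSFORMS.items():
        prob = cfg.get(name, 0)           # most fields are probabilities in [0, 1]
        if random.random() < prob:
            wav = fn(wav)
    return wav

augmented = augment(np.random.randn(44100).astype(np.float32),
                    {"enable": True, "gain": 0.5, "gaussian_noise": 0.5})
```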
61 | 62 | ```yaml 63 | augmentations: 64 | "enable": true 65 | "gaussian_noise": 0.5 # min_amplitude=0.001, max_amplitude=0.05 66 | "pitch_shift_naive": 0 # naive pitch shift (using librosa), not used in the paper 67 | "time_stretch": 0 # time stretch, not used in the paper 68 | "gain": 0.5 # min_gain_in_db=-6, max_gain_in_db=0 69 | "shift": 0 # not used in the paper 70 | "parametric_eq": 0 # not used in the paper 71 | "tanh_distortion": 0 # not used in the paper 72 | "time_mask": 0.5 # max_band_part=1/8 73 | "formant_shift_parselmouth": 0 # not used in the paper 74 | "pitch_shift_parselmouth": [1, 1.3] # Pitch shift value on parselmouth 75 | "pitch_range_parselmouth": 1.5 # Pitch range value on parselmouth 76 | "pitch_shift_parselmouth_prob": 0.5 77 | ``` 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/byol.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer_byol.BYOL 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Predictor ------------------ 20 | predictor: 21 | dims: 22 | - 128 23 | - 1024 24 | - 128 25 | use_batchnorm: true 26 | normalize_projections: true 27 | weight_callback: 28 | class_path: singer_identity.callbacks.ma_updates.MAWeightUpdate 29 | init_args: 30 | initial_tau: 0.99 31 | max_epochs: 1000 32 | # ------------------ Optimizer ------------------ 33 | optimizer: 34 | class_path: singer_identity.models.byol.Adam 35 | init_args: 36 | lr: 3e-5 37 | weight_decay: 1.5e-6 38 | scheduler: 39 | class_path: singer_identity.models.byol.LinearWarmupCosineAnnealing 40 | init_args: 41 | warmup_epochs: 10 42 | max_epochs: 1000 43 | 44 | trainer: 45 | # ------------------ Logger ------------------ 46 | logger: 47 | class_path: pytorch_lightning.loggers.TensorBoardLogger 48 | init_args: 49 | save_dir: "logs" 50 | name: "byol" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive-vc.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: true 25 | fact_cov: 100 26 | use_variance_reg: true 27 | fact_var: 25 28 | use_invariance_loss: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | # 
------------------ Optimizer ------------------ 33 | optimizer1_init: 34 | class_path: torch.optim.Adam 35 | init_args: 36 | lr: 0.0001 37 | weight_decay: 1e-5 38 | trainer: 39 | # ------------------ Logger ------------------ 40 | logger: 41 | class_path: pytorch_lightning.loggers.TensorBoardLogger 42 | init_args: 43 | save_dir: "logs" 44 | name: "contrastive-vc" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: false 25 | use_variance_reg: false 26 | use_vicreg_loss: false 27 | use_align_loss: false 28 | use_uniform_loss: false 29 | # ------------------ Optimizer ------------------ 30 | optimizer1_init: 31 | class_path: torch.optim.Adam 32 | init_args: 33 | lr: 0.0001 34 | weight_decay: 1e-5 35 | trainer: 36 | # ------------------ Logger ------------------ 37 | logger: 38 | class_path: pytorch_lightning.loggers.TensorBoardLogger 39 | init_args: 40 | save_dir: "logs" 41 | name: "contrastive" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/contrastive_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | 10 | # ------------------ Encoder ------------------ 11 | backbone: 12 | backbone: "efficientnet_b0" 13 | pretrained: true 14 | embedding_dim: 1000 15 | 16 | # ------------------ Projection ------------------ 17 | projection: 18 | input_dim: 1000 19 | output_dim: 128 20 | l2_normalize: true 21 | 22 | # ------------------ Training hyperparameters ------------------ 23 | use_contrastive_loss: true 24 | temp: 0.2 25 | nr_negative: 250 26 | decouple: true 27 | use_covariance_reg: false 28 | use_variance_reg: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | 40 | trainer: 41 | # ------------------ Logger ------------------ 42 | logger: 43 | class_path: pytorch_lightning.loggers.TensorBoardLogger 44 | init_args: 45 | save_dir: "logs" 46 | name: "contrastive" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/uniformity-alignment.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | 
class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | use_covariance_reg: false 22 | use_variance_reg: false 23 | use_invariance_loss: false 24 | use_align_loss: true 25 | fact_align_loss: 1 26 | use_uniform_loss: true 27 | fact_unif_loss: 1 28 | # ------------------ Optimizer ------------------ 29 | optimizer1_init: 30 | class_path: torch.optim.Adam 31 | init_args: 32 | lr: 0.0001 33 | weight_decay: 1e-5 34 | trainer: 35 | # ------------------ Logger ------------------ 36 | logger: 37 | class_path: pytorch_lightning.loggers.TensorBoardLogger 38 | init_args: 39 | save_dir: "logs" 40 | name: "uniformity-alignment" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/train_configs/vicreg.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | # temp: 0.2 22 | # nr_negative: 250 23 | # decouple: true 24 | use_invariance_loss: true 25 | fact_inv_loss: 25 26 | use_covariance_reg: true 27 | fact_cov: 100 28 | use_variance_reg: true 29 | fact_var: 25 30 | gamma: 1 31 | use_align_loss: false 32 | use_uniform_loss: false 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | trainer: 40 | # ------------------ Logger ------------------ 41 | logger: 42 | class_path: pytorch_lightning.loggers.TensorBoardLogger 43 | init_args: 44 | save_dir: "logs" 45 | name: "vicreg" -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/singer_identity/trainer_byol.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | 3 | import torch 4 | import torch.nn as nn 5 | from singer_identity.models.byol import TeacherStudentModel, Optimizer, Scheduler 6 | from singer_identity.model import IdentityEncoder, Projection, SiameseArm, MLP 7 | 8 | import copy 9 | 10 | class BYOL(TeacherStudentModel): 11 | def __init__( 12 | self, 13 | # module: nn.Module, 14 | weight_callback, 15 | optimizer: Optimizer, 16 | backbone: dict = {}, 17 | projection: dict = {}, 18 | predictor: dict = {}, 19 | feature_extractor: dict = {}, 20 | loss_fn: nn.Module = torch.nn.MSELoss(), 21 | scheduler: Optional[Scheduler] = None, 22 | 
normalize_projections: bool = False, 23 | normalize_representations: bool = False, 24 | ): 25 | encoder = IdentityEncoder(feature_extractor=feature_extractor, encoder=backbone) 26 | projection_layer = Projection(**projection) 27 | predictor_layer = MLP(**copy.deepcopy(predictor)) 28 | module = SiameseArm( 29 | encoder=encoder, 30 | projector=projection_layer, 31 | predictor=predictor_layer, 32 | normalize_projections=normalize_projections, 33 | normalize_representations=normalize_representations, 34 | ) 35 | 36 | super(BYOL, self).__init__( 37 | module, loss_fn, weight_callback, optimizer, scheduler=scheduler 38 | ) 39 | self.save_hyperparameters(ignore=["module", "loss_fn"]) 40 | 41 | def shared_step(self, batch, step_name: str): 42 | x1 = batch["clip1"] 43 | x2 = batch["clip2"] 44 | 45 | batch_size = x1.shape[0] 46 | 47 | ys, zs, qs = self.student_network(x1) 48 | with torch.no_grad(): 49 | yt, zt, qt = self.teacher_network(x2) 50 | loss_12 = self.loss_fn(qs, zt) 51 | 52 | ys, zs, qs = self.student_network(x2) 53 | with torch.no_grad(): 54 | yt, zt, qt = self.teacher_network(x1) 55 | loss_21 = self.loss_fn(qs, zt) 56 | 57 | loss = (loss_12 + loss_21) / 2 58 | 59 | self.log( 60 | f"loss/{step_name}", 61 | loss, 62 | prog_bar=True, 63 | batch_size=batch_size, 64 | ) 65 | 66 | self.record_variables(y1=ys, z1=zs, y2=yt, z2=zt) 67 | 68 | return loss 69 | 70 | def training_step(self, batch, batch_idx): 71 | return self.shared_step(batch, "train") 72 | 73 | def validation_step(self, batch, batch_idx): 74 | return self.shared_step(batch, "val") 75 | -------------------------------------------------------------------------------- /models/clova/models/ssl_singer_identity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from pytorch_lightning.cli import LightningCLI 4 | 5 | 6 | class CLI(LightningCLI): 7 | def add_arguments_to_parser(self, parser): 8 | parser.add_argument("--ckpt_path", default=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | cli = CLI( 13 | model_class=pl.LightningModule, 14 | datamodule_class=pl.LightningDataModule, 15 | subclass_mode_model=True, 16 | subclass_mode_data=True, 17 | save_config_kwargs={"overwrite": True}, 18 | run=False, 19 | ) 20 | 21 | ckpt_path = cli.config["ckpt_path"] 22 | 23 | if ckpt_path is not None: 24 | step = torch.load(ckpt_path, map_location="cpu")["global_step"] 25 | cli.trainer.fit_loop.epoch_loop._batches_that_stepped = step 26 | 27 | cli.trainer.fit(cli.model, cli.datamodule, ckpt_path=ckpt_path) 28 | -------------------------------------------------------------------------------- /models/clova/models/weights/RawNet3/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bin.* filter=lfs diff=lfs merge=lfs -text 5 | *.bz2 filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.model filter=lfs diff=lfs merge=lfs -text 12 | *.msgpack filter=lfs diff=lfs merge=lfs -text 13 | *.onnx filter=lfs diff=lfs merge=lfs -text 14 | *.ot filter=lfs diff=lfs merge=lfs -text 15 | *.parquet filter=lfs diff=lfs merge=lfs -text 16 | *.pb filter=lfs diff=lfs merge=lfs 
-text 17 | *.pt filter=lfs diff=lfs merge=lfs -text 18 | *.pth filter=lfs diff=lfs merge=lfs -text 19 | *.rar filter=lfs diff=lfs merge=lfs -text 20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 21 | *.tar.* filter=lfs diff=lfs merge=lfs -text 22 | *.tflite filter=lfs diff=lfs merge=lfs -text 23 | *.tgz filter=lfs diff=lfs merge=lfs -text 24 | *.xz filter=lfs diff=lfs merge=lfs -text 25 | *.zip filter=lfs diff=lfs merge=lfs -text 26 | *.zstandard filter=lfs diff=lfs merge=lfs -text 27 | *tfevents* filter=lfs diff=lfs merge=lfs -text 28 | -------------------------------------------------------------------------------- /models/clova/models/weights/RawNet3/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: mit 3 | --- 4 | 5 | 6 | 7 | 8 | 9 | --- 10 | thumbnail: Refer to https://github.com/jungjee/RawNet for full documentation 11 | 12 | tags: 13 | - Speaker recognition 14 | - Speaker verification 15 | - RawNet 16 | - RawNet3 17 | 18 | license: "mit" 19 | 20 | datasets: 21 | - VoxCeleb1 22 | - VoxCeleb2 23 | 24 | metrics: 25 | - EER 0.89% on Vox1-O 26 | - minDCF 0.0659 on Vox1-O 27 | --- 28 | -------------------------------------------------------------------------------- /models/f0_predictor/CrepeF0Predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from models.f0_predictor.crepe import CrepePitchExtractor 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class CrepeF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"): 9 | self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model) 10 | self.hop_length = hop_length 11 | self.f0_min = f0_min 12 | self.f0_max = f0_max 13 | self.device = device 14 | self.threshold = threshold 15 | self.sampling_rate = sampling_rate 16 | self.name = "crepe" 17 | 18 | def compute_f0(self,wav,p_len=None): 19 | x = torch.FloatTensor(wav).to(self.device) 20 | if p_len is None: 21 | p_len = x.shape[0]//self.hop_length 22 | else: 23 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 24 | f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len) 25 | return f0 26 | 27 | def compute_f0_uv(self,wav,p_len=None): 28 | x = torch.FloatTensor(wav).to(self.device) 29 | if p_len is None: 30 | p_len = x.shape[0]//self.hop_length 31 | else: 32 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 33 | f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len) 34 | return f0,uv -------------------------------------------------------------------------------- /models/f0_predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "dio" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = 
f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | 39 | def resize_f0(self,x, target_len): 40 | source = np.array(x) 41 | source[source<0.001] = np.nan 42 | target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) 43 | res = np.nan_to_num(target) 44 | return res 45 | 46 | def compute_f0(self,wav,p_len=None): 47 | if p_len is None: 48 | p_len = wav.shape[0]//self.hop_length 49 | f0, t = pyworld.dio( 50 | wav.astype(np.double), 51 | fs=self.sampling_rate, 52 | f0_floor=self.f0_min, 53 | f0_ceil=self.f0_max, 54 | frame_period=1000 * self.hop_length / self.sampling_rate, 55 | ) 56 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 57 | for index, pitch in enumerate(f0): 58 | f0[index] = round(pitch, 1) 59 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 60 | 61 | def compute_f0_uv(self,wav,p_len=None): 62 | if p_len is None: 63 | p_len = wav.shape[0]//self.hop_length 64 | f0, t = pyworld.dio( 65 | wav.astype(np.double), 66 | fs=self.sampling_rate, 67 | f0_floor=self.f0_min, 68 | f0_ceil=self.f0_max, 69 | frame_period=1000 * self.hop_length / self.sampling_rate, 70 | ) 71 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 72 | for index, pitch in enumerate(f0): 73 | f0[index] = round(pitch, 1) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 75 | -------------------------------------------------------------------------------- /models/f0_predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self,wav,p_len): 3 | ''' 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | ''' 8 | pass 9 | 10 | def compute_f0_uv(self,wav,p_len): 11 | ''' 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | ''' 16 | pass -------------------------------------------------------------------------------- /models/f0_predictor/FCPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from models.f0_predictor.F0Predictor import F0Predictor 8 | 9 | from .fcpe.model import FCPEInfer 10 | 11 | 12 | class FCPEF0Predictor(F0Predictor): 13 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100, 14 | threshold=0.05): 15 | self.fcpe = FCPEInfer(model_path="pretrain/fcpe.pt", device=device, dtype=dtype) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "fcpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, 
mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # 去掉0频率, 并线性插值 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] 74 | 75 | if f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[ 79 | 0]).cpu().numpy(), vuv_vector.cpu().numpy() 80 | 81 | # 大概可以用 torch 重写? 82 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 83 | # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 84 | 85 | return f0, vuv_vector.cpu().numpy() 86 | 87 | def compute_f0(self, wav, p_len=None): 88 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 89 | if p_len is None: 90 | p_len = x.shape[0] // self.hop_length 91 | else: 92 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 93 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 94 | if torch.all(f0 == 0): 95 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 96 | return rtn, rtn 97 | return self.post_process(x, self.sampling_rate, f0, p_len)[0] 98 | 99 | def compute_f0_uv(self, wav, p_len=None): 100 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 101 | if p_len is None: 102 | p_len = x.shape[0] // self.hop_length 103 | else: 104 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 105 | f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] 106 | if torch.all(f0 == 0): 107 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 108 | return rtn, rtn 109 | return self.post_process(x, self.sampling_rate, f0, p_len) -------------------------------------------------------------------------------- /models/f0_predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "harvest" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = 
np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | def resize_f0(self,x, target_len): 39 | source = np.array(x) 40 | source[source<0.001] = np.nan 41 | target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) 42 | res = np.nan_to_num(target) 43 | return res 44 | 45 | def compute_f0(self,wav,p_len=None): 46 | if p_len is None: 47 | p_len = wav.shape[0]//self.hop_length 48 | f0, t = pyworld.harvest( 49 | wav.astype(np.double), 50 | fs=self.hop_length, 51 | f0_ceil=self.f0_max, 52 | f0_floor=self.f0_min, 53 | frame_period=1000 * self.hop_length / self.sampling_rate, 54 | ) 55 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) 56 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 57 | 58 | def compute_f0_uv(self,wav,p_len=None): 59 | if p_len is None: 60 | p_len = wav.shape[0]//self.hop_length 61 | f0, t = pyworld.harvest( 62 | wav.astype(np.double), 63 | fs=self.sampling_rate, 64 | f0_floor=self.f0_min, 65 | f0_ceil=self.f0_max, 66 | frame_period=1000 * self.hop_length / self.sampling_rate, 67 | ) 68 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 69 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 70 | -------------------------------------------------------------------------------- /models/f0_predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from models.f0_predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | self.name = "pm" 14 | 15 | def interpolate_f0(self,f0): 16 | ''' 17 | 对F0进行插值处理 18 | ''' 19 | vuv_vector = np.zeros_like(f0, dtype=np.float32) 20 | vuv_vector[f0 > 0.0] = 1.0 21 | vuv_vector[f0 <= 0.0] = 0.0 22 | 23 | nzindex = np.nonzero(f0)[0] 24 | data = f0[nzindex] 25 | nzindex = nzindex.astype(np.float32) 26 | time_org = self.hop_length / self.sampling_rate * nzindex 27 | time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate 28 | 29 | if data.shape[0] <= 0: 30 | return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector 31 | 32 | if data.shape[0] == 1: 33 | return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector 34 | 35 | f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1]) 36 | 37 | return f0,vuv_vector 38 | 39 | 40 | def compute_f0(self,wav,p_len=None): 41 | x = wav 42 | if p_len is None: 43 | p_len = x.shape[0]//self.hop_length 44 | else: 45 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 46 | time_step = self.hop_length / self.sampling_rate * 1000 47 | f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac( 48 | time_step=time_step / 
1000, voicing_threshold=0.6, 49 | pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency'] 50 | 51 | pad_size=(p_len - len(f0) + 1) // 2 52 | if(pad_size>0 or p_len - len(f0) - pad_size>0): 53 | f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') 54 | f0,uv = self.interpolate_f0(f0) 55 | return f0 56 | 57 | def compute_f0_uv(self,wav,p_len=None): 58 | x = wav 59 | if p_len is None: 60 | p_len = x.shape[0]//self.hop_length 61 | else: 62 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 63 | time_step = self.hop_length / self.sampling_rate * 1000 64 | f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac( 65 | time_step=time_step / 1000, voicing_threshold=0.6, 66 | pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency'] 67 | 68 | pad_size=(p_len - len(f0) + 1) // 2 69 | if(pad_size>0 or p_len - len(f0) - pad_size>0): 70 | f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') 71 | f0,uv = self.interpolate_f0(f0) 72 | return f0,uv 73 | -------------------------------------------------------------------------------- /models/f0_predictor/RMVPEF0Predictor.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import os 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from models.f0_predictor.F0Predictor import F0Predictor 8 | 9 | from .rmvpe import RMVPE 10 | 11 | 12 | class RMVPEF0Predictor(F0Predictor): 13 | def __init__(self,hop_length=512,f0_min=50,f0_max=1100, dtype=torch.float32, device=None,sampling_rate=44100,threshold=0.05): 14 | ckpt_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ckpt/rmvpe.pt") 15 | self.rmvpe = RMVPE(model_path=ckpt_filepath, dtype=dtype, device=device) 16 | self.hop_length = hop_length 17 | self.f0_min = f0_min 18 | self.f0_max = f0_max 19 | if device is None: 20 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | else: 22 | self.device = device 23 | self.threshold = threshold 24 | self.sampling_rate = sampling_rate 25 | self.dtype = dtype 26 | self.name = "rmvpe" 27 | 28 | def repeat_expand( 29 | self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest" 30 | ): 31 | ndim = content.ndim 32 | 33 | if content.ndim == 1: 34 | content = content[None, None] 35 | elif content.ndim == 2: 36 | content = content[None] 37 | 38 | assert content.ndim == 3 39 | 40 | is_np = isinstance(content, np.ndarray) 41 | if is_np: 42 | content = torch.from_numpy(content) 43 | 44 | results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) 45 | 46 | if is_np: 47 | results = results.numpy() 48 | 49 | if ndim == 1: 50 | return results[0, 0] 51 | elif ndim == 2: 52 | return results[0] 53 | 54 | def post_process(self, x, sampling_rate, f0, pad_to): 55 | if isinstance(f0, np.ndarray): 56 | f0 = torch.from_numpy(f0).float().to(x.device) 57 | 58 | if pad_to is None: 59 | return f0 60 | 61 | f0 = self.repeat_expand(f0, pad_to) 62 | 63 | vuv_vector = torch.zeros_like(f0) 64 | vuv_vector[f0 > 0.0] = 1.0 65 | vuv_vector[f0 <= 0.0] = 0.0 66 | 67 | # 去掉0频率, 并线性插值 68 | nzindex = torch.nonzero(f0).squeeze() 69 | f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() 70 | time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() 71 | time_frame = np.arange(pad_to) * self.hop_length / sampling_rate 72 | 73 | vuv_vector = F.interpolate(vuv_vector[None,None,:],size=pad_to)[0][0] 74 | 75 | if 
f0.shape[0] <= 0: 76 | return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),vuv_vector.cpu().numpy() 77 | if f0.shape[0] == 1: 78 | return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]).cpu().numpy() ,vuv_vector.cpu().numpy() 79 | 80 | # 大概可以用 torch 重写? 81 | f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) 82 | #vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) 83 | 84 | return f0,vuv_vector.cpu().numpy() 85 | 86 | def compute_f0(self,wav,p_len=None): 87 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 88 | if p_len is None: 89 | p_len = x.shape[0]//self.hop_length 90 | else: 91 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 92 | f0 = self.rmvpe.infer_from_audio(x,self.sampling_rate,self.threshold) 93 | if torch.all(f0 == 0): 94 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 95 | return rtn,rtn 96 | return self.post_process(x,self.sampling_rate,f0,p_len)[0] 97 | 98 | def compute_f0_uv(self,wav,p_len=None): 99 | x = torch.FloatTensor(wav).to(self.dtype).to(self.device) 100 | if p_len is None: 101 | p_len = x.shape[0]//self.hop_length 102 | else: 103 | assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error" 104 | f0 = self.rmvpe.infer_from_audio(x,self.sampling_rate,self.threshold) 105 | if torch.all(f0 == 0): 106 | rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) 107 | return rtn,rtn 108 | return self.post_process(x,self.sampling_rate,f0,p_len) 109 | -------------------------------------------------------------------------------- /models/f0_predictor/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 3 | if f0_predictor == "pm": 4 | from models.f0_predictor.PMF0Predictor import PMF0Predictor 5 | f0_predictor_object = PMF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 6 | 7 | elif f0_predictor == "crepe": 8 | from models.f0_predictor.CrepeF0Predictor import CrepeF0Predictor 9 | f0_predictor_object = CrepeF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, device=kargs["device"],threshold=kargs["threshold"]) 10 | 11 | elif f0_predictor == "harvest": 12 | from models.f0_predictor.HarvestF0Predictor import HarvestF0Predictor 13 | f0_predictor_object = HarvestF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 14 | 15 | elif f0_predictor == "dio": 16 | from models.f0_predictor.DioF0Predictor import DioF0Predictor 17 | f0_predictor_object = DioF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate) 18 | 19 | elif f0_predictor == "rmvpe": 20 | from models.f0_predictor.RMVPEF0Predictor import RMVPEF0Predictor 21 | f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"]) 22 | 23 | elif f0_predictor == "fcpe": 24 | from models.f0_predictor.FCPEF0Predictor import FCPEF0Predictor 25 | f0_predictor_object = FCPEF0Predictor(hop_length=hop_length, sampling_rate=sampling_rate, dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"]) 26 | 27 | else: 28 | raise Exception("Unknown f0 predictor") 29 | return f0_predictor_object 30 | -------------------------------------------------------------------------------- /models/f0_predictor/fcpe/__init__.py: -------------------------------------------------------------------------------- 1 | 
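The `get_f0_predictor` factory above dispatches on a predictor name and forwards the shared `hop_length`/`sampling_rate` arguments. A hedged usage sketch follows; it assumes the repo root is on `PYTHONPATH` and the relevant dependencies (e.g. `pyworld` for "dio") are installed. As the factory shows, "crepe", "rmvpe", and "fcpe" additionally require `device`/`threshold` kwargs plus their pretrained checkpoints.

```python
# Hedged usage sketch for the get_f0_predictor factory above.
# Assumes the repo root is on PYTHONPATH and pyworld is installed ("dio" backend).
import numpy as np
from models.f0_predictor import get_f0_predictor

predictor = get_f0_predictor("dio", hop_length=512, sampling_rate=44100)

wav = np.random.randn(44100).astype(np.float64)   # 1 s of (dummy) audio
f0 = predictor.compute_f0(wav)                    # shape: [len(wav) // hop_length]
f0_interp, uv = predictor.compute_f0_uv(wav)      # interpolated f0 plus voiced/unvoiced mask
```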
from .model import FCPEInfer # noqa: F401 2 | from .nvSTFT import STFT # noqa: F401 3 | from .pcmer import PCmer # noqa: F401 4 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * # noqa: F403 2 | from .inference import RMVPE # noqa: F401 3 | from .model import E2E, E2E0 # noqa: F401 4 | from .spec import MelSpectrogram # noqa: F401 5 | from .utils import ( # noqa: F401 6 | cycle, 7 | summary, 8 | to_local_average_cents, 9 | to_viterbi_cents, 10 | ) 11 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/constants.py: -------------------------------------------------------------------------------- 1 | SAMPLE_RATE = 16000 2 | 3 | N_CLASS = 360 4 | 5 | N_MELS = 128 6 | MEL_FMIN = 30 7 | MEL_FMAX = SAMPLE_RATE // 2 8 | WINDOW_LENGTH = 1024 9 | CONST = 1997.3794084376191 10 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torchaudio.transforms import Resample 4 | 5 | from .constants import * # noqa: F403 6 | from .model import E2E0 7 | from .spec import MelSpectrogram 8 | from .utils import to_local_average_cents, to_viterbi_cents 9 | 10 | 11 | class RMVPE: 12 | def __init__(self, model_path, device=None, dtype = torch.float32, hop_length=160): 13 | self.resample_kernel = {} 14 | if device is None: 15 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 16 | else: 17 | self.device = device 18 | model = E2E0(4, 1, (2, 2)) 19 | ckpt = torch.load(model_path, map_location=torch.device(self.device)) 20 | model.load_state_dict(ckpt['model']) 21 | model = model.to(dtype).to(self.device) 22 | model.eval() 23 | self.model = model 24 | self.dtype = dtype 25 | self.mel_extractor = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX) # noqa: F405 26 | self.resample_kernel = {} 27 | 28 | def mel2hidden(self, mel): 29 | with torch.no_grad(): 30 | n_frames = mel.shape[-1] 31 | mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant') 32 | hidden = self.model(mel) 33 | return hidden[:, :n_frames] 34 | 35 | def decode(self, hidden, thred=0.03, use_viterbi=False): 36 | if use_viterbi: 37 | cents_pred = to_viterbi_cents(hidden, thred=thred) 38 | else: 39 | cents_pred = to_local_average_cents(hidden, thred=thred) 40 | f0 = torch.Tensor([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]).to(self.device) 41 | return f0 42 | 43 | def infer_from_audio(self, audio, sample_rate=16000, thred=0.05, use_viterbi=False): 44 | audio = audio.unsqueeze(0).to(self.dtype).to(self.device) 45 | if sample_rate == 16000: 46 | audio_res = audio 47 | else: 48 | key_str = str(sample_rate) 49 | if key_str not in self.resample_kernel: 50 | self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128) 51 | self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device) 52 | audio_res = self.resample_kernel[key_str](audio) 53 | mel_extractor = self.mel_extractor.to(self.device) 54 | mel = mel_extractor(audio_res, center=True).to(self.dtype) 55 | hidden = self.mel2hidden(mel) 56 | f0 = self.decode(hidden.squeeze(0), thred=thred, 
use_viterbi=use_viterbi) 57 | return f0 58 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .constants import * # noqa: F403 4 | from .deepunet import DeepUnet, DeepUnet0 5 | from .seq import BiGRU 6 | from .spec import MelSpectrogram 7 | 8 | 9 | class E2E(nn.Module): 10 | def __init__(self, hop_length, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, 11 | en_out_channels=16): 12 | super(E2E, self).__init__() 13 | self.mel = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX) # noqa: F405 14 | self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 15 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 16 | if n_gru: 17 | self.fc = nn.Sequential( 18 | BiGRU(3 * N_MELS, 256, n_gru), # noqa: F405 19 | nn.Linear(512, N_CLASS), # noqa: F405 20 | nn.Dropout(0.25), 21 | nn.Sigmoid() 22 | ) 23 | else: 24 | self.fc = nn.Sequential( 25 | nn.Linear(3 * N_MELS, N_CLASS), # noqa: F405 26 | nn.Dropout(0.25), 27 | nn.Sigmoid() 28 | ) 29 | 30 | def forward(self, x): 31 | mel = self.mel(x.reshape(-1, x.shape[-1])).transpose(-1, -2).unsqueeze(1) 32 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 33 | # x = self.fc(x) 34 | hidden_vec = 0 35 | if len(self.fc) == 4: 36 | for i in range(len(self.fc)): 37 | x = self.fc[i](x) 38 | if i == 0: 39 | hidden_vec = x 40 | return hidden_vec, x 41 | 42 | 43 | class E2E0(nn.Module): 44 | def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, 45 | en_out_channels=16): 46 | super(E2E0, self).__init__() 47 | self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 48 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 49 | if n_gru: 50 | self.fc = nn.Sequential( 51 | BiGRU(3 * N_MELS, 256, n_gru), # noqa: F405 52 | nn.Linear(512, N_CLASS), # noqa: F405 53 | nn.Dropout(0.25), 54 | nn.Sigmoid() 55 | ) 56 | else: 57 | self.fc = nn.Sequential( 58 | nn.Linear(3 * N_MELS, N_CLASS), # noqa: F405 59 | nn.Dropout(0.25), 60 | nn.Sigmoid() 61 | ) 62 | 63 | def forward(self, mel): 64 | mel = mel.transpose(-1, -2).unsqueeze(1) 65 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 66 | x = self.fc(x) 67 | return x 68 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/seq.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class BiGRU(nn.Module): 5 | def __init__(self, input_features, hidden_features, num_layers): 6 | super(BiGRU, self).__init__() 7 | self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 8 | 9 | def forward(self, x): 10 | return self.gru(x)[0] 11 | 12 | 13 | class BiLSTM(nn.Module): 14 | def __init__(self, input_features, hidden_features, num_layers): 15 | super(BiLSTM, self).__init__() 16 | self.lstm = nn.LSTM(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 17 | 18 | def forward(self, x): 19 | return self.lstm(x)[0] 20 | 21 | -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/spec.py: 
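As a note on the recurrent wrappers in `seq.py` above: `E2E0` feeds `BiGRU(3 * N_MELS, 256, n_gru)` into `nn.Linear(512, N_CLASS)`, so the bidirectional output width (2 directions times 256 hidden units) must be 512. The shape check below is a minimal sketch, assuming the repo root is on `PYTHONPATH` so `models.f0_predictor.rmvpe.seq` is importable.

```python
# Hedged shape check for the BiGRU wrapper defined in seq.py above.
# Sizes mirror E2E0: 3 * N_MELS = 384 input features, hidden size 256,
# so the bidirectional output is 512, matching nn.Linear(512, N_CLASS).
import torch
from models.f0_predictor.rmvpe.seq import BiGRU

batch, frames, feats = 2, 100, 3 * 128    # (B, T, 3 * N_MELS)
gru = BiGRU(input_features=feats, hidden_features=256, num_layers=1)

x = torch.randn(batch, frames, feats)
y = gru(x)                                 # returns only the GRU output sequence
assert y.shape == (batch, frames, 512)     # 2 directions * 256 hidden units
```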
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from librosa.filters import mel 5 | 6 | 7 | class MelSpectrogram(torch.nn.Module): 8 | def __init__( 9 | self, 10 | n_mel_channels, 11 | sampling_rate, 12 | win_length, 13 | hop_length, 14 | n_fft=None, 15 | mel_fmin=0, 16 | mel_fmax=None, 17 | clamp = 1e-5 18 | ): 19 | super().__init__() 20 | n_fft = win_length if n_fft is None else n_fft 21 | self.hann_window = {} 22 | mel_basis = mel( 23 | sr=sampling_rate, 24 | n_fft=n_fft, 25 | n_mels=n_mel_channels, 26 | fmin=mel_fmin, 27 | fmax=mel_fmax, 28 | htk=True) 29 | mel_basis = torch.from_numpy(mel_basis).float() 30 | self.register_buffer("mel_basis", mel_basis) 31 | self.n_fft = win_length if n_fft is None else n_fft 32 | self.hop_length = hop_length 33 | self.win_length = win_length 34 | self.sampling_rate = sampling_rate 35 | self.n_mel_channels = n_mel_channels 36 | self.clamp = clamp 37 | 38 | def forward(self, audio, keyshift=0, speed=1, center=True): 39 | factor = 2 ** (keyshift / 12) 40 | n_fft_new = int(np.round(self.n_fft * factor)) 41 | win_length_new = int(np.round(self.win_length * factor)) 42 | hop_length_new = int(np.round(self.hop_length * speed)) 43 | 44 | keyshift_key = str(keyshift)+'_'+str(audio.device) 45 | if keyshift_key not in self.hann_window: 46 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 47 | 48 | fft = torch.stft( 49 | audio, 50 | n_fft=n_fft_new, 51 | hop_length=hop_length_new, 52 | win_length=win_length_new, 53 | window=self.hann_window[keyshift_key], 54 | center=center, 55 | return_complex=True) 56 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 57 | 58 | if keyshift != 0: 59 | size = self.n_fft // 2 + 1 60 | resize = magnitude.size(1) 61 | if resize < size: 62 | magnitude = F.pad(magnitude, (0, 0, 0, size-resize)) 63 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 64 | 65 | mel_output = torch.matmul(self.mel_basis, magnitude) 66 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 67 | return log_mel_spec -------------------------------------------------------------------------------- /models/f0_predictor/rmvpe/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import reduce 3 | 4 | import librosa 5 | import numpy as np 6 | import torch 7 | from torch.nn.modules.module import _addindent 8 | 9 | from .constants import * # noqa: F403 10 | 11 | 12 | def cycle(iterable): 13 | while True: 14 | for item in iterable: 15 | yield item 16 | 17 | 18 | def summary(model, file=sys.stdout): 19 | def repr(model): 20 | # We treat the extra repr like the sub-module, one item per line 21 | extra_lines = [] 22 | extra_repr = model.extra_repr() 23 | # empty string will be split into list [''] 24 | if extra_repr: 25 | extra_lines = extra_repr.split('\n') 26 | child_lines = [] 27 | total_params = 0 28 | for key, module in model._modules.items(): 29 | mod_str, num_params = repr(module) 30 | mod_str = _addindent(mod_str, 2) 31 | child_lines.append('(' + key + '): ' + mod_str) 32 | total_params += num_params 33 | lines = extra_lines + child_lines 34 | 35 | for name, p in model._parameters.items(): 36 | if hasattr(p, 'shape'): 37 | total_params += reduce(lambda x, y: x * y, p.shape) 38 | 39 | main_str = model._get_name() + '(' 40 | if lines: 41 | # simple one-liner info, which most builtin Modules will use 42 | if 
len(extra_lines) == 1 and not child_lines: 43 | main_str += extra_lines[0] 44 | else: 45 | main_str += '\n ' + '\n '.join(lines) + '\n' 46 | 47 | main_str += ')' 48 | if file is sys.stdout: 49 | main_str += ', \033[92m{:,}\033[0m params'.format(total_params) 50 | else: 51 | main_str += ', {:,} params'.format(total_params) 52 | return main_str, total_params 53 | 54 | string, count = repr(model) 55 | if file is not None: 56 | if isinstance(file, str): 57 | file = open(file, 'w') 58 | print(string, file=file) 59 | file.flush() 60 | 61 | return count 62 | 63 | 64 | def to_local_average_cents(salience, center=None, thred=0.05): 65 | """ 66 | find the weighted average cents near the argmax bin 67 | """ 68 | 69 | if not hasattr(to_local_average_cents, 'cents_mapping'): 70 | # the bin number-to-cents mapping 71 | to_local_average_cents.cents_mapping = ( 72 | 20 * torch.arange(N_CLASS) + CONST).to(salience.device) # noqa: F405 73 | 74 | if salience.ndim == 1: 75 | if center is None: 76 | center = int(torch.argmax(salience)) 77 | start = max(0, center - 4) 78 | end = min(len(salience), center + 5) 79 | salience = salience[start:end] 80 | product_sum = torch.sum( 81 | salience * to_local_average_cents.cents_mapping[start:end]) 82 | weight_sum = torch.sum(salience) 83 | return product_sum / weight_sum if torch.max(salience) > thred else 0 84 | if salience.ndim == 2: 85 | return torch.Tensor([to_local_average_cents(salience[i, :], None, thred) for i in 86 | range(salience.shape[0])]).to(salience.device) 87 | 88 | raise Exception("label should be either 1d or 2d ndarray") 89 | 90 | def to_viterbi_cents(salience, thred=0.05): 91 | # Create viterbi transition matrix 92 | if not hasattr(to_viterbi_cents, 'transition'): 93 | xx, yy = torch.meshgrid(range(N_CLASS), range(N_CLASS)) # noqa: F405 94 | transition = torch.maximum(30 - abs(xx - yy), 0) 95 | transition = transition / transition.sum(axis=1, keepdims=True) 96 | to_viterbi_cents.transition = transition 97 | 98 | # Convert to probability 99 | prob = salience.T 100 | prob = prob / prob.sum(axis=0) 101 | 102 | # Perform viterbi decoding 103 | path = librosa.sequence.viterbi(prob.detach().cpu().numpy(), to_viterbi_cents.transition).astype(np.int64) 104 | 105 | return torch.Tensor([to_local_average_cents(salience[i, :], path[i], thred) for i in 106 | range(len(path))]).to(salience.device) 107 | -------------------------------------------------------------------------------- /models/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self -------------------------------------------------------------------------------- /models/hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | "resblock_initial_channel": 256, 17 | 18 | "segment_size": 8192, 19 | "num_mels": 80, 20 | "num_freq": 1025, 21 | "n_fft": 1024, 22 | "hop_size": 256, 23 | "win_size": 1024, 24 | 25 | "sampling_rate": 22050, 26 
| 27 | "fmin": 0, 28 | "fmax": 8000, 29 | "fmax_loss": null, 30 | 31 | "num_workers": 4, 32 | 33 | "dist_config": { 34 | "dist_backend": "nccl", 35 | "dist_url": "tcp://localhost:54321", 36 | "world_size": 1 37 | } 38 | } -------------------------------------------------------------------------------- /models/hifigan/generator_v1.txt: -------------------------------------------------------------------------------- 1 | https://github.com/jik876/hifi-gan -------------------------------------------------------------------------------- /models/speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /models/speaker_encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from models.speaker_encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import numpy as np 6 | import webrtcvad 7 | import librosa 8 | import struct 9 | 10 | int16_max = (2 ** 15) - 1 11 | 12 | 13 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 14 | source_sr: Optional[int] = None): 15 | """ 16 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 17 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 18 | 19 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 20 | just .wav), either the waveform as a numpy array of floats. 21 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 22 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 23 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 24 | this argument will be ignored. 25 | """ 26 | # Load the wav from disk if needed 27 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 28 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 29 | else: 30 | wav = fpath_or_wav 31 | 32 | # Resample the wav if needed 33 | if source_sr is not None and source_sr != sampling_rate: 34 | wav = librosa.resample(wav, source_sr, sampling_rate) 35 | 36 | # Apply the preprocessing: normalize volume and shorten long silences 37 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 38 | wav = trim_long_silences(wav) 39 | 40 | return wav 41 | 42 | 43 | def wav_to_mel_spectrogram(wav): 44 | """ 45 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 46 | Note: this not a log-mel spectrogram. 47 | """ 48 | frames = librosa.feature.melspectrogram( 49 | y=wav, 50 | sr=sampling_rate, 51 | n_fft=int(sampling_rate * mel_window_length / 1000), 52 | hop_length=int(sampling_rate * mel_window_step / 1000), 53 | n_mels=mel_n_channels 54 | ) 55 | return frames.astype(np.float32).T 56 | 57 | 58 | def trim_long_silences(wav): 59 | """ 60 | Ensures that segments without voice in the waveform remain no longer than a 61 | threshold determined by the VAD parameters in params.py. 
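    For the default VAD hyperparameters (vad_window_length = 30 ms, sampling_rate = 16000 Hz),
    each VAD window therefore covers (30 * 16000) // 1000 = 480 samples, i.e. 960 bytes of
    16-bit PCM, which is exactly the slice handed to webrtcvad below.

    A minimal usage sketch (illustrative only; the file name is hypothetical and the waveform
    is assumed to be mono and already at `sampling_rate`):

        wav, _ = librosa.load("utterance.wav", sr=sampling_rate)
        trimmed = trim_long_silences(wav)
        assert len(trimmed) <= len(wav)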
62 | 63 | :param wav: the raw waveform as a numpy array of floats 64 | :return: the same waveform with silences trimmed away (length <= original wav length) 65 | """ 66 | # Compute the voice detection window size 67 | samples_per_window = (vad_window_length * sampling_rate) // 1000 68 | 69 | # Trim the end of the audio to have a multiple of the window size 70 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 71 | 72 | # Convert the float waveform to 16-bit mono PCM 73 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 74 | 75 | # Perform voice activity detection 76 | voice_flags = [] 77 | vad = webrtcvad.Vad(mode=3) 78 | for window_start in range(0, len(wav), samples_per_window): 79 | window_end = window_start + samples_per_window 80 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 81 | sample_rate=sampling_rate)) 82 | voice_flags = np.array(voice_flags) 83 | 84 | # Smooth the voice detection with a moving average 85 | def moving_average(array, width): 86 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 87 | ret = np.cumsum(array_padded, dtype=float) 88 | ret[width:] = ret[width:] - ret[:-width] 89 | return ret[width - 1:] / width 90 | 91 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 92 | audio_mask = np.round(audio_mask).astype(bool) 93 | 94 | # Dilate the voiced regions 95 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 96 | audio_mask = np.repeat(audio_mask, samples_per_window) 97 | 98 | return wav[audio_mask == True] 99 | 100 | 101 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 102 | if increase_only and decrease_only: 103 | raise ValueError("Both increase only and decrease only are set") 104 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 105 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 106 | return wav 107 | return wav * (10 ** (dBFS_change / 20)) 108 | -------------------------------------------------------------------------------- /models/speaker_encoder/compute_embed.py: -------------------------------------------------------------------------------- 1 | from models.speaker_encoder import inference as encoder 2 | from multiprocessing.pool import Pool 3 | from functools import partial 4 | from pathlib import Path 5 | # from utils import logmmse 6 | from tqdm import tqdm 7 | import numpy as np 8 | # import librosa 9 | 10 | 11 | def embed_utterance(fpaths, encoder_model_fpath): 12 | if not encoder.is_loaded(): 13 | encoder.load_model(encoder_model_fpath) 14 | 15 | # Compute the speaker embedding of the utterance 16 | wav_fpath, embed_fpath = fpaths 17 | wav = np.load(wav_fpath) 18 | wav = encoder.preprocess_wav(wav) 19 | embed = encoder.embed_utterance(wav) 20 | np.save(embed_fpath, embed, allow_pickle=False) 21 | 22 | 23 | def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): 24 | 25 | wav_dir = outdir_root.joinpath("audio") 26 | metadata_fpath = outdir_root.joinpath("train.txt") 27 | assert wav_dir.exists() and metadata_fpath.exists() 28 | embed_dir = outdir_root.joinpath("embeds") 29 | embed_dir.mkdir(exist_ok=True) 30 | 31 | # Gather the input wave filepath and the target output embed filepath 32 | with metadata_fpath.open("r") as metadata_file: 33 | metadata = [line.split("|") for line in metadata_file] 34 | fpaths = [(wav_dir.joinpath(m[0]), 
embed_dir.joinpath(m[2])) for m in metadata] 35 | 36 | # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. 37 | # Embed the utterances in separate threads 38 | func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) 39 | job = Pool(n_processes).imap(func, fpaths) 40 | list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) -------------------------------------------------------------------------------- /models/speaker_encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | "test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | 40 | other_datasets = [ 41 | "LJSpeech-1.1", 42 | "VCTK-Corpus/wav48", 43 | ] 44 | 45 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 46 | -------------------------------------------------------------------------------- /models/speaker_encoder/hparams.py: -------------------------------------------------------------------------------- 1 | ## Mel-filterbank 2 | mel_window_length = 25 # In milliseconds 3 | mel_window_step = 10 # In milliseconds 4 | mel_n_channels = 40 5 | 6 | 7 | ## Audio 8 | sampling_rate = 16000 9 | # Number of spectrogram frames in a partial utterance 10 | partials_n_frames = 160 # 1600 ms 11 | 12 | 13 | ## Voice Activation Detection 14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 15 | # This sets the granularity of the VAD. Should not need to be changed. 16 | vad_window_length = 30 # In milliseconds 17 | # Number of frames to average together when performing the moving average smoothing. 18 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 19 | vad_moving_average_width = 8 20 | # Maximum number of consecutive silent frames a segment can have. 
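# (Measured in VAD windows: with the 30 ms window above, the default of 6 windows
#  corresponds to roughly 6 * 30 ms = 180 ms of silence tolerated inside a segment.)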
21 | vad_max_silence_length = 6 22 | 23 | 24 | ## Audio volume normalization 25 | audio_norm_target_dBFS = -30 26 | 27 | 28 | ## Model parameters 29 | model_hidden_size = 256 30 | model_embedding_size = 256 31 | model_num_layers = 3 -------------------------------------------------------------------------------- /models/speaker_encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /models/speaker_encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /models/spin/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.model import SpinModel # noqa 2 | from .src.data.dataset import collate_fn as spin_collate_fn # noqa -------------------------------------------------------------------------------- /models/spin/spin.yaml: -------------------------------------------------------------------------------- 1 | # Interspeech 2023 version 2 | 3 | # Training data 4 | data: 5 | json_dir: /data/sls/r/u/hengjui/home/scratch/spin_test/data 6 | splits: 7 | - train-clean-100 8 | sample_rate: 16000 9 | min_audio_len: 40000 # minimum audio samples per utterance 10 | random_crop_len: 272000 # maximum audio samples per utterance 11 | spk2info: /root/RVC_Spin/spin_train/spk_to_f0.csv 12 | out_of_len_audios: /root/RVC_Spin/spin_train/out_of_len_audios.txt 13 | 14 | # Validation data (not used for checkpointing, just for monitoring training progress) 15 | val_data: 16 | json_dir: /data/sls/r/u/hengjui/home/scratch/spin_test/data 17 | phn_dir: /root/RVC_Spin/spin_train/phone_alignment_info 18 | data_dir: /libri_tts/LibriTTS/dev-clean 19 | out_of_len_audios: /root/RVC_Spin/spin_train/out_of_len_audios_val.txt 20 | splits: 21 | - libri-dev-clean 22 | - libri-dev-other 23 | sample_rate: 16000 24 | 25 | # SpinModel config 26 | model: 27 | encoder: 28 | type: HuBERT # `HuBERT` / `WavLM` 29 | use_layer: 12 # the layer which its representations are used for clustering 30 | normalize: False 31 | feat_select: x 32 | randomize_all: False 33 | randomize_layers: [] 34 | 
freeze_all: False 35 | freeze_layers: ["pos", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # `pos`: positional encoding, `0`: CNN extractor 36 | pred_head: 37 | type: DNN 38 | hid_dims: [256] 39 | dropout: 0 40 | activation: ReLU 41 | loss: 42 | type: SwavVQDisentangle 43 | num_vars: 2048 # cluster size 44 | epsilon: 0.02 45 | sinkhorn_iters: 3 46 | temp: 0.1 47 | l2_norm: True 48 | prob_ratio: 1.0 49 | 50 | # Optimization 51 | optim: 52 | optimizer: 53 | name: Adam 54 | args: 55 | lr: 1.e-4 56 | weight_decay: 1.e-6 57 | scheduler: 58 | name: linear_warmup_decay # `linear_warmup_decay` / `linear_warmup_cosine_scheduler` / `noam_scheduler` 59 | args: 60 | warmup: 2500 61 | max_step: 63052 62 | final_lr: 1.e-6 63 | 64 | hparam: 65 | #batch_len: 4096000 # audio samples per GPU (256 secs ~ batch_size = 12.8k) 4096000 66 | batch_size: 32 67 | val_batch_size: 8 68 | 69 | # pytorch_lightning.Trainer 70 | # ref: https://lightning.ai/docs/pytorch/latest/common/trainer.html 71 | trainer: 72 | max_steps: 63052 73 | gradient_clip_val: 10 74 | accumulate_grad_batches: 1 75 | precision: 16 76 | logger: wandb # use `False` to disable logging 77 | log_every_n_steps: 100 78 | default_root_dir: exp/tmp 79 | accelerator: gpu 80 | 81 | strategy: ddp_find_unused_parameters_true # UNCOMMENT this line to enable DDP training 82 | 83 | num_sanity_val_steps: 0 84 | val_check_interval: 1000 85 | 86 | # pytorch_lightning.callbacks.ModelCheckpoint 87 | # ref: https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.ModelCheckpoint.html 88 | checkpoint: 89 | filename: "{epoch}-{step}" 90 | every_n_train_steps: 2000 91 | save_last: true 92 | 93 | # pytorch_lightning.loggers.WandbLogger 94 | # ref: https://lightning.ai/docs/pytorch/latest/extensions/generated/lightning.pytorch.loggers.WandbLogger.html 95 | logger: 96 | project: spin_is2023 -------------------------------------------------------------------------------- /models/spin/src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import ( 2 | AudioPretrainDataset, 3 | AudioPretrainPnmiValDataset, 4 | collate_fn, 5 | val_collate_fn, 6 | ) 7 | from .sampler import MaxLengthBatchSampler, MaxLengthDistributedSampler 8 | -------------------------------------------------------------------------------- /models/spin/src/data/librispeech.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import List, Tuple 4 | 5 | import torchaudio 6 | from tqdm import tqdm 7 | 8 | 9 | def find_all_librispeech(root: str, sort_by_len: bool = False) -> List[Tuple[str, int]]: 10 | files = list(Path(root).rglob("*.flac")) 11 | files = [str(f) for f in files] 12 | file_lens = [torchaudio.info(f).num_frames for f in tqdm(files)] 13 | assert len(files) == len(file_lens), (len(files), len(file_lens)) 14 | data = sorted( 15 | zip(files, file_lens), key=lambda x: x[1 if sort_by_len else 0], reverse=True 16 | ) 17 | return data 18 | 19 | 20 | def save_data_info(data: List[Tuple[str, int]], path: str) -> None: 21 | with open(path, "w") as fp: 22 | json.dump(data, fp, indent=2) 23 | -------------------------------------------------------------------------------- /models/spin/src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .spin import SpinModel 2 | -------------------------------------------------------------------------------- /models/spin/src/model/base.py: 
-------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import pytorch_lightning as pl 4 | import yaml 5 | 6 | 7 | class BaseModel(pl.LightningModule): 8 | def __init__(self, config) -> None: 9 | super().__init__() 10 | 11 | if isinstance(config, str) and config.split(".")[-1] in {"yaml", "yml"}: 12 | config = yaml.load(open(config, "r"), Loader=yaml.FullLoader) 13 | 14 | self.config = config 15 | self.save_hyperparameters(config) 16 | 17 | @abc.abstractmethod 18 | def forward(self, batch): 19 | raise NotImplementedError 20 | 21 | @abc.abstractmethod 22 | def training_step(self, batch, batch_idx): 23 | raise NotImplementedError 24 | 25 | @abc.abstractmethod 26 | def configure_optimizers(self): 27 | raise NotImplementedError 28 | -------------------------------------------------------------------------------- /models/spin/src/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .dnn import DNN 2 | from .hubert import HuBERT 3 | from .swav_vq_dis import SwavVQDisentangle 4 | from .wavlm import WavLM 5 | -------------------------------------------------------------------------------- /models/spin/src/nn/dnn.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class DNN(nn.Module): 8 | def __init__( 9 | self, 10 | in_dim: int, 11 | hid_dims: List[int], 12 | dropout: float = 0.0, 13 | activation: str = "ReLU", 14 | activate_last: bool = False, 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.in_dim = in_dim 19 | self.out_dim = hid_dims[-1] 20 | self.activate_last = activate_last 21 | 22 | assert len(hid_dims) > 0, len(hid_dims) 23 | hid_dims = [in_dim] + hid_dims 24 | 25 | self.layers = nn.ModuleList( 26 | [nn.Linear(hid_dims[i], hid_dims[i + 1]) for i in range(len(hid_dims) - 1)] 27 | ) 28 | self.num_layer = len(self.layers) 29 | self.dropout = nn.Dropout(dropout) 30 | n_acts = self.num_layer - (0 if self.activate_last else 1) 31 | self.acts = nn.ModuleList([getattr(nn, activation)() for _ in range(n_acts)]) 32 | 33 | def forward(self, x: torch.Tensor, x_len: torch.LongTensor = None) -> torch.Tensor: 34 | for i in range(self.num_layer): 35 | x = self.layers[i](x) 36 | if i < self.num_layer - 1 or self.activate_last: 37 | x = self.dropout(x) 38 | x = self.acts[i](x) 39 | return x 40 | -------------------------------------------------------------------------------- /models/spin/src/task/__init__.py: -------------------------------------------------------------------------------- 1 | from .train_spin import SpinPretrainTask 2 | -------------------------------------------------------------------------------- /models/spin/src/task/train_spin.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import yaml 5 | from pytorch_lightning import Trainer, seed_everything 6 | from pytorch_lightning.callbacks import ( 7 | LearningRateMonitor, 8 | ModelCheckpoint, 9 | TQDMProgressBar, 10 | ) 11 | from torch.utils.data import DataLoader 12 | 13 | from src.data import AudioPretrainPnmiValDataset, val_collate_fn 14 | from src.model import SpinModel 15 | from src.util import set_logging, set_pl_logger 16 | 17 | 18 | class SpinPretrainTask: 19 | def __init__(self): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("task", help="Task name") 22 | parser.add_argument("--config", "-c", help="Config .yaml 
file") 23 | parser.add_argument("--save-path", "-s", help="Path to save exp") 24 | parser.add_argument("--resume", "-r", default="", help="Resume training") 25 | parser.add_argument("--gpus", "-g", type=int, default=1, help="Number of GPUs") 26 | parser.add_argument( 27 | "--njobs", "-j", type=int, default=8, help="Number of workers" 28 | ) 29 | parser.add_argument("--seed", type=int, default=7122, help="Random seed") 30 | parser.add_argument("--log-level", default="info", help="Logging level") 31 | args = parser.parse_args() 32 | 33 | if not torch.cuda.is_available(): 34 | args.device = "cpu" 35 | args.gpus = 0 36 | else: 37 | args.device = "cuda" if args.gpus > 0 else "cpu" 38 | 39 | self.args = args 40 | set_logging(args.log_level) 41 | 42 | def run(self, model_cls=SpinModel): 43 | assert isinstance(self.args, argparse.Namespace) 44 | 45 | config = yaml.load(open(self.args.config, "r"), Loader=yaml.FullLoader) 46 | self.config = config 47 | 48 | use_ddp = ( 49 | config["trainer"].get("strategy", "").startswith("ddp") 50 | and self.args.gpus > 1 51 | ) 52 | 53 | if self.args.save_path != "": 54 | config["trainer"]["default_root_dir"] = self.args.save_path 55 | 56 | model_checkpoint = ModelCheckpoint( 57 | dirpath=config["trainer"]["default_root_dir"], **config["checkpoint"] 58 | ) 59 | 60 | config["trainer"]["logger"] = set_pl_logger( 61 | config["trainer"]["logger"], 62 | config["logger"]["project"], 63 | config["trainer"]["default_root_dir"].split("/")[-1], 64 | ) 65 | 66 | trainer = Trainer( 67 | callbacks=[ 68 | TQDMProgressBar(), 69 | model_checkpoint, 70 | LearningRateMonitor("step"), 71 | ], 72 | enable_progress_bar=True, 73 | devices=self.args.gpus, 74 | check_val_every_n_epoch=None, 75 | use_distributed_sampler=False, 76 | sync_batchnorm=use_ddp, 77 | **config["trainer"], 78 | ) 79 | 80 | seed_everything(self.args.seed) 81 | 82 | if config.get("val_data", None) is not None: 83 | val_dataset = AudioPretrainPnmiValDataset(**config["val_data"]) 84 | val_loader = DataLoader( 85 | val_dataset, 86 | batch_size=config["hparam"]["val_batch_size"], 87 | num_workers=self.args.njobs, 88 | pin_memory=True, 89 | collate_fn=val_collate_fn, 90 | shuffle=False, 91 | drop_last=False, 92 | ) 93 | else: 94 | val_dataset = None 95 | val_loader = None 96 | 97 | if self.args.resume != "": 98 | model = model_cls.load_from_checkpoint(self.args.resume) 99 | else: 100 | self.args.resume = None 101 | model = model_cls(config, 2) 102 | 103 | model.set_random_seed(self.args.seed) 104 | model.set_njobs(self.args.njobs) 105 | model.set_use_ddp(use_ddp) 106 | 107 | trainer.fit(model, val_dataloaders=val_loader, ckpt_path=self.args.resume) 108 | -------------------------------------------------------------------------------- /models/spin/src/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import set_logging, set_pl_logger 2 | from .model_utils import ( 3 | count_parameters, 4 | freeze_module, 5 | init_module, 6 | init_module_bert, 7 | init_module_cnn, 8 | init_module_pos_conv, 9 | unfreeze_module, 10 | ) 11 | from .padding import ( 12 | add_front_padding_mask, 13 | len_to_padding, 14 | padding_to_len, 15 | update_padding_mask, 16 | ) 17 | from .pnmi import compute_show_pnmi, compute_snmi 18 | from .scheduler import get_scheduler 19 | -------------------------------------------------------------------------------- /models/spin/src/util/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
from typing import Union 3 | 4 | from pytorch_lightning.loggers import WandbLogger 5 | 6 | 7 | def set_logging(log_level: str = "info") -> None: 8 | level = getattr(logging, str(log_level).upper()) 9 | logging.basicConfig( 10 | level=level, 11 | format="%(asctime)s %(filename)s.%(funcName)s %(message)s", 12 | datefmt="%m-%d %H:%M", 13 | ) 14 | 15 | 16 | def set_pl_logger( 17 | logger_type: Union[bool, str], 18 | project: str = "speech_disentangle", 19 | name: str = "example", 20 | ): 21 | if isinstance(logger_type, bool): 22 | return logger_type 23 | elif logger_type == "wandb": 24 | logger = WandbLogger(project=project, name=name) 25 | return logger 26 | else: 27 | raise NotImplementedError(f"Unknown logger type = {logger_type}") 28 | -------------------------------------------------------------------------------- /models/spin/src/util/model_utils.py: -------------------------------------------------------------------------------- 1 | from s3prl.upstream.wav2vec2.wav2vec2_model import MultiheadAttention 2 | from torch import nn 3 | 4 | 5 | def freeze_module(m: nn.Module) -> None: 6 | for p in m.parameters(): 7 | p.requires_grad = False 8 | 9 | 10 | def unfreeze_module(m: nn.Module) -> None: 11 | for p in m.parameters(): 12 | p.requires_grad = True 13 | 14 | 15 | def init_module(m: nn.Module): 16 | for p in m.parameters(): 17 | nn.init.normal_(p, mean=0, std=0.02) 18 | 19 | 20 | def init_module_bert(m: nn.Module): 21 | def normal_(data): 22 | # with FSDP, module params will be on CUDA, so we cast them back to CPU 23 | # so that the RNG is consistent with and without FSDP 24 | data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) 25 | 26 | if isinstance(m, nn.Linear): 27 | normal_(m.weight.data) 28 | if m.bias is not None: 29 | m.bias.data.zero_() 30 | if isinstance(m, nn.Embedding): 31 | normal_(m.weight.data) 32 | if m.padding_idx is not None: 33 | m.weight.data[m.padding_idx].zero_() 34 | if isinstance(m, MultiheadAttention): 35 | normal_(m.q_proj.weight.data) 36 | normal_(m.k_proj.weight.data) 37 | normal_(m.v_proj.weight.data) 38 | 39 | 40 | def init_module_cnn(m: nn.Module): 41 | if isinstance(m, nn.Conv1d): 42 | nn.init.kaiming_normal_(m.weight) 43 | if isinstance(m, nn.LayerNorm): 44 | m.reset_parameters() 45 | 46 | 47 | def init_module_pos_conv(m: nn.Module): 48 | if isinstance(m, nn.Conv1d): 49 | m.reset_parameters() 50 | if isinstance(m, nn.LayerNorm): 51 | m.reset_parameters() 52 | 53 | 54 | def count_parameters(model: nn.Module) -> int: 55 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 56 | -------------------------------------------------------------------------------- /models/spin/src/util/padding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.no_grad() 5 | def len_to_padding(x_len: torch.LongTensor, max_len: int = 0) -> torch.BoolTensor: 6 | if max_len == 0: 7 | max_len = max(x_len) 8 | idxs = torch.arange(max_len, dtype=torch.long).to(x_len.device) 9 | padding_mask = idxs.unsqueeze(0) >= x_len.unsqueeze(1) 10 | return padding_mask 11 | 12 | 13 | @torch.no_grad() 14 | def padding_to_len(padding_mask: torch.BoolTensor) -> torch.LongTensor: 15 | x_len = (~padding_mask).long().sum(-1) 16 | return x_len 17 | 18 | 19 | @torch.no_grad() 20 | def update_padding_mask( 21 | padding_mask: torch.BoolTensor, new_len: int 22 | ) -> torch.BoolTensor: 23 | extra = padding_mask.shape[1] % new_len 24 | if extra > 0: 25 | padding_mask = padding_mask[:, :-extra] 26 | padding_mask = 
padding_mask.view(padding_mask.shape[0], new_len, -1) 27 | padding_mask = padding_mask.all(-1) 28 | return padding_mask 29 | 30 | 31 | @torch.no_grad() 32 | def add_front_padding_mask( 33 | padding_mask: torch.BoolTensor, pad_front_lens: torch.LongTensor 34 | ) -> None: 35 | for i in range(len(padding_mask)): 36 | if pad_front_lens[i] > 0: 37 | padding_mask[i, : pad_front_lens[i]] = True 38 | -------------------------------------------------------------------------------- /models/spin/src/util/pnmi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | from tabulate import tabulate 10 | 11 | 12 | def comp_purity(p_xy, axis): 13 | max_p = p_xy.max(axis=axis) 14 | marg_p = p_xy.sum(axis=axis) 15 | indv_pur = max_p / marg_p 16 | aggr_pur = max_p.sum() 17 | return indv_pur, aggr_pur 18 | 19 | 20 | def comp_entropy(p): 21 | return (-p * np.log(p + 1e-8)).sum() 22 | 23 | 24 | def comp_norm_mutual_info(p_xy): 25 | p_x = p_xy.sum(axis=1, keepdims=True) 26 | p_y = p_xy.sum(axis=0, keepdims=True) 27 | pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8) 28 | mi = (p_xy * pmi).sum() 29 | h_x = comp_entropy(p_x) 30 | h_y = comp_entropy(p_y) 31 | return mi, mi / h_x, mi / h_y, h_x, h_y 32 | 33 | 34 | def pad(labs, n): 35 | if n == 0: 36 | return np.array(labs) 37 | return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n]) 38 | 39 | 40 | def comp_avg_seg_dur(labs_list): 41 | n_frms = 0 42 | n_segs = 0 43 | for labs in labs_list: 44 | labs = np.array(labs) 45 | edges = np.zeros(len(labs)).astype(bool) 46 | edges[0] = True 47 | edges[1:] = labs[1:] != labs[:-1] 48 | n_frms += len(edges) 49 | n_segs += edges.astype(int).sum() 50 | return n_frms / n_segs 51 | 52 | 53 | def comp_joint_prob(uid2refs, uid2hyps): 54 | cnts = Counter() 55 | skipped = [] 56 | abs_frmdiff = 0 57 | for uid in uid2refs: 58 | if uid not in uid2hyps: 59 | skipped.append(uid) 60 | continue 61 | refs = uid2refs[uid] 62 | hyps = uid2hyps[uid] 63 | abs_frmdiff += abs(len(refs) - len(hyps)) 64 | min_len = min(len(refs), len(hyps)) 65 | refs = refs[:min_len] 66 | hyps = hyps[:min_len] 67 | cnts.update(zip(refs, hyps)) 68 | tot = sum(cnts.values()) 69 | 70 | ref_set = sorted({ref for ref, _ in cnts.keys()}) 71 | hyp_set = sorted({hyp for _, hyp in cnts.keys()}) 72 | ref2pid = dict(zip(ref_set, range(len(ref_set)))) 73 | hyp2lid = dict(zip(hyp_set, range(len(hyp_set)))) 74 | 75 | p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float) 76 | for (ref, hyp), cnt in cnts.items(): 77 | p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt 78 | freq_xy = p_xy 79 | full_freq_xy = np.zeros((len(ref2pid), 4096), dtype=float) 80 | for (ref, hyp), cnt in cnts.items(): 81 | full_freq_xy[ref2pid[ref], int(hyp)] = cnt 82 | p_xy = p_xy / p_xy.sum() 83 | return ( 84 | freq_xy, 85 | full_freq_xy, 86 | p_xy, 87 | ref2pid, 88 | hyp2lid, 89 | tot, 90 | abs_frmdiff, 91 | skipped, 92 | ref_set, 93 | hyp_set, 94 | ) 95 | 96 | 97 | def comp_phone2code(p_xy): 98 | p_x = p_xy.sum(axis=1, keepdims=True) # ref (phone) 99 | p_y = p_xy.sum(axis=0, keepdims=True) # hyp (code) 100 | 101 | p_x_y = p_xy / p_y # P(x | y) = P(phone | code) 102 | 103 | y_order = np.argsort(p_x_y.argmax(0)) 104 | p_x_y_sorted_y = np.take_along_axis(p_x_y, y_order.reshape((1, -1)), axis=1) 105 | 106 | x_order = 
np.argsort(p_x[:, 0]) 107 | x_order = np.flip(x_order) 108 | p_x_y_sorted_x = np.take_along_axis(p_x_y, x_order.reshape((-1, 1)), axis=0) 109 | y_order = np.argsort(p_x_y_sorted_x.argmax(0)) 110 | p_x_y_sorted_xy = np.take_along_axis( 111 | p_x_y_sorted_x, y_order.reshape((1, -1)), axis=1 112 | ) 113 | 114 | return p_x_y, p_x_y_sorted_xy, p_x_y_sorted_y, x_order 115 | 116 | 117 | def compute_show_pnmi(uid2refs, uid2hyps, upsample=1, show_results: bool = False): 118 | for k, v in uid2hyps.items(): 119 | uid2hyps[k] = pad(v, 0).repeat(upsample) 120 | 121 | ( 122 | freq_xy, 123 | full_freq_xy, 124 | p_xy, 125 | ref2pid, 126 | hyp2lid, 127 | tot, 128 | frmdiff, 129 | skipped, 130 | ref_set, 131 | hyp_set, 132 | ) = comp_joint_prob(uid2refs, uid2hyps) 133 | ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0) 134 | hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1) 135 | (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy) 136 | 137 | if show_results: 138 | print( 139 | tabulate( 140 | [[hyp_pur, ref_pur, mi_norm_by_ref]], 141 | ["Cls Pur", "Phn Pur", "PNMI"], 142 | floatfmt=".3f", 143 | tablefmt="fancy_grid", 144 | ) 145 | ) 146 | 147 | return { 148 | "cls_pur": hyp_pur, 149 | "phn_pur": ref_pur, 150 | "pnmi": mi_norm_by_ref, 151 | } 152 | 153 | 154 | def compute_snmi(p_xy): 155 | _, ref_pur = comp_purity(p_xy, axis=0) 156 | _, hyp_pur = comp_purity(p_xy, axis=1) 157 | (_, mi_norm_by_ref, _, _, _) = comp_norm_mutual_info(p_xy) 158 | 159 | return { 160 | "cls_pur": hyp_pur, 161 | "spk_pur": ref_pur, 162 | "snmi": mi_norm_by_ref, 163 | } 164 | -------------------------------------------------------------------------------- /models/spin/src/util/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from torch.optim import Optimizer 4 | from torch.optim.lr_scheduler import LambdaLR, _LRScheduler 5 | 6 | 7 | def get_lr(optimizer: Optimizer) -> float: 8 | for param_group in optimizer.param_groups: 9 | return param_group["lr"] 10 | 11 | 12 | def noam_scheduler( 13 | optimizer: Optimizer, warmup: int = 4000, last_epoch: int = -1 14 | ) -> _LRScheduler: 15 | def func(step: int): 16 | if step < warmup: 17 | return (step + 1) / warmup 18 | else: 19 | return (warmup / (step + 1)) ** 0.5 20 | 21 | return LambdaLR(optimizer, func, last_epoch) 22 | 23 | 24 | def linear_warmup_decay_scheduler( 25 | optimizer: Optimizer, 26 | warmup: int = 4000, 27 | max_step: int = 1000000, 28 | init_lr: float = 1e-6, 29 | final_lr: float = 1e-6, 30 | ) -> _LRScheduler: 31 | func_list = [] 32 | 33 | for param_group in optimizer.param_groups: 34 | base_lr = param_group["lr"] 35 | rate_i = init_lr / base_lr 36 | rate_f = final_lr / base_lr 37 | 38 | def func(step: int) -> float: 39 | if step <= warmup: 40 | return rate_i + (1.0 - rate_i) * step / warmup 41 | else: 42 | return 1.0 - (1.0 - rate_f) * (step - warmup) / (max_step - warmup - 1) 43 | 44 | func_list.append(func) 45 | 46 | return LambdaLR(optimizer, func_list) 47 | 48 | 49 | def linear_warmup_cosine_scheduler( 50 | optimizer: Optimizer, 51 | warmup: int = 4000, 52 | max_step: int = 1000000, 53 | final_lr: float = 1e-6, 54 | ) -> _LRScheduler: 55 | func_list = [] 56 | 57 | for param_group in optimizer.param_groups: 58 | base_lr = param_group["lr"] 59 | rate = final_lr / base_lr 60 | 61 | def func(step: int) -> float: 62 | if step < warmup: 63 | return (step + 1) / warmup 64 | else: 65 | q = 0.5 * ( 66 | 1 + math.cos(math.pi * (step + 1 - warmup) / (max_step - warmup)) 67 | ) 68 
| return (1.0 - rate) * q + rate 69 | 70 | func_list.append(func) 71 | 72 | return LambdaLR(optimizer, func_list) 73 | 74 | 75 | def get_scheduler(name: str, optimizer: Optimizer, **kwargs) -> _LRScheduler: 76 | if name == "noam": 77 | return noam_scheduler(optimizer, **kwargs) 78 | elif name == "linear_warmup_decay": 79 | return linear_warmup_decay_scheduler(optimizer, **kwargs) 80 | elif name == "linear_warmup_cosine": 81 | return linear_warmup_cosine_scheduler(optimizer, **kwargs) 82 | else: 83 | raise NotImplementedError(f"Unknown lr scheduler {name}") 84 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bernardo Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/metadata/img/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/ssl_singer_identity/metadata/img/byol.png -------------------------------------------------------------------------------- /models/ssl_singer_identity/metadata/img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/models/ssl_singer_identity/metadata/img/pipeline.png -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import losses 2 | 3 | from .model import load_model 4 | # from . import model 5 | # from . import trainer 6 | # from . import utils 7 | # from .data import siamese_encoders 8 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/callbacks/ma_updates.py: -------------------------------------------------------------------------------- 1 | from math import cos, pi 2 | from typing import Optional, Sequence 3 | 4 | import torch 5 | from pytorch_lightning import Callback, LightningModule, Trainer 6 | 7 | 8 | class MAWeightUpdate(Callback): 9 | """Weight update rule from BYOL. 
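    After every training batch the target (teacher) parameters are pulled toward the
    online (student) parameters with an exponential moving average,
    ``teacher_p = tau * teacher_p + (1 - tau) * student_p`` (see ``update_weights`` below).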
10 | Your model should have: 11 | - ``self.online_network`` 12 | - ``self.target_network`` 13 | Updates the target_network params using an exponential moving average update rule weighted by tau. 14 | BYOL claims this keeps the online_network from collapsing. 15 | .. note:: Automatically increases tau from ``initial_tau`` to 1.0 with every training step 16 | Example:: 17 | # model must have 2 attributes 18 | model = Model() 19 | model.online_network = ... 20 | model.target_network = ... 21 | trainer = Trainer(callbacks=[MAWeightUpdate()]) 22 | """ 23 | 24 | def __init__(self, initial_tau: float = 0.996, max_epochs=100, should_update: bool = True): 25 | """ 26 | Args: 27 | initial_tau: starting tau. Auto-updates with every training step 28 | """ 29 | super().__init__() 30 | self.initial_tau = initial_tau 31 | self.max_epochs = max_epochs 32 | self.should_update = should_update 33 | 34 | self.current_tau = initial_tau 35 | 36 | def on_train_batch_end( 37 | self, 38 | trainer: Trainer, 39 | pl_module: LightningModule, 40 | outputs: Sequence, 41 | batch: Sequence, 42 | batch_idx: int, 43 | unused: Optional[int] = 0 44 | ) -> None: 45 | # get networks 46 | student_network = pl_module.student_network 47 | teacher_network = pl_module.teacher_network 48 | 49 | # update weights 50 | self.update_weights(student_network, teacher_network) 51 | 52 | # log tau 53 | pl_module.log("hparams/MA rate", self.current_tau, prog_bar=False, logger=True) 54 | 55 | # update tau after 56 | if self.should_update: 57 | self.current_tau = self.update_tau(pl_module, trainer) 58 | 59 | def update_tau(self, pl_module: LightningModule, trainer: Trainer) -> float: 60 | max_steps = len(trainer.train_dataloader) * self.max_epochs 61 | tau = 1 - (1 - self.initial_tau) * (cos(pi * pl_module.global_step / max_steps) + 1) / 2 62 | return tau 63 | 64 | def update_weights( 65 | self, 66 | student_network: torch.nn.Module, 67 | teacher_network: torch.nn.Module 68 | ) -> None: 69 | # apply MA weight update 70 | for (name, student_p), (_, teacher_p) in zip( 71 | student_network.named_parameters(), 72 | teacher_network.named_parameters(), 73 | ): 74 | teacher_p.data = self.current_tau * teacher_p.data + (1 - self.current_tau) * student_p.data 75 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from singer_identity.utils.core import similarity, roll 4 | 5 | 6 | def std_batch(x, var=1, eps=1e-8): 7 | std = torch.sqrt(x.var(dim=0) + eps) 8 | return torch.mean(F.relu(var - std)) 9 | 10 | 11 | def variance_hinge_reg(x, y, var=1): 12 | # From https://github.com/facebookresearch/vicreg 13 | std_x = std_batch(x, var=var) 14 | std_y = std_batch(y, var=var) 15 | std_loss = std_x / 2 + std_y / 2 16 | return std_loss 17 | 18 | 19 | def covariance(x): 20 | # In official implementation they do mean over batch (to verify) 21 | # mean = x.mean(1, keepdims=True) 22 | mean = x.mean(dim=0) 23 | x = x - mean 24 | cov = torch.matmul(x.transpose(0, 1), x) / (x.shape[0] - 1) 25 | # cov = (x.T @ x) / (x.shape[0] - 1) 26 | return cov 27 | 28 | 29 | def covariance_reg(x, y): 30 | eye = torch.eye(x.shape[1]).to(x.device) 31 | cov_x = covariance(x) 32 | cov_y = covariance(y) 33 | assert cov_x.shape[0] == cov_x.shape[1] 34 | assert cov_y.shape[0] == cov_y.shape[1] 35 | cov_reg = (cov_x * (1 - eye)).pow(2).sum() / x.shape[1] + (cov_y * (1 - 
eye)).pow( 36 | 2 37 | ).sum() / x.shape[1] 38 | return cov_reg 39 | 40 | 41 | def invariance_loss(x, y): 42 | return F.mse_loss(x, y) 43 | 44 | 45 | def vicreg_loss(x, y, gamma=1, fact_inv_loss=1, fact_var=1, fact_cov=1): 46 | # Adapted from https://github.com/facebookresearch/vicreg 47 | repr_loss = invariance_loss(x, y) 48 | std_loss = variance_hinge_reg(x, y, var=gamma) 49 | cov_loss = covariance_reg(x, y) 50 | loss = fact_inv_loss * repr_loss + fact_var * std_loss + fact_cov * cov_loss 51 | return loss 52 | 53 | 54 | def compute_norms(*args): 55 | norms = [] 56 | for arg in args: 57 | norms.append(torch.sqrt((arg**2).sum(1))) 58 | return norms 59 | 60 | 61 | def align_loss(x, y, alpha=2): 62 | # From https://github.com/SsnL/align_uniform 63 | return (x - y).norm(p=2, dim=1).pow(alpha).mean() 64 | 65 | 66 | def uniform_loss(x, t=2): 67 | # From https://github.com/SsnL/align_uniform 68 | return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log() 69 | 70 | 71 | def contrastive_loss(z1, z2, temp=0.2, nr_negative=1, decouple=False): 72 | cost_pos = similarity(z1, z2, temp) # Positive samples 73 | cost_neg = [] 74 | 75 | n_rolls = min(z1.shape[0] - 1, nr_negative) # Number of negative samples 76 | curr_neg_z = z2 77 | 78 | for i in range(n_rolls): 79 | curr_neg_z = roll(curr_neg_z) # Shifts batch 80 | cost_neg.append(similarity(z1, curr_neg_z, temp)) # Negative sim. 81 | 82 | if not decouple: 83 | cost_neg.append(cost_pos) # Adds positive similarity in denominator 84 | 85 | cost_neg = torch.stack(cost_neg).transpose(1, 0) 86 | cost = (-cost_pos + torch.logsumexp(cost_neg, 1)).mean() 87 | # TODO: implement similarities with less operations, but this works 88 | ratio = torch.mean(cost_neg) / ( 89 | torch.mean(cost_pos) + torch.tensor(1e-6).type_as(z1) 90 | ) 91 | return cost, ratio.item() 92 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/models/network_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Union, Callable, List, Optional 4 | from torchvision.models import efficientnet_b0, efficientnet_b4 5 | import torchvision.transforms as vt 6 | 7 | 8 | def get_vision_backbone( 9 | vismod="efficientnet_b0", num_classes=1000, pretrained=False, **kwargs 10 | ): 11 | if vismod == "efficientnet_b0": 12 | return efficientnet_b0(pretrained=pretrained, num_classes=num_classes, **kwargs) 13 | elif vismod == "efficientnet_b4": 14 | return efficientnet_b4(pretrained=pretrained, num_classes=num_classes, **kwargs) 15 | 16 | else: 17 | raise NotImplementedError 18 | 19 | 20 | class Grey2Rgb(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.normalize = vt.Normalize( 24 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 25 | ) 26 | 27 | def forward(self, data): 28 | batch_size, freq_bins, times = data.shape 29 | data /= data.max() 30 | data = data.unsqueeze(1).expand(batch_size, 3, freq_bins, times) 31 | data = self.normalize(data) 32 | return data 33 | 34 | 35 | class LogScale(nn.Module): 36 | def forward(self, data): 37 | # eps = 1e-8 38 | eps = torch.tensor(1e-8, device=data.device) 39 | return torch.log(data + eps) 40 | 41 | 42 | class Aggregator(nn.Module): 43 | """Aggregates (in time) a list of features""" 44 | 45 | def __init__(self): 46 | super().__init__() 47 | self.aggregation = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(1)) 48 | 49 | def forward(self, features): 50 | """ 51 | 
Returns: 52 | outputs_feature: torch.Tensor of shape(B x C x t) 53 | """ 54 | if isinstance(features, list): 55 | output_feature = [self.aggregation(feature) for feature in features] 56 | else: 57 | output_feature = self.aggregation(features) 58 | return output_feature 59 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/README.md: -------------------------------------------------------------------------------- 1 | # Configuration File for Training 2 | 3 | You can use a configuration file to train a model using the `train.py` script. Here we provide a description of how to setup the config file. The common options are described in the [common config](common.yaml) file. 4 | 5 | 6 | ```python 7 | python train.py --config path/to/common.yaml --config path/to/model_config.yaml 8 | ``` 9 | The model specific options are described below. In the example above, `model_config.yaml` will overwrite the options in `common.yaml` when options are repeated. For more details check the [Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html#lightning-cli) docs. 10 | 11 | ## 1. Model specific options 12 | In order to use contrastive, VICReg and Uniformity-Alignment, simply change the loss arguments in the config file. Below is the example for the contrastive loss: 13 | 14 | ```yaml 15 | use_contrastive_loss: true # use contrastive loss 16 | temp: 0.2 # temperature for contrastive loss 17 | nr_negative: 250 # number of negative samples for contrastive loss 18 | decouple: true # use decouple contrastive loss or regular NT-Xent loss 19 | use_covariance_reg: false # use covariance regularization 20 | use_variance_reg: false # use variance regularization 21 | use_vicreg_loss: false # use vicreg loss 22 | use_align_loss: false # use alignment loss 23 | use_uniform_loss: false # use uniformity loss 24 | ``` 25 | The individual weights for the losses can be specified as well. BYOL training has its dedicated trainer class and needs to be specified as shown in `byol.yaml`. 26 | 27 | We provide the following configs for the models used in the paper: 28 | 29 | - `byol.yaml` 30 | - `contrastive.yaml` 31 | - `contrastive_vc.yaml` 32 | - `uniformity-alignment.yaml` 33 | - `vicreg.yaml` 34 | 35 | 36 | ## 2. Data Options 37 | In the config file used to launch training (`common.yaml` is this example), specify the datasets to use as follows: 38 | 39 | ```yaml 40 | data: 41 | class_path: singer_id.data.siamese_encoders.SiameseEncodersDataModule # default the dataloader class 42 | init_args: 43 | dataset_dirs: 44 | - '/Path/to/dataset1/dataset1_name' 45 | - '/Path/to/dataset2/dataset2_name' 46 | batch_size: # batch size for training 47 | batch_size_val: # batch size for validation 48 | nr_samples: # number of samples to use for training (default: 176000, ie 4 seconds of audio in 44.1kHz) 49 | normalize: # normalize the audio when loading 50 | num_workers: # number of workers for the dataloader 51 | batch_sampling_mode: # "sample_clips" or "sample groups". Use "sample_clips" for self-supervised COLA loading 52 | eval_frac: # fraction of the dataset to use for validation 53 | group_name_is_folder: 54 | group_by_artist: 55 | multi_epoch: # number of epochs to repeat the dataset to simulate a larger dataset 56 | ``` 57 | 58 | ## 3. Augmentation Options 59 | 60 | The following augmentations are available. 
We use [Audiomentations](https://github.com/iver56/audiomentations) and [Parselmouth](https://github.com/YannickJadoul/Parselmouth) to perform the augmentations. All fields specify the probability of applying the augmentation, except for `pitch_shift_parselmouth`, `pitch_range_parselmouth`. 61 | 62 | ```yaml 63 | augmentations: 64 | "enable": true 65 | "gaussian_noise": 0.5 # min_amplitude=0.001, max_amplitude=0.05 66 | "pitch_shift_naive": 0 # naive pitch shift (using librosa), not used in the paper 67 | "time_stretch": 0 # time stretch, not used in the paper 68 | "gain": 0.5 # min_gain_in_db=-6, max_gain_in_db=0 69 | "shift": 0 # not used in the paper 70 | "parametric_eq": 0 # not used in the paper 71 | "tanh_distortion": 0 # not used in the paper 72 | "time_mask": 0.5 # max_band_part=1/8 73 | "formant_shift_parselmouth": 0 # not used in the paper 74 | "pitch_shift_parselmouth": [1, 1.3] # Pitch shift value on parselmouth 75 | "pitch_range_parselmouth": 1.5 # Pitch range value on parselmouth 76 | "pitch_shift_parselmouth_prob": 0.5 77 | ``` 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/byol.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer_byol.BYOL 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Predictor ------------------ 20 | predictor: 21 | dims: 22 | - 128 23 | - 1024 24 | - 128 25 | use_batchnorm: true 26 | normalize_projections: true 27 | weight_callback: 28 | class_path: singer_identity.callbacks.ma_updates.MAWeightUpdate 29 | init_args: 30 | initial_tau: 0.99 31 | max_epochs: 1000 32 | # ------------------ Optimizer ------------------ 33 | optimizer: 34 | class_path: singer_identity.models.byol.Adam 35 | init_args: 36 | lr: 3e-5 37 | weight_decay: 1.5e-6 38 | scheduler: 39 | class_path: singer_identity.models.byol.LinearWarmupCosineAnnealing 40 | init_args: 41 | warmup_epochs: 10 42 | max_epochs: 1000 43 | 44 | trainer: 45 | # ------------------ Logger ------------------ 46 | logger: 47 | class_path: pytorch_lightning.loggers.TensorBoardLogger 48 | init_args: 49 | save_dir: "logs" 50 | name: "byol" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/common.yaml: -------------------------------------------------------------------------------- 1 | seed_everything: 123 2 | 3 | # ------------------ Datasets ------------------ 4 | data: 5 | class_path: singer_identity.data.siamese_encoders.SiameseEncodersDataModule 6 | init_args: 7 | dataset_dirs: 8 | # - 'PLACE PATH TO DATASET HERE' 9 | # - 'PLACE PATH TO OTHER DATASET HERE IF NEEDED' 10 | 11 | # ------------------ Data loading hyperparameters ------------------ 12 | batch_size: 140 13 | batch_size_val: 140 14 | nr_samples: 176000 # 4s in 44.1kHz 15 | normalize: true 16 | num_workers: 4 17 | batch_sampling_mode: "sample_clips" 18 | eval_frac: 0.2 # Fraction of the dataset to use for validation 19 | verbose: true 20 | 
group_name_is_folder: true 21 | group_by_artist: true 22 | multi_epoch: 1 23 | # ------------------ Augmentations ------------------ 24 | augmentations: 25 | "enable": true 26 | "gaussian_noise": 0.5 27 | "pitch_shift_naive": 0 28 | "time_stretch": 0 29 | "gain": 0.5 30 | "shift": 0 31 | "parametric_eq": 0 32 | "tanh_distortion": 0 33 | "time_mask": 0.5 34 | "formant_shift_parselmouth": 0 35 | "pitch_shift_parselmouth": [1, 1.3] 36 | "pitch_range_parselmouth": 1.5 37 | "pitch_shift_parselmouth_prob": 0.5 38 | 39 | # ------------------ Model ------------------ 40 | model: 41 | class_path: singer_identity.trainer.SSLTrainer # Default trainer class, does not need to change 42 | init_args: 43 | # ------------------ Optimizer ------------------ 44 | optimizer1_init: 45 | class_path: torch.optim.Adam 46 | init_args: 47 | lr: 0.0001 48 | weight_decay: 1e-5 49 | 50 | # ------------------ Feature extractor ------------------ 51 | feature_extractor: 52 | spec_layer: 'melspectogram' 53 | n_fft: 2048 54 | hop_length: 512 55 | 56 | # ------------------ Encoder ------------------ 57 | backbone: 58 | backbone: "efficientnet_b0" 59 | pretrained: true 60 | embedding_dim: 1000 # This is the embedding dimension of the backbone 61 | 62 | # ------------------ Projection ------------------ 63 | projection: 64 | input_dim: 1000 65 | output_dim: 128 # Projection dimension 66 | l2_normalize: true # Whether to normalize the projection vectors 67 | 68 | 69 | 70 | 71 | # ------------------ Training ------------------ 72 | trainer: 73 | max_epochs: 100000 # Maximum number of epochs to train for 74 | max_steps: 1000000000 # Maximum number of steps to train for 75 | accelerator: "gpu" 76 | num_nodes: 1 77 | # ------------------ Logger ------------------ 78 | logger: 79 | class_path: pytorch_lightning.loggers.TensorBoardLogger # Replace with logger of choice 80 | init_args: 81 | save_dir: "logs" 82 | name: "log_name_here" 83 | 84 | # ------------------ Vizualization callbacks ------------------ 85 | callbacks: 86 | 87 | # ------------------ Evaluation callbacks ------------------ 88 | # Evaluation callbacks are used to evaluate the model on the validation set 89 | # and are logged during training. 90 | - class_path: singer_identity.callbacks.evaluation.OrderEvaluation # Rank evaluation 91 | init_args: 92 | log_n_epochs: 5 93 | on_train: true 94 | - class_path: singer_identity.callbacks.evaluation.EEREvaluation # EER 95 | init_args: 96 | log_n_epochs: 5 97 | on_train: false 98 | - class_path: singer_identity.callbacks.evaluation.HypersphereEvaluation # Alignment/uniformity 99 | init_args: 100 | log_n_epochs: 5 101 | on_train: true 102 | 103 | # ------------------ Checkpoint callbacks ------------------ 104 | # Checkpoint callbacks are used to save the model during training. 105 | # Uncomment the ones you want to use. 
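  # (Each `monitor` value below must match the name of a metric that is actually logged
  #  during validation, e.g. "loss/val" or the metrics produced by the evaluation callbacks
  #  above; otherwise the corresponding ModelCheckpoint has nothing to track.)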
106 | - class_path: pytorch_lightning.callbacks.ModelCheckpoint 107 | init_args: 108 | monitor: "loss/val" 109 | mode: "min" 110 | filename: "best-val-loss-{epoch}-{step}" 111 | save_top_k: 1 112 | 113 | - class_path: pytorch_lightning.callbacks.ModelCheckpoint 114 | init_args: 115 | every_n_epochs: 50 116 | save_top_k: -1 117 | filename: "ckpt-{epoch}-{step}" 118 | 119 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 120 | # init_args: 121 | # monitor: "EER evaluation proj/val" 122 | # mode: "min" 123 | # filename: "best-eer-val-{epoch}-{step}" 124 | # save_top_k: 1 125 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 126 | # init_args: 127 | # monitor: "Order evaluation mean proj/val" 128 | # mode: "min" 129 | # filename: "best-rank-val-{epoch}-{step}" 130 | # save_top_k: 1 131 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 132 | # init_args: 133 | # monitor: "Alignment evaluation proj/val" 134 | # mode: "min" 135 | # filename: "best-alignment-val-{epoch}-{step}" 136 | # save_top_k: 1 137 | # - class_path: pytorch_lightning.callbacks.ModelCheckpoint 138 | # init_args: 139 | # monitor: "Uniformity evaluation proj/val" 140 | # mode: "min" 141 | # filename: "best-uniformity-val-{epoch}-{step}" 142 | # save_top_k: 1 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive-vc.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: true 25 | fact_cov: 100 26 | use_variance_reg: true 27 | fact_var: 25 28 | use_invariance_loss: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | # ------------------ Optimizer ------------------ 33 | optimizer1_init: 34 | class_path: torch.optim.Adam 35 | init_args: 36 | lr: 0.0001 37 | weight_decay: 1e-5 38 | trainer: 39 | # ------------------ Logger ------------------ 40 | logger: 41 | class_path: pytorch_lightning.loggers.TensorBoardLogger 42 | init_args: 43 | save_dir: "logs" 44 | name: "contrastive-vc" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 
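    # (The flags below appear to mirror the arguments of `contrastive_loss` in
    #  singer_identity/losses.py: `temp`, `nr_negative` and `decouple`, where
    #  `decouple: true` drops the positive-pair term from the softmax denominator.)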
19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: true 21 | temp: 0.2 22 | nr_negative: 250 23 | decouple: true 24 | use_covariance_reg: false 25 | use_variance_reg: false 26 | use_vicreg_loss: false 27 | use_align_loss: false 28 | use_uniform_loss: false 29 | # ------------------ Optimizer ------------------ 30 | optimizer1_init: 31 | class_path: torch.optim.Adam 32 | init_args: 33 | lr: 0.0001 34 | weight_decay: 1e-5 35 | trainer: 36 | # ------------------ Logger ------------------ 37 | logger: 38 | class_path: pytorch_lightning.loggers.TensorBoardLogger 39 | init_args: 40 | save_dir: "logs" 41 | name: "contrastive" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/contrastive_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | 10 | # ------------------ Encoder ------------------ 11 | backbone: 12 | backbone: "efficientnet_b0" 13 | pretrained: true 14 | embedding_dim: 1000 15 | 16 | # ------------------ Projection ------------------ 17 | projection: 18 | input_dim: 1000 19 | output_dim: 128 20 | l2_normalize: true 21 | 22 | # ------------------ Training hyperparameters ------------------ 23 | use_contrastive_loss: true 24 | temp: 0.2 25 | nr_negative: 250 26 | decouple: true 27 | use_covariance_reg: false 28 | use_variance_reg: false 29 | use_vicreg_loss: false 30 | use_align_loss: false 31 | use_uniform_loss: false 32 | 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | 40 | trainer: 41 | # ------------------ Logger ------------------ 42 | logger: 43 | class_path: pytorch_lightning.loggers.TensorBoardLogger 44 | init_args: 45 | save_dir: "logs" 46 | name: "contrastive" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/uniformity-alignment.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | use_covariance_reg: false 22 | use_variance_reg: false 23 | use_invariance_loss: false 24 | use_align_loss: true 25 | fact_align_loss: 1 26 | use_uniform_loss: true 27 | fact_unif_loss: 1 28 | # ------------------ Optimizer ------------------ 29 | optimizer1_init: 30 | class_path: torch.optim.Adam 31 | init_args: 32 | lr: 0.0001 33 | weight_decay: 1e-5 34 | trainer: 35 | # ------------------ Logger ------------------ 36 | logger: 37 | class_path: pytorch_lightning.loggers.TensorBoardLogger 38 | init_args: 39 | save_dir: "logs" 40 | name: 
"uniformity-alignment" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/train_configs/vicreg.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: singer_identity.trainer.SSLTrainer 3 | init_args: 4 | # ------------------ Feature extractor ------------------ 5 | feature_extractor: 6 | spec_layer: 'melspectogram' 7 | n_fft: 2048 8 | hop_length: 512 9 | # ------------------ Encoder ------------------ 10 | backbone: 11 | backbone: "efficientnet_b0" 12 | pretrained: true 13 | embedding_dim: 1000 14 | # ------------------ Projection ------------------ 15 | projection: 16 | input_dim: 1000 17 | output_dim: 128 18 | l2_normalize: true 19 | # ------------------ Training hyperparameters ------------------ 20 | use_contrastive_loss: false 21 | # temp: 0.2 22 | # nr_negative: 250 23 | # decouple: true 24 | use_invariance_loss: true 25 | fact_inv_loss: 25 26 | use_covariance_reg: true 27 | fact_cov: 100 28 | use_variance_reg: true 29 | fact_var: 25 30 | gamma: 1 31 | use_align_loss: false 32 | use_uniform_loss: false 33 | # ------------------ Optimizer ------------------ 34 | optimizer1_init: 35 | class_path: torch.optim.Adam 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 1e-5 39 | trainer: 40 | # ------------------ Logger ------------------ 41 | logger: 42 | class_path: pytorch_lightning.loggers.TensorBoardLogger 43 | init_args: 44 | save_dir: "logs" 45 | name: "vicreg" -------------------------------------------------------------------------------- /models/ssl_singer_identity/singer_identity/trainer_byol.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | 3 | import torch 4 | import torch.nn as nn 5 | from singer_identity.models.byol import TeacherStudentModel, Optimizer, Scheduler 6 | from singer_identity.model import IdentityEncoder, Projection, SiameseArm, MLP 7 | 8 | 9 | class BYOL(TeacherStudentModel): 10 | def __init__( 11 | self, 12 | # module: nn.Module, 13 | backbone: dict, 14 | projection: dict, 15 | predictor: dict, 16 | weight_callback, 17 | optimizer: Optimizer, 18 | feature_extractor: dict = {}, 19 | loss_fn: nn.Module = torch.nn.MSELoss(), 20 | scheduler: Optional[Scheduler] = None, 21 | normalize_projections: bool = False, 22 | normalize_representations: bool = False, 23 | ): 24 | encoder = IdentityEncoder(feature_extractor=feature_extractor, encoder=backbone) 25 | projection = Projection(**projection) 26 | predictor = MLP(**predictor) 27 | module = SiameseArm( 28 | encoder=encoder, 29 | projector=projection, 30 | predictor=predictor, 31 | normalize_projections=normalize_projections, 32 | normalize_representations=normalize_representations, 33 | ) 34 | 35 | super(BYOL, self).__init__( 36 | module, loss_fn, weight_callback, optimizer, scheduler=scheduler 37 | ) 38 | self.save_hyperparameters(ignore=["module", "loss_fn"]) 39 | 40 | def shared_step(self, batch, step_name: str): 41 | x1 = batch["clip1"] 42 | x2 = batch["clip2"] 43 | 44 | batch_size = x1.shape[0] 45 | 46 | ys, zs, qs = self.student_network(x1) 47 | with torch.no_grad(): 48 | yt, zt, qt = self.teacher_network(x2) 49 | loss_12 = self.loss_fn(qs, zt) 50 | 51 | ys, zs, qs = self.student_network(x2) 52 | with torch.no_grad(): 53 | yt, zt, qt = self.teacher_network(x1) 54 | loss_21 = self.loss_fn(qs, zt) 55 | 56 | loss = (loss_12 + loss_21) / 2 57 | 58 | self.log( 59 | f"loss/{step_name}", 60 | loss, 61 | 
prog_bar=True, 62 | batch_size=batch_size, 63 | ) 64 | 65 | self.record_variables(y1=ys, z1=zs, y2=yt, z2=zt) 66 | 67 | return loss 68 | 69 | def training_step(self, batch, batch_idx): 70 | return self.shared_step(batch, "train") 71 | 72 | def validation_step(self, batch, batch_idx): 73 | return self.shared_step(batch, "val") 74 | -------------------------------------------------------------------------------- /models/ssl_singer_identity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from pytorch_lightning.cli import LightningCLI 4 | 5 | 6 | class CLI(LightningCLI): 7 | def add_arguments_to_parser(self, parser): 8 | parser.add_argument("--ckpt_path", default=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | cli = CLI( 13 | model_class=pl.LightningModule, 14 | datamodule_class=pl.LightningDataModule, 15 | subclass_mode_model=True, 16 | subclass_mode_data=True, 17 | save_config_kwargs={"overwrite": True}, 18 | run=False, 19 | ) 20 | 21 | ckpt_path = cli.config["ckpt_path"] 22 | 23 | if ckpt_path is not None: 24 | step = torch.load(ckpt_path, map_location="cpu")["global_step"] 25 | cli.trainer.fit_loop.epoch_loop._batches_that_stepped = step 26 | 27 | cli.trainer.fit(cli.model, cli.datamodule, ckpt_path=ckpt_path) 28 | -------------------------------------------------------------------------------- /models/wavlm/WavLM-Large.pt.txt: -------------------------------------------------------------------------------- 1 | https://github.com/microsoft/unilm/tree/master/wavlm -------------------------------------------------------------------------------- /models/wavlm/__init__.py: -------------------------------------------------------------------------------- 1 | from models.wavlm.WavLM import WavLM, WavLMConfig -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | glob2==0.7 2 | tqdm==4.62.3 3 | librosa==0.10.1 4 | scipy>=1.10.0 5 | tensorboard==2.7.0 6 | webrtcvad==2.0.10 7 | colorlog==6.7.0 8 | hydra-core==1.3.2 9 | hydra_colorlog==1.2.0 10 | omegaconf==2.3.0 11 | pyreaper 12 | protobuf==3.20.0 13 | matplotlib==3.7.1 14 | transformers==4.28.1 15 | # pyreaper @ git+https://github.com/r9y9/pyreaper.git@v0.0.9#egg=pyreaper 16 | huggingface_hub 17 | nnAudio 18 | numpy==1.24 19 | gdown 20 | torchaudio==2.1.2 21 | torch==2.1.2 22 | pytorch_lightning 23 | s3prl 24 | stopit 25 | praat-parselmouth 26 | webrtcvad 27 | asteroid_filterbanks 28 | -------------------------------------------------------------------------------- /resources/freesvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freds0/free-svc/def7f35a0d1935b00e6ebaf19a4a149617f3354b/resources/freesvc.png -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | from models.wavlm import WavLM, WavLMConfig 2 | from models.speaker_encoder.voice_encoder import SpeakerEncoder 3 | from models import SynthesizerTrn 4 | from mel_processing import mel_processing 5 | import utils 6 | import argparse 7 | import glob 8 | import logging 9 | import os 10 | import time 11 | 12 | import librosa 13 | import torch 14 | from scipy.io import wavfile 15 | from scipy.io.wavfile import write 16 | from tqdm import tqdm 17 | 18 | import numpy 
as np 19 | import pyreaper 20 | import torch 21 | 22 | import sys 23 | sys.path.append('..') 24 | 25 | logging.getLogger('numba').setLevel(logging.WARNING) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--hpfile", type=str, default="configs/freevc.yaml", help="path to yaml config file") 32 | parser.add_argument( 33 | "--ptfile", type=str, default="checkpoints/freevc.pth", help="path to pth file") 34 | parser.add_argument("--txt-path", type=str, 35 | default="convert.txt", help="path to txt file") 36 | parser.add_argument("--out-dir", type=str, 37 | default="output/freevc", help="path to output dir") 38 | parser.add_argument("--use-timestamp", default=False, action="store_true") parser.add_argument("--pitch-factor", type=float, default=1.0, help="scaling factor applied to the extracted pitch (1.0 leaves it unchanged)") 39 | args = parser.parse_args() 40 | 41 | os.makedirs(args.out_dir, exist_ok=True) 42 | hps = utils.get_hparams_from_file(args.hpfile) 43 | 44 | print("Loading model...") 45 | net_g = SynthesizerTrn( 46 | hps.data.filter_length // 2 + 1, 47 | hps.train.segment_size // hps.data.hop_length, 48 | **hps.model).cuda() 49 | _ = net_g.eval() 50 | print("Loading checkpoint...") 51 | _ = utils.load_checkpoint(args.ptfile, net_g, None, True) 52 | 53 | print("Loading WavLM for content...") 54 | cmodel = utils.get_cmodel(0) 55 | 56 | if hps.model.use_spk: 57 | print("Loading speaker encoder...") 58 | smodel = SpeakerEncoder( 59 | 'speaker_encoder/ckpt/pretrained_bak_5805000.pt') 60 | 61 | print("Processing text...") 62 | titles, srcs, tgts = [], [], [] 63 | with open(args.txt_path, "r") as f: 64 | for rawline in f.readlines(): 65 | title, src, tgt = rawline.strip().split("|") 66 | titles.append(title) 67 | srcs.append(src) 68 | tgts.append(tgt) 69 | 70 | print("Synthesizing...") 71 | with torch.no_grad(): 72 | for line in tqdm(zip(titles, srcs, tgts)): 73 | title, src, tgt = line 74 | # tgt 75 | wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) 76 | wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) 77 | if hps.model.use_spk: 78 | g_tgt = smodel.embed_utterance(wav_tgt) 79 | g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).cuda() 80 | else: 81 | wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).cuda() 82 | mel_tgt = mel_processing.mel_spectrogram_torch( 83 | wav_tgt, 84 | hps.data.filter_length, 85 | hps.data.n_mel_channels, 86 | hps.data.sampling_rate, 87 | hps.data.hop_length, 88 | hps.data.win_length, 89 | hps.data.mel_fmin, 90 | hps.data.mel_fmax 91 | ) 92 | # src 93 | wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) 94 | wav_src = torch.from_numpy(wav_src).unsqueeze(0).cuda() 95 | # get pitch 96 | sampling_rate, audio = wavfile.read(src) 97 | _, _, _, pitch, _ = pyreaper.reaper(audio, sampling_rate) 98 | pitch = np.clip(pitch, 0, 800) * args.pitch_factor 99 | # interpolate to ensure that pitch and z have the same length 100 | z_len = round(audio.shape[-1] / hps.data.hop_length) 101 | pitch = torch.nn.functional.interpolate(torch.tensor(pitch).unsqueeze(0).unsqueeze( 102 | 0), size=z_len, mode="nearest").squeeze().unsqueeze(0).unsqueeze(0).cuda() 103 | 104 | c = utils.get_content(cmodel, wav_src) 105 | # TODO: explore other interpolation modes 106 | c = torch.nn.functional.interpolate(c, size=z_len, mode="nearest").cuda() 107 | 108 | if hps.model.use_spk: 109 | audio = net_g.infer(c, g=g_tgt) 110 | else: 111 | audio = net_g.infer(c, mel=mel_tgt) 112 | audio = audio[0][0].data.cpu().float().numpy() 113 | if args.use_timestamp: 114 | timestamp = time.strftime("%m-%d_%H-%M", time.localtime()) 115 | write(os.path.join(args.out_dir, "{}.wav".format( 116 | timestamp+"_"+title)), hps.data.sampling_rate, audio) 117 | else: 118 | write(os.path.join(args.out_dir, 119 | f"{title}.wav"), hps.data.sampling_rate, audio) 120 |
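A note on the conversion script above: the WavLM content features, the REAPER pitch contour, and the synthesizer's latent frames all live at different rates, so convert.py resamples both features to z_len = round(num_samples / hop_length) with nearest-neighbour interpolation so that they line up on the same frame grid. A self-contained sketch of that alignment step with dummy tensors and an assumed hop length (in the script itself, c comes from utils.get_content(cmodel, wav_src) and pitch from pyreaper.reaper):

import torch
import torch.nn.functional as F

hop_length = 320                          # assumed value of hps.data.hop_length
num_samples = 53_760                      # example source length in samples
z_len = round(num_samples / hop_length)   # number of latent frames expected by the synthesizer

c = torch.randn(1, 1024, 166)             # stand-in for WavLM content features (B, C, T_c)
pitch = torch.rand(170) * 800             # stand-in for a REAPER pitch contour (T_p,)

# Nearest-neighbour interpolation brings both to exactly z_len frames,
# mirroring the calls in scripts/convert.py.
c_aligned = F.interpolate(c, size=z_len, mode="nearest")
pitch_aligned = F.interpolate(pitch.view(1, 1, -1), size=z_len, mode="nearest")

print(c_aligned.shape, pitch_aligned.shape)   # both now have z_len frames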
-------------------------------------------------------------------------------- /scripts/convert.txt: -------------------------------------------------------------------------------- 1 | title1|DUMMY/p225/p225_001.wav|DUMMY/p226/p226_002.wav 2 | title2|DUMMY/p226/p226_002.wav|DUMMY/p225/p225_001.wav 3 | -------------------------------------------------------------------------------- /scripts/downsample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import librosa 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | from scipy.io import wavfile 7 | from tqdm import tqdm 8 | from glob import glob 9 | 10 | 11 | def process(audio_path): 12 | if os.path.exists(audio_path): 13 | audio, _ = librosa.load(audio_path, sr=args.sample_rate) 14 | audio, _ = librosa.effects.trim(audio, top_db=20) 15 | peak = np.abs(audio).max() 16 | if peak > 1.0: 17 | audio = 0.98 * audio / peak 18 | save_path = audio_path.replace(args.in_dir, args.out_dir) 19 | save_path = save_path.replace('.'+args.in_audio_format, '.'+args.out_audio_format) 20 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 21 | wavfile.write( 22 | save_path, 23 | args.sample_rate, 24 | (audio * np.iinfo(np.int16).max).astype(np.int16) 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("-sr", "--sample-rate", type=int, default=16000, help="sampling rate") 31 | parser.add_argument("-if", "--in-audio-format", type=str, default="wav", help="audio format of input audios") 32 | parser.add_argument("-i", "--in-dir", type=str, default="./data", help="path to source dir") 33 | parser.add_argument("-o", "--out-dir", type=str, default="./", help="path to target dir") 34 | parser.add_argument("-of", "--out-audio-format", type=str, default="wav", help="audio format of output audios") 35 | parser.add_argument("-w", "--num-workers", type=int, default=1, help="number of workers") 36 | args = parser.parse_args() 37 | 38 | filepaths = glob(f'{args.in_dir}/**/*.{args.in_audio_format}', recursive=True) 39 | if args.num_workers == 1: 40 | for filename in tqdm(filepaths): 41 | process(filename) 42 | else: 43 | pool = Pool(processes=args.num_workers) 44 | for _ in tqdm(pool.imap_unordered(process, filepaths)): 45 | pass 46 | -------------------------------------------------------------------------------- /scripts/prepare_nus_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_nus" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the dataset_nus is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the dataset_nus 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd ..
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to extract features 53 | function extract_features() { 54 | python3 scripts/preprocess_spk.py \ 55 | --in-dir $DATASET_DIR_NAME/16k \ 56 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 57 | --num-workers 8 58 | 59 | python3 scripts/preprocess_content.py \ 60 | --in-dir $DATASET_DIR_NAME/16k \ 61 | --out-dir $DATASET_DIR_NAME/ssl_features 62 | 63 | python3 scripts/preprocess_sr.py \ 64 | --in-dir $DATASET_DIR_NAME/16k \ 65 | --wav-dir $DATASET_DIR_NAME/sr \ 66 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 67 | --num-workers 1 68 | 69 | python3 scripts/preprocess_pitch.py \ 70 | --in-dir $DATASET_DIR_NAME/16k \ 71 | --out-dir $DATASET_DIR_NAME/pitch_features \ 72 | --num-workers 1 73 | 74 | } 75 | 76 | echo "STEP 1" 77 | downloadNus 78 | echo "STEP 2" 79 | downsample 80 | echo "STEP 3" 81 | create_splits 82 | echo "STEP 4" 83 | extract_features 84 | echo "DONE" 85 | rm -rf $DATASET_DIR_NAME/raw 86 | echo "" > $DATASET_DIR_NAME/DONE 87 | 88 | set +x 89 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" 90 | echo "NOTE: the audios were not cut in small chunks. You might want to do that before training (see segment_vad.py)." 91 | -------------------------------------------------------------------------------- /scripts/prepare_nus_dataset_vad.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_nus" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd .. 
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k-segmented \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to segment audios 53 | function segment() { 54 | python3 scripts/segment_vad.py \ 55 | --dir $DATASET_DIR_NAME/16k \ 56 | --out-dir ./$DATASET_DIR_NAME/16k-segmented 57 | } 58 | 59 | # Function to extract features 60 | function extract_features() { 61 | python3 scripts/preprocess_spk.py \ 62 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 63 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 64 | --num-workers 8 65 | 66 | python3 scripts/preprocess_content.py \ 67 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 68 | --out-dir $DATASET_DIR_NAME/ssl_features 69 | 70 | python3 scripts/preprocess_sr.py \ 71 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 72 | --wav-dir $DATASET_DIR_NAME/sr \ 73 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 74 | --num-workers 1 75 | 76 | python3 scripts/preprocess_pitch.py \ 77 | --in-dir $DATASET_DIR_NAME/16k-segmented \ 78 | --out-dir $DATASET_DIR_NAME/pitch_features \ 79 | --num-workers 1 80 | 81 | } 82 | 83 | echo "STEP 1" 84 | downloadNus 85 | echo "STEP 2" 86 | downsample 87 | echo "STEP 3" 88 | segment 89 | echo "STEP 4" 90 | create_splits 91 | echo "STEP 5" 92 | extract_features 93 | echo "DONE" 94 | rm -rf $DATASET_DIR_NAME/raw 95 | echo "" > $DATASET_DIR_NAME/DONE 96 | 97 | set +x 98 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'Data'" -------------------------------------------------------------------------------- /scripts/prepare_pop_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the PopBuTFy dataset and prepares it for training. 2 | 3 | echo "This dataset has issues with some audio files." 4 | 5 | DATASET_DIR_NAME="dataset_pop" 6 | mkdir -p $DATASET_DIR_NAME 7 | 8 | # Check if the dataset_pop is already processed 9 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 10 | echo "$DATASET_DIR_NAME already processed" 11 | exit 0 12 | fi 13 | 14 | set -e 15 | set -x 16 | 17 | # Function to download the dataset_pop 18 | function downloadPopBuTFy() { 19 | # Check if download is needed 20 | if [ -f "PopBuTFy.zip" ]; then 21 | echo "Dataset already downloaded" 22 | else 23 | echo "Downloading dataset" 24 | gdown 1WQOTrQDVgBeULUWMtBCAhWmiy2fe3hhh 25 | fi 26 | mv PopBuTFy.zip $DATASET_DIR_NAME/ 27 | cd $DATASET_DIR_NAME/ 28 | unzip PopBuTFy.zip && rm PopBuTFy.zip 29 | cd .. 30 | } 31 | 32 | # Function to create spk dirs 33 | function create_spk_dirs() { 34 | cd $DATASET_DIR_NAME/data/ 35 | set +e 36 | for i in {10..18}; do 37 | mkdir Female${i} 38 | mv "Female${i}#"* Female${i}/ 39 | done 40 | 41 | for i in {1..9}; do 42 | mkdir Female${i} 43 | mv "Female${i}#"* Female${i}/ 44 | done 45 | 46 | for i in {1..6}; do 47 | mkdir Male${i} 48 | mv "Male${i}#"* Male${i}/ 49 | done 50 | set -e 51 | cd ../.. 
52 | } 53 | 54 | # Function to downsample audios 55 | function downsample() { 56 | python3 scripts/downsample.py \ 57 | --in-audio-format mp3 \ 58 | --in-dir $DATASET_DIR_NAME/data \ 59 | --out-dir $DATASET_DIR_NAME/16k \ 60 | --sample-rate 16000 \ 61 | --num-workers 8 62 | } 63 | 64 | # Function to create train and test splits 65 | function create_splits() { 66 | python3 scripts/preprocess_flist.py \ 67 | --source-dir $DATASET_DIR_NAME/16k \ 68 | --train-list $DATASET_DIR_NAME/train.csv \ 69 | --val-list $DATASET_DIR_NAME/val.csv \ 70 | --test-list $DATASET_DIR_NAME/test.csv \ 71 | --seed 1 72 | } 73 | 74 | # Function to extract features 75 | function extract_features() { 76 | python3 scripts/preprocess_spk.py \ 77 | --in-dir $DATASET_DIR_NAME/16k \ 78 | --out-dir $DATASET_DIR_NAME/spk_embeddings \ 79 | --num-workers 8 80 | 81 | python3 scripts/preprocess_content.py \ 82 | --in-dir $DATASET_DIR_NAME/16k \ 83 | --out-dir $DATASET_DIR_NAME/ssl_features 84 | 85 | python3 scripts/preprocess_sr.py \ 86 | --in-dir $DATASET_DIR_NAME/16k \ 87 | --wav-dir $DATASET_DIR_NAME/sr \ 88 | --ssl-dir $DATASET_DIR_NAME/ssl_features \ 89 | --num-workers 4 90 | 91 | python3 scripts/preprocess_pitch.py \ 92 | --in-dir $DATASET_DIR_NAME/16k \ 93 | --out-dir $DATASET_DIR_NAME/pitch_features \ 94 | --num-workers 1 95 | 96 | } 97 | 98 | echo "STEP 1" 99 | downloadPopBuTFy 100 | echo "STEP 2" 101 | create_spk_dirs 102 | echo "STEP 3" 103 | downsample 104 | echo "STEP 4" 105 | create_splits 106 | echo "STEP 5" 107 | extract_features 108 | echo "DONE" 109 | rm -rf dataset_pop/data 110 | echo "" > $DATASET_DIR_NAME/DONE 111 | 112 | set +x 113 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" -------------------------------------------------------------------------------- /scripts/prepare_vctk_dataset.sh: -------------------------------------------------------------------------------- 1 | # Description: This script downloads the Nus dataset and prepares it for training. 2 | 3 | DATASET_DIR_NAME="dataset_vctk" 4 | mkdir -p $DATASET_DIR_NAME 5 | 6 | # Check if the dataset_nus is already processed 7 | if [ -f "$DATASET_DIR_NAME/DONE" ]; then 8 | echo "$DATASET_DIR_NAME already processed" 9 | exit 0 10 | fi 11 | 12 | set -e 13 | set -x 14 | 15 | # Function to download the dataset_nus 16 | function downloadNus() { 17 | # Check if download is needed 18 | if [ -f "Nus.zip" ]; then 19 | echo "Dataset already downloaded" 20 | else 21 | echo "Downloading dataset" 22 | gdown 1lGHfVN4jWh-oWKpxnQIwZux1qpk5L9C5 23 | fi 24 | mv Nus.zip $DATASET_DIR_NAME 25 | cd $DATASET_DIR_NAME/ 26 | set +e 27 | unzip Nus.zip && rm Nus.zip && mkdir raw && mv * raw/ 28 | set -e 29 | cd .. 
30 | } 31 | 32 | # Function to downsample audios 33 | function downsample() { 34 | python3 scripts/downsample.py \ 35 | --in-audio-format wav \ 36 | --in-dir $DATASET_DIR_NAME/raw \ 37 | --out-dir $DATASET_DIR_NAME/16k \ 38 | --sample-rate 16000 \ 39 | --num-workers 8 40 | } 41 | 42 | # Function to create train and test splits 43 | function create_splits() { 44 | python3 scripts/preprocess_flist.py \ 45 | --source-dir $DATASET_DIR_NAME/16k \ 46 | --train-list $DATASET_DIR_NAME/train.csv \ 47 | --val-list $DATASET_DIR_NAME/val.csv \ 48 | --test-list $DATASET_DIR_NAME/test.csv \ 49 | --seed 1 50 | } 51 | 52 | # Function to extract features 53 | function extract_features() { 54 | #python3 scripts/preprocess_spk.py \ 55 | # --in-dir $DATASET_DIR_NAME/16k \ 56 | # --out-dir $DATASET_DIR_NAME/spk_embeddings \ 57 | # --num-workers 8 58 | 59 | #python3 scripts/preprocess_content.py \ 60 | # --in-dir $DATASET_DIR_NAME/16k \ 61 | # --out-dir $DATASET_DIR_NAME/ssl_features 62 | 63 | # python3 scripts/preprocess_sr.py \ 64 | # --in-dir $DATASET_DIR_NAME/16k \ 65 | # --wav-dir $DATASET_DIR_NAME/sr \ 66 | # --ssl-dir $DATASET_DIR_NAME/ssl_features \ 67 | # --num-workers 1 68 | 69 | python3 scripts/preprocess_pitch.py \ 70 | --in-dir $DATASET_DIR_NAME/16k \ 71 | --out-dir $DATASET_DIR_NAME/pitch_features \ 72 | --num-workers 1 73 | 74 | } 75 | 76 | echo "STEP 1" 77 | #downloadNus 78 | echo "STEP 2" 79 | #downsample 80 | echo "STEP 3" 81 | #create_splits 82 | echo "STEP 4" 83 | extract_features 84 | echo "DONE" 85 | rm -rf $DATASET_DIR_NAME/raw 86 | echo "" > $DATASET_DIR_NAME/DONE 87 | 88 | set +x 89 | echo "To easily train the model, rename the $DATASET_DIR_NAME folder to 'dataset'" 90 | echo "NOTE: the audios were not cut in small chunks. You might want to do that before training (see segment_vad.py)." 
91 | -------------------------------------------------------------------------------- /scripts/preprocess_content.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from glob import glob 4 | 5 | import librosa 6 | import torch 7 | from tqdm import tqdm 8 | 9 | import sys 10 | sys.path.append(os.path.dirname('..')) 11 | 12 | import utils 13 | from models.wavlm import WavLM, WavLMConfig 14 | 15 | 16 | def extract_and_save_content_features(audio_path, out_dir, sampling_rate=16000): 17 | os.makedirs(os.path.dirname(audio_path), exist_ok=True) 18 | utt_id = os.path.basename(audio_path).rstrip(".wav") 19 | save_filepath = os.path.join(out_dir, f"{utt_id}.pt") 20 | if os.path.isfile(save_filepath): 21 | print("Igored because it is already computed: ", save_filepath) 22 | else: 23 | wav, _ = librosa.load(audio_path, sr=sampling_rate) 24 | wav = torch.from_numpy(wav).unsqueeze(0).cuda() 25 | c = utils.get_content(cmodel, wav) 26 | torch.save(c.cpu(), save_filepath) 27 | 28 | if __name__ == "__main__": 29 | torch.multiprocessing.set_start_method('spawn') 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--sr", type=int, default=16000, help="sampling rate") 32 | parser.add_argument("--in-dir", type=str, default="data", help="path to input dir") 33 | parser.add_argument("--out-dir", type=str, default="data/content_features", help="path to output dir") 34 | parser.add_argument("--checkpoint", type=str, default="./models/wavlm/WavLM-Large.pt", help="path to checkpoint") 35 | args = parser.parse_args() 36 | 37 | os.makedirs(args.out_dir, exist_ok=True) 38 | 39 | print("Loading WavLM for content...") 40 | checkpoint = torch.load(args.checkpoint) 41 | cfg = WavLMConfig(checkpoint['cfg']) 42 | cmodel = WavLM(cfg).cuda() 43 | cmodel.load_state_dict(checkpoint['model']) 44 | cmodel.eval() 45 | print("Loaded WavLM.") 46 | 47 | sub_folder_list = os.listdir(args.in_dir) 48 | sub_folder_list.sort() 49 | for spk in sub_folder_list: 50 | print("Preprocessing speaker {} ...".format(spk)) 51 | in_dir = os.path.join(args.in_dir, spk) 52 | if not os.path.isdir(in_dir): 53 | continue 54 | 55 | filepaths = glob(f'{in_dir}/**/*.wav', recursive=True) 56 | 57 | for filepath in tqdm(filepaths): 58 | spk_out_dir = os.path.join(args.out_dir, spk) 59 | os.makedirs(spk_out_dir, exist_ok=True) 60 | extract_and_save_content_features(filepath, spk_out_dir, sampling_rate=args.sr) 61 | -------------------------------------------------------------------------------- /scripts/preprocess_flist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | from tqdm import tqdm 5 | from random import shuffle 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--source-dir", type=str, default="./dataset/", help="path to source dir") 11 | parser.add_argument("--seed", type=int, default=None, help="random seed") 12 | parser.add_argument("--all-list", type=str, default="./dataset/all.csv", help="path to all list") 13 | parser.add_argument("--train-list", default="", help="path to train list") 14 | parser.add_argument("--val-list", default="", help="path to val list") 15 | parser.add_argument("--test-list", default="", help="path to test list") 16 | args = parser.parse_args() 17 | 18 | if args.seed is not None: 19 | random.seed(args.seed) 20 | 21 | train = [] 22 | val = [] 23 | test = [] 24 | idx = 0 25 | 26 | data = [] 27 | for 
language in os.listdir(args.source_dir): 28 | for speaker in tqdm(os.listdir(os.path.join(args.source_dir, language))): 29 | for root, dirs, files in os.walk(os.path.join(args.source_dir, language, speaker)): 30 | for file in files: 31 | if file.endswith(".wav"): 32 | data.append((os.path.join(root, file), language, speaker)) 33 | 34 | shuffle(data) 35 | 36 | print("Writing", args.all_list) 37 | with open(args.all_list, "w") as f: 38 | for wavpath, language, speaker in tqdm(data): 39 | print(wavpath, language, speaker, sep="|", file=f) 40 | 41 | val += data[:int(len(data) * 0.01)] 42 | test += data[int(len(data) * 0.01):int(len(data) * 0.02)] 43 | train += data[int(len(data) * 0.02):] 44 | 45 | shuffle(train) 46 | shuffle(val) 47 | shuffle(test) 48 | 49 | if args.train_list != "": 50 | print("Writing", args.train_list) 51 | with open(args.train_list, "w") as f: 52 | for wavpath, language, speaker in tqdm(train): 53 | print(wavpath, language, speaker, sep="|", file=f) 54 | 55 | if args.val_list != "": 56 | print("Writing", args.val_list) 57 | with open(args.val_list, "w") as f: 58 | for wavpath, language, speaker in tqdm(val): 59 | print(wavpath, language, speaker, sep="|", file=f) 60 | 61 | if args.test_list != "": 62 | print("Writing", args.test_list) 63 | with open(args.test_list, "w") as f: 64 | for wavpath, language, speaker in tqdm(test): 65 | print(wavpath, language, speaker, sep="|", file=f) 66 | -------------------------------------------------------------------------------- /scripts/preprocess_pitch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import torch 5 | import random 6 | from glob import glob 7 | from tqdm import tqdm 8 | from scipy.io import wavfile 9 | import concurrent.futures 10 | 11 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 12 | from models.f0_predictor import get_f0_predictor 13 | 14 | def extract_pitch(pitch_predictor, input_path, output_path, skip_existing=False): 15 | if skip_existing and os.path.exists(output_path): 16 | return 17 | pitch = pitch_predictor.compute_f0(wavfile.read(input_path)[1]) 18 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 19 | if type(pitch) is tuple: 20 | print(f"Pitch feature computation might have failed for {input_path}") 21 | pitch = pitch[0] 22 | torch.save(torch.tensor(pitch), output_path) 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--in-dir", type=str, default="data/train", help="path to input dir") 27 | parser.add_argument("--pitch-predictor", type=str, default="rmvpe") 28 | parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu") 29 | parser.add_argument("--sampling-rate", type=int, default=24000) 30 | parser.add_argument("--hop-length", type=int, default=320) 31 | parser.add_argument('--num-workers', type=int, default=1) 32 | parser.add_argument("--skip-existing", action="store_true", help="skip existing pitch files") 33 | parser.add_argument("--out-dir", type=str, default="data/pitch_features/train", help="path to output dir") 34 | args = parser.parse_args() 35 | 36 | if args.device == "cuda" and args.num_workers > 1: 37 | print("Warning: Multiprocessing with CUDA is not supported. 
Setting num_workers to 1.") 38 | args.num_workers = 1 39 | 40 | pitch_predictor = get_f0_predictor( 41 | args.pitch_predictor, 42 | sampling_rate=args.sampling_rate, 43 | hop_length=args.hop_length, 44 | device=args.device, 45 | threshold=0.05 46 | ) 47 | 48 | file_paths = glob(f'{args.in_dir}/**/*.wav', recursive=True) 49 | random.shuffle(file_paths) 50 | 51 | if args.num_workers > 1: 52 | with concurrent.futures.ProcessPoolExecutor(args.num_workers) as \ 53 | executor: 54 | futures = [executor.submit(extract_pitch, pitch_predictor, file_path, file_path.replace(args.in_dir, args.out_dir).replace(".wav", "_pitch.pt"), skip_existing=args.skip_existing) for file_path in file_paths] 55 | for f in tqdm(concurrent.futures.as_completed(futures)): 56 | if f.exception() is not None: 57 | print(f.exception()) 58 | else: 59 | for file_path in tqdm(file_paths): 60 | output_path = file_path.replace(args.in_dir, args.out_dir).replace(".wav", "_pitch.pt") 61 | extract_pitch(pitch_predictor, file_path, output_path, args.skip_existing) -------------------------------------------------------------------------------- /scripts/preprocess_spk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from concurrent.futures import ProcessPoolExecutor 5 | from functools import partial 6 | from multiprocessing import cpu_count 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from tqdm import tqdm 11 | 12 | import sys 13 | sys.path.append(os.path.dirname('..')) 14 | 15 | from models.speaker_encoder.voice_encoder import SpeakerEncoder 16 | from models.speaker_encoder.audio import preprocess_wav 17 | 18 | 19 | def build_from_path(in_dir, out_dir, weights_fpath, num_workers=1): 20 | executor = ProcessPoolExecutor(max_workers=num_workers) 21 | futures = [] 22 | wavfile_paths = glob.glob(in_dir + '/**/*.wav', recursive=True) 23 | wavfile_paths = sorted(wavfile_paths) 24 | print("Number of wav files: ", len(wavfile_paths)) 25 | if num_workers > 1: 26 | for wav_path in wavfile_paths: 27 | futures.append(executor.submit( 28 | partial(_compute_spkEmbed, out_dir, wav_path, weights_fpath))) 29 | return [future.result() for future in tqdm(futures)] 30 | else: 31 | for wav_path in wavfile_paths: 32 | _compute_spkEmbed(out_dir, wav_path, weights_fpath) 33 | 34 | def _compute_spkEmbed(out_dir, wav_path, weights_fpath): 35 | utt_id = os.path.splitext(os.path.basename(wav_path))[0] 36 | fname_save = os.path.join(out_dir, f"{utt_id}.npy") 37 | if os.path.isfile(fname_save): 38 | print("Ignored because it is already computed: ", fname_save) 39 | return os.path.basename(fname_save) 40 | fpath = Path(wav_path) 41 | wav = preprocess_wav(fpath) 42 | 43 | encoder = SpeakerEncoder(weights_fpath) 44 | embed = encoder.embed_utterance(wav) 45 | np.save(fname_save, embed, allow_pickle=False) 46 | return os.path.basename(fname_save) 47 | 48 | 49 | def preprocess(in_dir, out_dir, spk, weights_fpath, num_workers): 50 | out_dir = os.path.join(out_dir, spk) 51 | os.makedirs(out_dir, exist_ok=True) 52 | metadata = build_from_path(in_dir, out_dir, weights_fpath, num_workers) 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--in-dir', type=str, 58 | default='dataset') 59 | parser.add_argument('--num-workers', type=int, default=8) 60 | parser.add_argument('--out-dir', type=str, 61 | default='dataset/spk_embeddings') 62 | parser.add_argument('--spk-encoder-ckpt', type=str, 63 |
default='models/speaker_encoder/ckpt/pretrained_bak_5805000.pt') 64 | 65 | args = parser.parse_args() 66 | 67 | sub_folder_list = os.listdir(args.in_dir) 68 | sub_folder_list.sort() 69 | 70 | args.num_workers = args.num_workers if args.num_workers is not None else cpu_count() 71 | print("Number of workers: ", args.num_workers) 72 | ckpt_step = os.path.basename(args.spk_encoder_ckpt).split('.')[0].split('_')[-1] 73 | spk_embed_out_dir = args.out_dir 74 | print("[INFO] spk_embed_out_dir: ", spk_embed_out_dir) 75 | os.makedirs(spk_embed_out_dir, exist_ok=True) 76 | 77 | for spk in sub_folder_list: 78 | print("Preprocessing {} ...".format(spk)) 79 | in_dir = os.path.join(args.in_dir, spk) 80 | if not os.path.isdir(in_dir): 81 | continue 82 | preprocess(in_dir, spk_embed_out_dir, spk, 83 | args.spk_encoder_ckpt, args.num_workers) 84 | 85 | print("DONE!") 86 | sys.exit(0) 87 | -------------------------------------------------------------------------------- /scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | EXPERIMENT_PATH="/raid/alefiury/svc/free-svc/logs/config-online-language-emb.yaml/2024-02-23/10-11-19" 3 | INPUT_BASE_DIR="/raid/lucasgris/free-svc" 4 | 5 | HPFILE=$EXPERIMENT_PATH"/.hydra/config.yaml" 6 | PTFILE=$EXPERIMENT_PATH"/G_00012_0200000.pth" 7 | METADATA_PATH="/raid/lucasgris/free-svc/data/in_domain_transcriptions_weighted_spks.csv" 8 | IGNORE_METADATA_HEADER=true 9 | SPK_EMB_BASE_DIR="/raid/lucasgris/free-svc/data/spk_embeddings" 10 | PITCH_PREDICTOR="rmvpe" 11 | OUT_DIR=$EXPERIMENT_PATH"/audios" 12 | USE_TIMESTAMP=false 13 | CONCAT_AUDIO=false 14 | PITCH_FACTOR=0.9544 15 | 16 | python3 scripts/inference.py \ 17 | --hpfile=$HPFILE \ 18 | --ptfile=$PTFILE \ 19 | --input-base-dir=$INPUT_BASE_DIR \ 20 | --metadata-path=$METADATA_PATH \ 21 | --ignore-metadata-header=$IGNORE_METADATA_HEADER \ 22 | --spk-emb-base-dir=$SPK_EMB_BASE_DIR \ 23 | --pitch-predictor=$PITCH_PREDICTOR \ 24 | --out-dir=$OUT_DIR \ 25 | --pitch-factor=$PITCH_FACTOR -------------------------------------------------------------------------------- /scripts/run_inference_parallel.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | EXPERIMENT_PATH="/raid/alefiury/svc/free-svc/logs/config-online-language-emb.yaml/2024-02-23/10-11-19" 3 | INPUT_BASE_DIR="/raid/lucasgris/free-svc" 4 | 5 | HPFILE=$EXPERIMENT_PATH"/.hydra/config.yaml" 6 | PTFILE=$EXPERIMENT_PATH"/G_00012_0200000.pth" 7 | METADATA_PATH="/raid/lucasgris/free-svc/data/in_domain_transcriptions_weighted_spks.csv" 8 | IGNORE_METADATA_HEADER=true 9 | SPK_EMB_BASE_DIR="/raid/lucasgris/free-svc/data/spk_embeddings" 10 | PITCH_PREDICTOR="rmvpe" 11 | OUT_DIR=$EXPERIMENT_PATH"/audios" 12 | USE_TIMESTAMP=false 13 | CONCAT_AUDIO=false 14 | PITCH_FACTOR=0.9544 15 | 16 | NUM_WORKERS=4 17 | 18 | CUDA_LAUNCH_BLOCKING=1 python3 scripts/inference_parallel.py \ 19 | --hpfile=$HPFILE \ 20 | --ptfile=$PTFILE \ 21 | --input-base-dir=$INPUT_BASE_DIR \ 22 | --metadata-path=$METADATA_PATH \ 23 | --ignore-metadata-header=$IGNORE_METADATA_HEADER \ 24 | --spk-emb-base-dir=$SPK_EMB_BASE_DIR \ 25 | --pitch-predictor=$PITCH_PREDICTOR \ 26 | --out-dir=$OUT_DIR \ 27 | --pitch-factor=$PITCH_FACTOR \ 28 | --num-workers=$NUM_WORKERS --------------------------------------------------------------------------------
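A few illustrative sketches follow for the self-supervised objectives configured earlier in singer_identity/train_configs. The contrastive*.yaml files expose use_contrastive_loss, temp, nr_negative and decouple; as a rough, runnable sketch of how such flags typically map onto a decoupled InfoNCE loss (the hardest-negative subsampling shown here is an assumption — the repo's own implementation lives in singer_identity/losses.py and may differ):

import torch
import torch.nn.functional as F

def decoupled_nt_xent(z1, z2, temp=0.2, nr_negative=250, decouple=True):
    # z1, z2: (B, D) projections of two augmented views of the same singers.
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    sim = z1 @ z2.t() / temp                     # (B, B) scaled cosine similarities
    pos = sim.diag()                             # positives: same clip, two views
    neg = sim.masked_fill(torch.eye(len(z1), dtype=torch.bool), float("-inf"))
    k = min(nr_negative, neg.size(1) - 1)        # keep at most nr_negative negatives per anchor
    neg = neg.topk(k, dim=1).values
    if decouple:
        denom = torch.logsumexp(neg, dim=1)      # decoupled: positive excluded from the denominator
    else:
        denom = torch.logsumexp(torch.cat([pos[:, None], neg], dim=1), dim=1)
    return (denom - pos).mean()

print(float(decoupled_nt_xent(torch.randn(8, 128), torch.randn(8, 128))))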
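Similarly, vicreg.yaml toggles use_invariance_loss, use_variance_reg and use_covariance_reg with weights fact_inv_loss: 25, fact_var: 25, fact_cov: 100 and gamma: 1. A minimal sketch of the standard VICReg terms those names refer to, using the config's weights as defaults (the exact formulation in singer_identity/losses.py may differ in details):

import torch
import torch.nn.functional as F

def vicreg_terms(z1, z2, fact_inv=25.0, fact_var=25.0, fact_cov=100.0, gamma=1.0):
    b, d = z1.shape
    inv = F.mse_loss(z1, z2)                      # invariance between the two views

    def variance(z):
        std = torch.sqrt(z.var(dim=0) + 1e-4)
        return torch.relu(gamma - std).mean()     # hinge keeps each dimension's std above gamma

    def covariance(z):
        z = z - z.mean(dim=0)
        cov = (z.t() @ z) / (b - 1)
        off_diag = cov - torch.diag(torch.diag(cov))
        return off_diag.pow(2).sum() / d          # penalise off-diagonal covariance

    var = variance(z1) + variance(z2)
    cov = covariance(z1) + covariance(z2)
    return fact_inv * inv + fact_var * var + fact_cov * cov

print(float(vicreg_terms(torch.randn(16, 128), torch.randn(16, 128))))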
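uniformity-alignment.yaml and the HypersphereEvaluation callback refer to the alignment and uniformity measures of Wang & Isola; a compact sketch, assuming L2-normalised projections and the unit weights fact_align_loss: 1 / fact_unif_loss: 1 from the config:

import torch

def align_loss(x, y, alpha=2):
    # expects L2-normalised embeddings of positive pairs
    return (x - y).norm(p=2, dim=1).pow(alpha).mean()

def uniform_loss(x, t=2):
    # log of the average pairwise Gaussian potential on the hypersphere
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()

x = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)
y = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)
loss = 1.0 * align_loss(x, y) + 1.0 * uniform_loss(x)   # fact_align_loss / fact_unif_loss
print(float(loss))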
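Finally, the EEREvaluation callback configured in the training config above logs an equal error rate over verification pairs (see metadata/*/speaker_pairs.txt). A small reference sketch of that metric on toy similarity scores (the callback's actual implementation may compute it differently):

import numpy as np

def equal_error_rate(scores, labels):
    # scores: similarity per pair; labels: 1 for same-singer pairs, 0 for different-singer pairs.
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels, dtype=int)
    pos, neg = scores[labels == 1], scores[labels == 0]
    eer, best_gap = 1.0, np.inf
    for thr in np.unique(scores):
        far = float(np.mean(neg >= thr))   # false acceptance rate
        frr = float(np.mean(pos < thr))    # false rejection rate
        if abs(far - frr) < best_gap:      # EER is where the two rates cross
            best_gap, eer = abs(far - frr), (far + frr) / 2
    return eer

scores = [0.9, 0.8, 0.75, 0.4, 0.3, 0.2]
labels = [1, 1, 0, 1, 0, 0]
print(equal_error_rate(scores, labels))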