├── audiocodecs ├── example.wav ├── version.py ├── __init__.py ├── codec.py ├── focalcodec.py ├── wavlm_kmeans.py ├── bigcodec.py ├── wavtokenizer.py └── speechtokenizer.py ├── downstream ├── metrics │ ├── model_v8.onnx │ ├── utmos.py │ ├── stoi.py │ ├── pesq.py │ ├── mel_distance.py │ ├── stft_distance.py │ ├── dwer.py │ └── speaker_similarity.py ├── requirements.txt ├── models │ ├── pooling.py │ ├── speaker_encoder.py │ └── multihead.py └── hparams │ ├── sr │ ├── MLS │ │ ├── bigcodec.yaml │ │ ├── mimi.yaml │ │ ├── dac.yaml │ │ ├── speechtokenizer.yaml │ │ ├── encodec.yaml │ │ ├── stablecodec.yaml │ │ ├── wavlm_kmeans.yaml │ │ ├── focalcodec.yaml │ │ ├── semanticodec.yaml │ │ └── wavtokenizer.yaml │ ├── VoiceBank │ │ ├── bigcodec.yaml │ │ ├── mimi.yaml │ │ ├── dac.yaml │ │ ├── speechtokenizer.yaml │ │ ├── encodec.yaml │ │ ├── stablecodec.yaml │ │ ├── wavlm_kmeans.yaml │ │ ├── focalcodec.yaml │ │ ├── semanticodec.yaml │ │ └── wavtokenizer.yaml │ ├── LibriSpeech │ │ ├── bigcodec.yaml │ │ ├── mimi.yaml │ │ ├── dac.yaml │ │ ├── speechtokenizer.yaml │ │ ├── encodec.yaml │ │ ├── stablecodec.yaml │ │ ├── wavlm_kmeans.yaml │ │ ├── focalcodec.yaml │ │ ├── semanticodec.yaml │ │ └── wavtokenizer.yaml │ └── LibriMix │ │ ├── bigcodec.yaml │ │ ├── mimi.yaml │ │ ├── dac.yaml │ │ ├── speechtokenizer.yaml │ │ ├── encodec.yaml │ │ ├── stablecodec.yaml │ │ ├── wavlm_kmeans.yaml │ │ ├── focalcodec.yaml │ │ ├── semanticodec.yaml │ │ └── wavtokenizer.yaml │ └── vc │ └── VCTK │ ├── mimi.yaml │ ├── bigcodec.yaml │ ├── dac.yaml │ ├── speechtokenizer.yaml │ ├── encodec.yaml │ ├── stablecodec.yaml │ ├── wavlm_kmeans.yaml │ ├── focalcodec.yaml │ ├── semanticodec.yaml │ └── wavtokenizer.yaml ├── MANIFEST.in ├── setup.cfg ├── .pre-commit-config.yaml ├── requirements.txt ├── .gitignore ├── setup.py └── NOTICE /audiocodecs/example.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lucadellalib/audiocodecs/HEAD/audiocodecs/example.wav -------------------------------------------------------------------------------- /downstream/metrics/model_v8.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucadellalib/audiocodecs/HEAD/downstream/metrics/model_v8.onnx -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include audiocodecs/example.wav 3 | global-exclude __pycache__ 4 | global-exclude *.py[co] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | force_grid_wrap = 0 3 | include_trailing_comma = True 4 | line_length = 88 5 | lines_after_imports = 2 6 | multi_line_output = 3 7 | skip_gitignore = True 8 | use_parentheses = True -------------------------------------------------------------------------------- /downstream/requirements.txt: -------------------------------------------------------------------------------- 1 | ctranslate2>=4.0,<=4.4.0 # Required by faster-whisper 2 | faster-whisper 3 | librosa>=0.9.2 4 | numpy>=1.22.0 5 | onnxruntime>=1.16.3 6 | pesq 7 | ptflops 8 | torch 9 | torchaudio 10 | torchmetrics[audio] 11 | transformers>=4.45.1 12 | speechbrain @ git+https://github.com/lucadellalib/speechbrain@50ffdc772c0d977390025ee7787735db9b92488c#egg=speechbrain 13 | # Install local audiocodecs package in editable mode 14 | -e ../ 15 | -e ../[all] -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.5.0 4 | hooks: 5 | - id: 
trailing-whitespace 6 | types: [file, text] 7 | - id: end-of-file-fixer 8 | types: [python] 9 | - id: mixed-line-ending 10 | types: [python] 11 | args: ["--fix=lf"] 12 | - id: debug-statements 13 | types: [python] 14 | 15 | - repo: https://github.com/psf/black 16 | rev: 24.3.0 17 | hooks: 18 | - id: black 19 | types: [python] 20 | 21 | - repo: https://github.com/pycqa/isort 22 | rev: 5.13.2 23 | hooks: 24 | - id: isort -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beartype # Required by SpeechTokenizer 2 | descript-audio-codec 3 | huggingface_hub 4 | speechtokenizer 5 | tensorboard # Required by SpeechTokenizer 6 | torch 7 | torchaudio 8 | transformers 9 | vocos 10 | bigcodec @ git+https://github.com/lucadellalib/BigCodec.git@main#egg=bigcodec 11 | semanticodec @ git+https://github.com/haoheliu/SemantiCodec-inference@8dc464c3385d2389a695ed3f718f4a0caf3ed33f#egg=semanticodec 12 | speechbrain @ git+https://github.com/lucadellalib/speechbrain@50ffdc772c0d977390025ee7787735db9b92488c#egg=speechbrain 13 | stable_codec @ git+https://github.com/lucadellalib/stable-codec.git@main#egg=stable_codec 14 | wavtokenizer @ git+https://github.com/lucadellalib/WavTokenizer.git@main#egg=wavtokenizer -------------------------------------------------------------------------------- /audiocodecs/version.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | """Version according to SemVer versioning system (https://semver.org/).""" 18 | 19 | 20 | __all__ = [ 21 | "VERSION", 22 | ] 23 | 24 | 25 | _MAJOR = "0" # Major version to increment in case of incompatible API changes 26 | 27 | _MINOR = ( 28 | "0" # Minor version to increment in case of backward compatible new functionality 29 | ) 30 | 31 | _PATCH = "1" # Patch version to increment in case of backward compatible bug fixes 32 | 33 | VERSION = f"{_MAJOR}.{_MINOR}.{_PATCH}" 34 | """The package version.""" 35 | -------------------------------------------------------------------------------- /audiocodecs/__init__.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | 17 | from audiocodecs.bigcodec import BigCodec 18 | from audiocodecs.codec import Codec 19 | from audiocodecs.dac import DAC 20 | from audiocodecs.encodec import Encodec 21 | from audiocodecs.focalcodec import FocalCodec 22 | from audiocodecs.mimi import Mimi 23 | from audiocodecs.semanticodec import SemantiCodec 24 | from audiocodecs.speechtokenizer import SpeechTokenizer 25 | from audiocodecs.stablecodec import StableCodec 26 | from audiocodecs.version import VERSION as __version__ 27 | from audiocodecs.wavlm_kmeans import WavLMKmeans 28 | from audiocodecs.wavtokenizer import WavTokenizer 29 | -------------------------------------------------------------------------------- /downstream/models/pooling.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | 17 | """Pooling layers.""" 18 | 19 | import torch 20 | from torch import nn 21 | 22 | 23 | __all__ = ["LinearPooling"] 24 | 25 | 26 | class LinearPooling(nn.Module): 27 | def __init__(self, num_channels): 28 | super().__init__() 29 | self.num_channels = num_channels 30 | if num_channels == 1: 31 | self.mlp = nn.Identity() 32 | else: 33 | self.mlp = nn.Linear(num_channels, 1, bias=False) 34 | 35 | def forward(self, x): 36 | # (B, N, K, H) 37 | x = x.movedim(-1, -2) 38 | # (B, N, H) 39 | x = self.mlp(x)[..., 0] 40 | return x 41 | 42 | 43 | if __name__ == "__main__": 44 | x = torch.randn(2, 100, 3, 256) 45 | model = LinearPooling(3) 46 | y = model(x) 47 | print(x.shape) 48 | print(y.shape) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/* 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # PyCharm 118 | .idea/ 119 | 120 | # Data 121 | data/ 122 | 123 | # Results 124 | results/ 125 | 126 | reconstruction.wav -------------------------------------------------------------------------------- /downstream/metrics/utmos.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | """UTokyo-SaruLab System for VoiceMOS Challenge 2022 (UTMOS) (see https://arxiv.org/abs/2204.02152).""" 18 | 19 | import torch 20 | import torchaudio 21 | from speechbrain.utils.metric_stats import MetricStats 22 | 23 | 24 | __all__ = ["UTMOS"] 25 | 26 | 27 | SAMPLE_RATE = 16000 28 | 29 | 30 | class UTMOS(MetricStats): 31 | def __init__(self, sample_rate, model=None): 32 | self.sample_rate = sample_rate 33 | self.model = model 34 | if model is None: 35 | self.model = torch.hub.load( 36 | "tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True 37 | ) 38 | self.clear() 39 | 40 | @torch.no_grad() 41 | def append(self, ids, sig, lens=None): 42 | assert sig.ndim == 2 43 | 44 | # Resample 45 | hyp_sig = torchaudio.functional.resample(sig, self.sample_rate, SAMPLE_RATE) 46 | 47 | self.model.to(hyp_sig.device) 48 | self.model.eval() 49 | 50 | # Forward 51 | scores = self.model(hyp_sig, SAMPLE_RATE) 52 | 53 | self.ids += ids 54 | self.scores += scores.cpu().tolist() 55 | 56 | 57 | if __name__ == "__main__": 58 | sample_rate = 24000 59 | ids = ["A", "B"] 60 | hyp_sig = torch.randn(2, 2 * sample_rate) 61 | 62 | utmos = UTMOS(sample_rate) 63 | utmos.append(ids, hyp_sig) 64 | print(utmos.summarize("average")) 65 | -------------------------------------------------------------------------------- /downstream/metrics/stoi.py: -------------------------------------------------------------------------------- 1 | # 
============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | """Short-time objective intelligibility (STOI) (see https://ieeexplore.ieee.org/abstract/document/5495701).""" 18 | 19 | import torch 20 | import torchaudio 21 | from speechbrain.utils.metric_stats import MetricStats 22 | from torchmetrics.functional.audio.stoi import short_time_objective_intelligibility 23 | 24 | 25 | __all__ = ["STOI"] 26 | 27 | 28 | SAMPLE_RATE = 16000 29 | 30 | 31 | class STOI(MetricStats): 32 | def __init__(self, sample_rate): 33 | self.sample_rate = sample_rate 34 | self.clear() 35 | 36 | @torch.no_grad() 37 | def append(self, ids, hyp_sig, ref_sig, lens=None): 38 | assert hyp_sig.shape == ref_sig.shape 39 | assert hyp_sig.ndim == 2 40 | 41 | # Resample 42 | hyp_sig = torchaudio.functional.resample(hyp_sig, self.sample_rate, SAMPLE_RATE) 43 | ref_sig = torchaudio.functional.resample(ref_sig, self.sample_rate, SAMPLE_RATE) 44 | 45 | scores = [ 46 | short_time_objective_intelligibility( 47 | hyp.cpu(), ref.cpu(), SAMPLE_RATE 48 | ).float() 49 | for hyp, ref in zip(hyp_sig, ref_sig) 50 | ] 51 | 52 | self.ids += ids 53 | self.scores += scores 54 | 55 | 56 | if __name__ == "__main__": 57 | sample_rate = 24000 58 | ids = ["A", "B"] 59 | hyp_sig = 
torch.randn(2, 2 * sample_rate) 60 | ref_sig = torch.randn(2, 2 * sample_rate) 61 | 62 | stoi = STOI(sample_rate) 63 | stoi.append(ids, hyp_sig, ref_sig) 64 | print(stoi.summarize("average")) 65 | -------------------------------------------------------------------------------- /downstream/metrics/pesq.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | 17 | """Perceptual evaluation of speech quality (PESQ) (see https://en.wikipedia.org/wiki/Perceptual_Evaluation_of_Speech_Quality).""" 18 | 19 | import os 20 | import sys 21 | 22 | import torch 23 | import torchaudio 24 | from speechbrain.utils.metric_stats import MetricStats 25 | from torchmetrics.functional.audio.pesq import perceptual_evaluation_speech_quality 26 | 27 | 28 | __all__ = ["PESQ"] 29 | 30 | 31 | SAMPLE_RATE = 16000 32 | 33 | 34 | class PESQ(MetricStats): 35 | def __init__(self, sample_rate): 36 | self.sample_rate = sample_rate 37 | self.clear() 38 | 39 | @torch.no_grad() 40 | def append(self, ids, hyp_sig, ref_sig, lens=None): 41 | assert hyp_sig.shape == ref_sig.shape 42 | assert hyp_sig.ndim == 2 43 | 44 | # Resample 45 | hyp_sig = torchaudio.functional.resample(hyp_sig, self.sample_rate, SAMPLE_RATE) 46 | ref_sig = torchaudio.functional.resample(ref_sig, self.sample_rate, SAMPLE_RATE) 47 | 48 | # Workaround to avoid name collisions with installed modules 49 | root_dir = os.path.dirname(os.path.realpath(__file__)) 50 | sys_path = [x for x in sys.path] 51 | sys.path = [x for x in sys.path if root_dir not in x] 52 | scores = [ 53 | perceptual_evaluation_speech_quality(hyp, ref, SAMPLE_RATE, "wb").cpu() 54 | for hyp, ref in zip(hyp_sig, ref_sig) 55 | ] 56 | sys.path = sys_path 57 | 58 | self.ids += ids 59 | self.scores += scores 60 | 61 | 62 | if __name__ == "__main__": 63 | sample_rate = 24000 64 | ids = ["A", "B"] 65 | hyp_sig = torch.randn(2, 2 * sample_rate) 66 | ref_sig = torch.randn(2, 2 * sample_rate) 67 | 68 | pesq = PESQ(sample_rate) 69 | pesq.append(ids, hyp_sig, ref_sig) 70 | print(pesq.summarize("average")) 71 | -------------------------------------------------------------------------------- /downstream/models/speaker_encoder.py: -------------------------------------------------------------------------------- 1 | # 
============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | """Speaker encoders.""" 18 | 19 | import torch 20 | import torchaudio 21 | from speechbrain.dataio.dataio import length_to_mask 22 | from transformers import AutoModelForAudioXVector 23 | 24 | 25 | __all__ = ["WavLM"] 26 | 27 | 28 | SAMPLE_RATE = 16000 29 | 30 | 31 | class WavLM(torch.nn.Module): 32 | def __init__(self, model_hub, save_path, sample_rate, pool=True): 33 | super().__init__() 34 | self.model_hub = model_hub 35 | self.save_path = save_path 36 | self.sample_rate = sample_rate 37 | self.pool = pool 38 | self.model = AutoModelForAudioXVector.from_pretrained( 39 | model_hub, cache_dir=save_path 40 | ) 41 | 42 | @torch.no_grad() 43 | def forward(self, sig, lens=None): 44 | # Resample 45 | sig = torchaudio.functional.resample(sig, self.sample_rate, SAMPLE_RATE) 46 | 47 | self.model.to(sig.device) 48 | self.model.eval() 49 | 50 | # Attention mask 51 | attention_mask = None 52 | if lens is not None: 53 | abs_length = lens * sig.shape[-1] 54 | attention_mask = length_to_mask( 55 | abs_length.int() 56 | ).long() # 0 for masked tokens 57 | 58 | # Forward 59 | embs = self.model( 60 | input_values=sig, 61 | attention_mask=attention_mask, 62 | output_attentions=False, 63 
| ) 64 | 65 | if self.pool: 66 | return embs.embeddings 67 | 68 | return embs.hidden_states[-1] 69 | 70 | 71 | if __name__ == "__main__": 72 | from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE 73 | 74 | x = torch.randn(1, 16000) 75 | model = WavLM( 76 | "microsoft/wavlm-base-sv", 77 | HUGGINGFACE_HUB_CACHE, 78 | 16000, 79 | pool=True, 80 | ) 81 | y = model(x) 82 | print(y.shape) 83 | -------------------------------------------------------------------------------- /downstream/metrics/mel_distance.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | 17 | """Mel distance.""" 18 | 19 | import torch 20 | import torchaudio 21 | from speechbrain.utils.metric_stats import MetricStats 22 | 23 | 24 | __all__ = ["MelDistance"] 25 | 26 | 27 | SAMPLE_RATE = 16000 28 | 29 | 30 | class MelDistance(MetricStats): 31 | def __init__(self, sample_rate, n_mels=80, n_fft=1024, hop_length=320): 32 | self.sample_rate = sample_rate 33 | self.n_mels = n_mels 34 | self.n_fft = n_fft 35 | self.hop_length = hop_length 36 | 37 | self.mel_spec = torchaudio.transforms.MelSpectrogram( 38 | sample_rate=SAMPLE_RATE, 39 | n_fft=n_fft, 40 | hop_length=hop_length, 41 | n_mels=n_mels, 42 | power=1.0, 43 | ) 44 | self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB() 45 | self.clear() 46 | 47 | @torch.no_grad() 48 | def append(self, ids, hyp_sig, ref_sig, lens=None): 49 | assert hyp_sig.shape == ref_sig.shape 50 | assert hyp_sig.ndim == 2 51 | 52 | # Resample to standard sample rate 53 | hyp_sig = torchaudio.functional.resample(hyp_sig, self.sample_rate, SAMPLE_RATE) 54 | ref_sig = torchaudio.functional.resample(ref_sig, self.sample_rate, SAMPLE_RATE) 55 | 56 | self.mel_spec.to(hyp_sig.device) 57 | hyp_mel = self.amplitude_to_db(self.mel_spec(hyp_sig)) 58 | ref_mel = self.amplitude_to_db(self.mel_spec(ref_sig)) 59 | 60 | # Compute L2 distance between Mel spectrograms 61 | scores = (hyp_mel - ref_mel).norm(dim=1).mean(dim=1).cpu().tolist() 62 | 63 | self.ids += ids 64 | self.scores += scores 65 | 66 | 67 | if __name__ == "__main__": 68 | sample_rate = 24000 69 | ids = ["A", "B"] 70 | hyp_sig = torch.randn(2, 2 * sample_rate) 71 | ref_sig = torch.randn(2, 2 * sample_rate) 72 | 73 | mel_dist = MelDistance(sample_rate) 74 | mel_dist.append(ids, hyp_sig, ref_sig) 75 | print(mel_dist.summarize("average")) 76 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/bigcodec.yaml: 
-------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: bigcodec 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 8192 40 | num_codebooks: 1 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.BigCodec 50 | sample_rate: !ref 51 | mode: !ref 52 | 53 | # Performance metrics 54 | utmos_computer: !name:metrics.utmos.UTMOS 55 | sample_rate: !ref 56 | 57 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 58 | sample_rate: !ref 59 | 60 | stoi_computer: !name:metrics.stoi.STOI 61 | sample_rate: !ref 62 | 63 | pesq_computer: !name:metrics.pesq.PESQ 64 | sample_rate: !ref 65 | 66 | meld_computer: !name:metrics.mel_distance.MelDistance 67 | sample_rate: !ref 68 | 69 | stftd_computer: !name:metrics.stft_distance.STFTDistance 70 | sample_rate: !ref 71 | 72 | dwer_computer: 
!name:metrics.dwer.DWER 73 | model_hub: !ref 74 | sample_rate: !ref 75 | save_path: !ref 76 | 77 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 78 | model_hub: !ref 79 | save_path: !ref 80 | sample_rate: !ref 81 | 82 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 86 | 87 | # Counters, checkpointers, loggers, etc. 88 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 89 | save_file: !ref /train_log.txt 90 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/bigcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: bigcodec 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 8192 39 | num_codebooks: 1 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: 
small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.BigCodec 49 | sample_rate: !ref 50 | mode: !ref 51 | 52 | # Performance metrics 53 | utmos_computer: !name:metrics.utmos.UTMOS 54 | sample_rate: !ref 55 | 56 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 57 | sample_rate: !ref 58 | 59 | stoi_computer: !name:metrics.stoi.STOI 60 | sample_rate: !ref 61 | 62 | pesq_computer: !name:metrics.pesq.PESQ 63 | sample_rate: !ref 64 | 65 | meld_computer: !name:metrics.mel_distance.MelDistance 66 | sample_rate: !ref 67 | 68 | stftd_computer: !name:metrics.stft_distance.STFTDistance 69 | sample_rate: !ref 70 | 71 | dwer_computer: !name:metrics.dwer.DWER 72 | model_hub: !ref 73 | sample_rate: !ref 74 | save_path: !ref 75 | 76 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 77 | model_hub: !ref 78 | save_path: !ref 79 | sample_rate: !ref 80 | 81 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 82 | model_hub: !ref 83 | sample_rate: !ref 84 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 85 | 86 | # Counters, checkpointers, loggers, etc. 
87 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 88 | save_file: !ref /train_log.txt 89 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/bigcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: bigcodec 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 8192 39 | num_codebooks: 1 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.BigCodec 49 | sample_rate: !ref 50 | mode: !ref 51 | 52 | # Performance metrics 53 | utmos_computer: !name:metrics.utmos.UTMOS 54 | sample_rate: !ref 55 | 56 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 57 | sample_rate: !ref 58 | 59 | stoi_computer: !name:metrics.stoi.STOI 60 | sample_rate: !ref 61 | 62 | pesq_computer: 
!name:metrics.pesq.PESQ 63 | sample_rate: !ref 64 | 65 | meld_computer: !name:metrics.mel_distance.MelDistance 66 | sample_rate: !ref 67 | 68 | stftd_computer: !name:metrics.stft_distance.STFTDistance 69 | sample_rate: !ref 70 | 71 | dwer_computer: !name:metrics.dwer.DWER 72 | model_hub: !ref 73 | sample_rate: !ref 74 | save_path: !ref 75 | 76 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 77 | model_hub: !ref 78 | save_path: !ref 79 | sample_rate: !ref 80 | 81 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 82 | model_hub: !ref 83 | sample_rate: !ref 84 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 85 | 86 | # Counters, checkpointers, loggers, etc. 87 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 88 | save_file: !ref /train_log.txt 89 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/mimi.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: mimi 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test 
parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 2048 40 | num_codebooks: 5 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.Mimi 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/mimi.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: mimi 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 2048 39 | num_codebooks: 5 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.Mimi 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | mode: !ref 52 | 53 | # Performance metrics 54 | utmos_computer: !name:metrics.utmos.UTMOS 55 | sample_rate: !ref 56 | 57 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 58 | sample_rate: !ref 59 | 60 | stoi_computer: !name:metrics.stoi.STOI 61 | sample_rate: !ref 62 | 63 | pesq_computer: 
!name:metrics.pesq.PESQ 64 | sample_rate: !ref 65 | 66 | meld_computer: !name:metrics.mel_distance.MelDistance 67 | sample_rate: !ref 68 | 69 | stftd_computer: !name:metrics.stft_distance.STFTDistance 70 | sample_rate: !ref 71 | 72 | dwer_computer: !name:metrics.dwer.DWER 73 | model_hub: !ref 74 | sample_rate: !ref 75 | save_path: !ref 76 | 77 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 78 | model_hub: !ref 79 | save_path: !ref 80 | sample_rate: !ref 81 | 82 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 86 | 87 | # Counters, checkpointers, loggers, etc. 88 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 89 | save_file: !ref /train_log.txt 90 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/mimi.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: mimi 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # 
Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 2048 39 | num_codebooks: 5 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.Mimi 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | mode: !ref 52 | 53 | # Performance metrics 54 | utmos_computer: !name:metrics.utmos.UTMOS 55 | sample_rate: !ref 56 | 57 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 58 | sample_rate: !ref 59 | 60 | stoi_computer: !name:metrics.stoi.STOI 61 | sample_rate: !ref 62 | 63 | pesq_computer: !name:metrics.pesq.PESQ 64 | sample_rate: !ref 65 | 66 | meld_computer: !name:metrics.mel_distance.MelDistance 67 | sample_rate: !ref 68 | 69 | stftd_computer: !name:metrics.stft_distance.STFTDistance 70 | sample_rate: !ref 71 | 72 | dwer_computer: !name:metrics.dwer.DWER 73 | model_hub: !ref 74 | sample_rate: !ref 75 | save_path: !ref 76 | 77 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 78 | model_hub: !ref 79 | save_path: !ref 80 | sample_rate: !ref 81 | 82 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 86 | 87 | # Counters, checkpointers, loggers, etc. 
88 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 89 | save_file: !ref /train_log.txt 90 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/mimi.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: mimi 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 2048 40 | num_codebooks: 5 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.Mimi 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | 
pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/bigcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: bigcodec 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 
60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 8192 42 | num_codebooks: 1 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.BigCodec 52 | sample_rate: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 
90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/dac.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: dac 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.DAC 50 | sample_rate: !ref 51 | orig_sample_rate: 16000 52 | num_codebooks: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | sample_rate: !ref 64 | 
65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/bigcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: bigcodec 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | 
test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 8192 40 | num_codebooks: 1 41 | mode: reconstruct 42 | 43 | # k-NN 44 | topk: 4 45 | num_splits: 1 46 | 47 | # Performance metrics parameters 48 | dwer_hub: small 49 | wavlm_sim_hub: microsoft/wavlm-base-sv 50 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 51 | 52 | # Codec 53 | codec: !new:audiocodecs.BigCodec 54 | sample_rate: !ref 55 | mode: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 | 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 
92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/dac.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: dac 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.DAC 49 | sample_rate: !ref 50 | orig_sample_rate: 16000 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: 
!ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: speechtokenizer 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | 
test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.SpeechTokenizer 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/dac.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: dac 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.DAC 49 | sample_rate: !ref 50 | orig_sample_rate: 16000 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 
63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: speechtokenizer 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | 
test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.SpeechTokenizer 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | mode: !ref 52 | 53 | # Performance metrics 54 | utmos_computer: !name:metrics.utmos.UTMOS 55 | sample_rate: !ref 56 | 57 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 58 | sample_rate: !ref 59 | 60 | stoi_computer: !name:metrics.stoi.STOI 61 | sample_rate: !ref 62 | 63 | pesq_computer: !name:metrics.pesq.PESQ 64 | sample_rate: !ref 65 | 66 | meld_computer: !name:metrics.mel_distance.MelDistance 67 | sample_rate: !ref 68 | 69 | stftd_computer: !name:metrics.stft_distance.STFTDistance 70 | sample_rate: !ref 71 | 72 | dwer_computer: !name:metrics.dwer.DWER 73 | model_hub: !ref 74 | sample_rate: !ref 75 | save_path: !ref 76 | 77 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 78 | model_hub: !ref 79 | save_path: !ref 80 | sample_rate: !ref 81 | 82 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 86 | 87 | # Counters, checkpointers, loggers, etc. 
88 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 89 | save_file: !ref /train_log.txt 90 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/encodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: encodec 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.Encodec 50 | sample_rate: !ref 51 | orig_sample_rate: 24000 52 | num_codebooks: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | 
sample_rate: !ref <sample_rate> 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref <sample_rate> 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref <sample_rate> 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref <sample_rate> 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref <dwer_hub> 76 | sample_rate: !ref <sample_rate> 77 | save_path: !ref <cache_folder> 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref <wavlm_sim_hub> 81 | save_path: !ref <cache_folder> 82 | sample_rate: !ref <sample_rate> 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref <ecapatdnn_sim_hub> 86 | sample_rate: !ref <sample_rate> 87 | save_path: !apply:os.path.join [!ref <cache_folder>, models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref <output_folder>/train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: speechtokenizer 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref <data_folder>/testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results/<task>/<dataset>/<experiment_name>/<seed> 20 | save_folder: !ref <output_folder>/save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # 
Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.SpeechTokenizer 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | mode: !ref 52 | 53 | # Performance metrics 54 | utmos_computer: !name:metrics.utmos.UTMOS 55 | sample_rate: !ref 56 | 57 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 58 | sample_rate: !ref 59 | 60 | stoi_computer: !name:metrics.stoi.STOI 61 | sample_rate: !ref 62 | 63 | pesq_computer: !name:metrics.pesq.PESQ 64 | sample_rate: !ref 65 | 66 | meld_computer: !name:metrics.mel_distance.MelDistance 67 | sample_rate: !ref 68 | 69 | stftd_computer: !name:metrics.stft_distance.STFTDistance 70 | sample_rate: !ref 71 | 72 | dwer_computer: !name:metrics.dwer.DWER 73 | model_hub: !ref 74 | sample_rate: !ref 75 | save_path: !ref 76 | 77 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 78 | model_hub: !ref 79 | save_path: !ref 80 | sample_rate: !ref 81 | 82 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 86 | 87 | # Counters, checkpointers, loggers, etc. 
88 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 89 | save_file: !ref /train_log.txt 90 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/dac.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: dac 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.DAC 50 | sample_rate: !ref 51 | orig_sample_rate: 16000 52 | num_codebooks: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | 
sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/encodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: encodec 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | 
test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.Encodec 49 | sample_rate: !ref 50 | orig_sample_rate: 24000 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: speechtokenizer 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.SpeechTokenizer 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | 
sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/encodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: encodec 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | 
test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 1024 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.Encodec 49 | sample_rate: !ref 50 | orig_sample_rate: 24000 51 | num_codebooks: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/mimi.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: mimi 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 2048 42 | num_codebooks: 5 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.Mimi 52 | sample_rate: !ref 53 | num_codebooks: !ref 54 | mode: !ref 55 | 56 | # Performance metrics 57 | utmos_computer: !name:metrics.utmos.UTMOS 58 | sample_rate: !ref 59 | 60 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 61 | sample_rate: !ref 62 | 63 | stoi_computer: 
!name:metrics.stoi.STOI 64 | sample_rate: !ref 65 | 66 | pesq_computer: !name:metrics.pesq.PESQ 67 | sample_rate: !ref 68 | 69 | meld_computer: !name:metrics.mel_distance.MelDistance 70 | sample_rate: !ref 71 | 72 | stftd_computer: !name:metrics.stft_distance.STFTDistance 73 | sample_rate: !ref 74 | 75 | dwer_computer: !name:metrics.dwer.DWER 76 | model_hub: !ref 77 | sample_rate: !ref 78 | save_path: !ref 79 | 80 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 81 | model_hub: !ref 82 | save_path: !ref 83 | sample_rate: !ref 84 | 85 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 86 | model_hub: !ref 87 | sample_rate: !ref 88 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 89 | 90 | # Counters, checkpointers, loggers, etc. 91 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 92 | save_file: !ref /train_log.txt 93 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/encodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: encodec 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | 
test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 1024 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.Encodec 50 | sample_rate: !ref 51 | orig_sample_rate: 24000 52 | num_codebooks: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 
90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/stablecodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: stablecodec 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 15625 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.StableCodec 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | vocab_size: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | 
sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/wavlm_kmeans.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavlm_kmeans 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | 
test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 512 40 | layer_ids: [6] 41 | num_codebooks: !apply:len [!ref ] 42 | mode: reconstruct 43 | 44 | # Performance metrics parameters 45 | dwer_hub: small 46 | wavlm_sim_hub: microsoft/wavlm-base-sv 47 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 48 | 49 | # Codec 50 | codec: !new:audiocodecs.WavLMKmeans 51 | sample_rate: !ref 52 | mode: !ref 53 | layer_ids: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: !name:metrics.stoi.STOI 63 | sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 
90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/stablecodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: stablecodec 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 15625 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.StableCodec 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | vocab_size: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | 
sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/wavlm_kmeans.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavlm_kmeans 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 
30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 512 39 | layer_ids: [6] 40 | num_codebooks: !apply:len [!ref ] 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.WavLMKmeans 50 | sample_rate: !ref 51 | mode: !ref 52 | layer_ids: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/stablecodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: stablecodec 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 15625 39 | num_codebooks: 2 40 | mode: reconstruct 41 | 42 | # Performance metrics parameters 43 | dwer_hub: small 44 | wavlm_sim_hub: microsoft/wavlm-base-sv 45 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 46 | 47 | # Codec 48 | codec: !new:audiocodecs.StableCodec 49 | sample_rate: !ref 50 | num_codebooks: !ref 51 | vocab_size: !ref 52 | mode: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 
| sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/wavlm_kmeans.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavlm_kmeans 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # 
Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 512 39 | layer_ids: [6] 40 | num_codebooks: !apply:len [!ref ] 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.WavLMKmeans 50 | sample_rate: !ref 51 | mode: !ref 52 | layer_ids: !ref 53 | 54 | # Performance metrics 55 | utmos_computer: !name:metrics.utmos.UTMOS 56 | sample_rate: !ref 57 | 58 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 59 | sample_rate: !ref 60 | 61 | stoi_computer: !name:metrics.stoi.STOI 62 | sample_rate: !ref 63 | 64 | pesq_computer: !name:metrics.pesq.PESQ 65 | sample_rate: !ref 66 | 67 | meld_computer: !name:metrics.mel_distance.MelDistance 68 | sample_rate: !ref 69 | 70 | stftd_computer: !name:metrics.stft_distance.STFTDistance 71 | sample_rate: !ref 72 | 73 | dwer_computer: !name:metrics.dwer.DWER 74 | model_hub: !ref 75 | sample_rate: !ref 76 | save_path: !ref 77 | 78 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 79 | model_hub: !ref 80 | save_path: !ref 81 | sample_rate: !ref 82 | 83 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 84 | model_hub: !ref 85 | sample_rate: !ref 86 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 87 | 88 | # Counters, checkpointers, loggers, etc. 
89 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 90 | save_file: !ref /train_log.txt 91 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/stablecodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: stablecodec 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 15625 40 | num_codebooks: 2 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.StableCodec 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | vocab_size: !ref 53 | mode: !ref 54 | 55 | # Performance metrics 56 | utmos_computer: !name:metrics.utmos.UTMOS 57 | sample_rate: !ref 58 | 59 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 60 | sample_rate: !ref 61 | 62 | stoi_computer: 
!name:metrics.stoi.STOI 63 | sample_rate: !ref 64 | 65 | pesq_computer: !name:metrics.pesq.PESQ 66 | sample_rate: !ref 67 | 68 | meld_computer: !name:metrics.mel_distance.MelDistance 69 | sample_rate: !ref 70 | 71 | stftd_computer: !name:metrics.stft_distance.STFTDistance 72 | sample_rate: !ref 73 | 74 | dwer_computer: !name:metrics.dwer.DWER 75 | model_hub: !ref 76 | sample_rate: !ref 77 | save_path: !ref 78 | 79 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 80 | model_hub: !ref 81 | save_path: !ref 82 | sample_rate: !ref 83 | 84 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 85 | model_hub: !ref 86 | sample_rate: !ref 87 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 88 | 89 | # Counters, checkpointers, loggers, etc. 90 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 91 | save_file: !ref /train_log.txt 92 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/dac.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: dac 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # 
Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 1024 42 | num_codebooks: 2 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.DAC 52 | sample_rate: !ref 53 | orig_sample_rate: 16000 54 | num_codebooks: !ref 55 | mode: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 | 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 
92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/speechtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: speechtokenizer 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 1024 42 | num_codebooks: 2 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.SpeechTokenizer 52 | sample_rate: !ref 53 | num_codebooks: !ref 54 | mode: !ref 55 | 56 | # Performance metrics 57 | utmos_computer: !name:metrics.utmos.UTMOS 58 | sample_rate: !ref 59 | 60 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 61 | sample_rate: !ref 62 | 63 | 
stoi_computer: !name:metrics.stoi.STOI 64 | sample_rate: !ref 65 | 66 | pesq_computer: !name:metrics.pesq.PESQ 67 | sample_rate: !ref 68 | 69 | meld_computer: !name:metrics.mel_distance.MelDistance 70 | sample_rate: !ref 71 | 72 | stftd_computer: !name:metrics.stft_distance.STFTDistance 73 | sample_rate: !ref 74 | 75 | dwer_computer: !name:metrics.dwer.DWER 76 | model_hub: !ref 77 | sample_rate: !ref 78 | save_path: !ref 79 | 80 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 81 | model_hub: !ref 82 | save_path: !ref 83 | sample_rate: !ref 84 | 85 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 86 | model_hub: !ref 87 | sample_rate: !ref 88 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 89 | 90 | # Counters, checkpointers, loggers, etc. 91 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 92 | save_file: !ref /train_log.txt 93 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/encodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: encodec 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: 
True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 1024 42 | num_codebooks: 2 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.Encodec 52 | sample_rate: !ref 53 | orig_sample_rate: 24000 54 | num_codebooks: !ref 55 | mode: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 | 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 
92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/stablecodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: stablecodec 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 15625 42 | num_codebooks: 2 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.StableCodec 52 | sample_rate: !ref 53 | num_codebooks: !ref 54 | vocab_size: !ref 55 | mode: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 
| 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/wavlm_kmeans.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavlm_kmeans 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 
29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 512 42 | layer_ids: [6] 43 | num_codebooks: !apply:len [!ref ] 44 | mode: reconstruct 45 | 46 | # Performance metrics parameters 47 | dwer_hub: small 48 | wavlm_sim_hub: microsoft/wavlm-base-sv 49 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 50 | 51 | # Codec 52 | codec: !new:audiocodecs.WavLMKmeans 53 | sample_rate: !ref 54 | mode: !ref 55 | layer_ids: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 | 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 
92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/wavlm_kmeans.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavlm_kmeans 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 512 40 | layer_ids: [6] 41 | num_codebooks: !apply:len [!ref ] 42 | mode: reconstruct 43 | 44 | # k-NN 45 | topk: 4 46 | num_splits: 1 47 | 48 | # Performance metrics parameters 49 | dwer_hub: small 50 | wavlm_sim_hub: microsoft/wavlm-base-sv 51 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 52 | 53 | # Codec 54 | codec: !new:audiocodecs.WavLMKmeans 55 | sample_rate: !ref 56 | mode: !ref 57 | layer_ids: !ref 58 | 59 | # Performance metrics 60 | utmos_computer: !name:metrics.utmos.UTMOS 61 | sample_rate: !ref 62 | 63 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 64 | 
sample_rate: !ref 65 | 66 | stoi_computer: !name:metrics.stoi.STOI 67 | sample_rate: !ref 68 | 69 | pesq_computer: !name:metrics.pesq.PESQ 70 | sample_rate: !ref 71 | 72 | meld_computer: !name:metrics.mel_distance.MelDistance 73 | sample_rate: !ref 74 | 75 | stftd_computer: !name:metrics.stft_distance.STFTDistance 76 | sample_rate: !ref 77 | 78 | dwer_computer: !name:metrics.dwer.DWER 79 | model_hub: !ref 80 | sample_rate: !ref 81 | save_path: !ref 82 | 83 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 84 | model_hub: !ref 85 | save_path: !ref 86 | sample_rate: !ref 87 | 88 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 89 | model_hub: !ref 90 | sample_rate: !ref 91 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 92 | 93 | # Counters, checkpointers, loggers, etc. 94 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 95 | save_file: !ref /train_log.txt 96 | -------------------------------------------------------------------------------- /downstream/metrics/stft_distance.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# ==============================================================================

"""STFT distance."""

import torch
import torchaudio
from speechbrain.utils.metric_stats import MetricStats


__all__ = ["STFTDistance"]


# All signals are resampled to this rate before computing the distance, so
# scores are comparable across codecs operating at different sample rates.
SAMPLE_RATE = 16000


class STFTDistance(MetricStats):
    """Log-magnitude STFT distance between hypothesis and reference signals.

    Arguments
    ---------
    sample_rate : int
        Sample rate of the input signals.
    n_fft : int
        FFT size (also the Hann analysis window length).
    hop_length : int
        Hop length between successive STFT frames.
    """

    def __init__(self, sample_rate, n_fft=1024, hop_length=320):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length

        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
        self.clear()

    @torch.no_grad()
    def append(self, ids, hyp_sig, ref_sig, lens=None):
        """Append a batch of distance scores.

        Arguments
        ---------
        ids : list
            Utterance IDs, one per batch element.
        hyp_sig : torch.Tensor
            Hypothesis signals, shape (batch, time).
        ref_sig : torch.Tensor
            Reference signals, shape (batch, time).
        lens : torch.Tensor, optional
            Relative lengths (unused; kept for interface compatibility with
            other `MetricStats` subclasses).
        """
        assert hyp_sig.shape == ref_sig.shape
        assert hyp_sig.ndim == 2

        # Resample to standard sample rate
        hyp_sig = torchaudio.functional.resample(hyp_sig, self.sample_rate, SAMPLE_RATE)
        ref_sig = torchaudio.functional.resample(ref_sig, self.sample_rate, SAMPLE_RATE)

        # Build the analysis window once and reuse it for both STFTs (the
        # previous code allocated it twice per call); match device and dtype.
        window = torch.hann_window(
            self.n_fft, device=hyp_sig.device, dtype=hyp_sig.dtype
        )

        # Compute STFT -> magnitude -> dB
        hyp_stft = torch.stft(
            hyp_sig,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            window=window,
            return_complex=True,
        ).abs()

        ref_stft = torch.stft(
            ref_sig,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            window=window,
            return_complex=True,
        ).abs()

        hyp_db = self.amplitude_to_db(hyp_stft)
        ref_db = self.amplitude_to_db(ref_stft)

        # L2 distance across frequency bins, averaged over frames
        scores = (hyp_db - ref_db).norm(dim=1).mean(dim=1).cpu().tolist()

        self.ids += ids
        self.scores += scores


if __name__ == "__main__":
    sample_rate = 24000
    ids = ["A", "B"]
    hyp_sig = torch.randn(2, 2 * sample_rate)
    ref_sig = torch.randn(2, 2 * sample_rate)

    stft_dist = STFTDistance(sample_rate)
    stft_dist.append(ids, hyp_sig, ref_sig)
print(stft_dist.summarize("average")) 84 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/focalcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: focalcodec 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 8192 40 | num_codebooks: 1 41 | config: lucadellalib/focalcodec_50hz 42 | mode: reconstruct 43 | 44 | # Performance metrics parameters 45 | dwer_hub: small 46 | wavlm_sim_hub: microsoft/wavlm-base-sv 47 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 48 | 49 | # Codec 50 | codec: !new:audiocodecs.FocalCodec 51 | sample_rate: !ref 52 | num_codebooks: !ref 53 | vocab_size: !ref 54 | config: !ref 55 | mode: !ref 56 | 57 | # Performance metrics 58 | utmos_computer: !name:metrics.utmos.UTMOS 59 | sample_rate: !ref 60 | 61 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 62 | sample_rate: !ref 63 | 64 | stoi_computer: !name:metrics.stoi.STOI 65 | sample_rate: !ref 
66 | 67 | pesq_computer: !name:metrics.pesq.PESQ 68 | sample_rate: !ref 69 | 70 | meld_computer: !name:metrics.mel_distance.MelDistance 71 | sample_rate: !ref 72 | 73 | stftd_computer: !name:metrics.stft_distance.STFTDistance 74 | sample_rate: !ref 75 | 76 | dwer_computer: !name:metrics.dwer.DWER 77 | model_hub: !ref 78 | sample_rate: !ref 79 | save_path: !ref 80 | 81 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 82 | model_hub: !ref 83 | save_path: !ref 84 | sample_rate: !ref 85 | 86 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 87 | model_hub: !ref 88 | sample_rate: !ref 89 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 90 | 91 | # Counters, checkpointers, loggers, etc. 92 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 93 | save_file: !ref /train_log.txt 94 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/focalcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: focalcodec 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | 
test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 8192 39 | num_codebooks: 1 40 | config: lucadellalib/focalcodec_50hz 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.FocalCodec 50 | sample_rate: !ref 51 | num_codebooks: !ref 52 | vocab_size: !ref 53 | config: !ref 54 | mode: !ref 55 | 56 | # Performance metrics 57 | utmos_computer: !name:metrics.utmos.UTMOS 58 | sample_rate: !ref 59 | 60 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 61 | sample_rate: !ref 62 | 63 | stoi_computer: !name:metrics.stoi.STOI 64 | sample_rate: !ref 65 | 66 | pesq_computer: !name:metrics.pesq.PESQ 67 | sample_rate: !ref 68 | 69 | meld_computer: !name:metrics.mel_distance.MelDistance 70 | sample_rate: !ref 71 | 72 | stftd_computer: !name:metrics.stft_distance.STFTDistance 73 | sample_rate: !ref 74 | 75 | dwer_computer: !name:metrics.dwer.DWER 76 | model_hub: !ref 77 | sample_rate: !ref 78 | save_path: !ref 79 | 80 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 81 | model_hub: !ref 82 | save_path: !ref 83 | sample_rate: !ref 84 | 85 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 86 | model_hub: !ref 87 | sample_rate: !ref 88 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 89 | 90 | # Counters, checkpointers, loggers, etc. 
91 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 92 | save_file: !ref <output_folder>/train_log.txt 93 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/focalcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: focalcodec 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref <data_folder>/testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results/<task>/<dataset>/<experiment_name>/<seed> 20 | save_folder: !ref <output_folder>/save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 8192 39 | num_codebooks: 1 40 | config: lucadellalib/focalcodec_50hz 41 | mode: reconstruct 42 | 43 | # Performance metrics parameters 44 | dwer_hub: small 45 | wavlm_sim_hub: microsoft/wavlm-base-sv 46 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 47 | 48 | # Codec 49 | codec: !new:audiocodecs.FocalCodec 50 | sample_rate: !ref <sample_rate> 51 | num_codebooks: !ref <num_codebooks> 52 | vocab_size: !ref <vocab_size> 53 | config: !ref <config> 54 | mode: !ref <mode> 55 | 56 | # Performance metrics 57 | utmos_computer: !name:metrics.utmos.UTMOS 58 | sample_rate: !ref <sample_rate> 59 | 60 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 61 | sample_rate:
!ref 62 | 63 | stoi_computer: !name:metrics.stoi.STOI 64 | sample_rate: !ref 65 | 66 | pesq_computer: !name:metrics.pesq.PESQ 67 | sample_rate: !ref 68 | 69 | meld_computer: !name:metrics.mel_distance.MelDistance 70 | sample_rate: !ref 71 | 72 | stftd_computer: !name:metrics.stft_distance.STFTDistance 73 | sample_rate: !ref 74 | 75 | dwer_computer: !name:metrics.dwer.DWER 76 | model_hub: !ref 77 | sample_rate: !ref 78 | save_path: !ref 79 | 80 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 81 | model_hub: !ref 82 | save_path: !ref 83 | sample_rate: !ref 84 | 85 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 86 | model_hub: !ref 87 | sample_rate: !ref 88 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 89 | 90 | # Counters, checkpointers, loggers, etc. 91 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 92 | save_file: !ref /train_log.txt 93 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/focalcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: focalcodec 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: 
False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 8192 42 | num_codebooks: 1 43 | config: lucadellalib/focalcodec_50hz 44 | mode: reconstruct 45 | 46 | # Performance metrics parameters 47 | dwer_hub: small 48 | wavlm_sim_hub: microsoft/wavlm-base-sv 49 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 50 | 51 | # Codec 52 | codec: !new:audiocodecs.FocalCodec 53 | sample_rate: !ref 54 | num_codebooks: !ref 55 | vocab_size: !ref 56 | config: !ref 57 | mode: !ref 58 | 59 | # Performance metrics 60 | utmos_computer: !name:metrics.utmos.UTMOS 61 | sample_rate: !ref 62 | 63 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 64 | sample_rate: !ref 65 | 66 | stoi_computer: !name:metrics.stoi.STOI 67 | sample_rate: !ref 68 | 69 | pesq_computer: !name:metrics.pesq.PESQ 70 | sample_rate: !ref 71 | 72 | meld_computer: !name:metrics.mel_distance.MelDistance 73 | sample_rate: !ref 74 | 75 | stftd_computer: !name:metrics.stft_distance.STFTDistance 76 | sample_rate: !ref 77 | 78 | dwer_computer: !name:metrics.dwer.DWER 79 | model_hub: !ref 80 | sample_rate: !ref 81 | save_path: !ref 82 | 83 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 84 | model_hub: !ref 85 | save_path: !ref 86 | sample_rate: !ref 87 | 88 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 89 | model_hub: !ref 90 | sample_rate: !ref 91 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 92 | 93 | # Counters, checkpointers, loggers, etc. 
94 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 95 | save_file: !ref /train_log.txt 96 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/focalcodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: focalcodec 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 8192 40 | num_codebooks: 1 41 | config: lucadellalib/focalcodec_50hz 42 | mode: reconstruct 43 | 44 | # k-NN 45 | topk: 4 46 | num_splits: 1 47 | 48 | # Performance metrics parameters 49 | dwer_hub: small 50 | wavlm_sim_hub: microsoft/wavlm-base-sv 51 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 52 | 53 | # Codec 54 | codec: !new:audiocodecs.FocalCodec 55 | sample_rate: !ref 56 | num_codebooks: !ref 57 | vocab_size: !ref 58 | config: !ref 59 | mode: !ref 60 | 61 | # Performance metrics 62 | utmos_computer: !name:metrics.utmos.UTMOS 63 | sample_rate: !ref 64 | 65 | 
dnsmos_computer: !name:metrics.dnsmos.DNSMOS 66 | sample_rate: !ref 67 | 68 | stoi_computer: !name:metrics.stoi.STOI 69 | sample_rate: !ref 70 | 71 | pesq_computer: !name:metrics.pesq.PESQ 72 | sample_rate: !ref 73 | 74 | meld_computer: !name:metrics.mel_distance.MelDistance 75 | sample_rate: !ref 76 | 77 | stftd_computer: !name:metrics.stft_distance.STFTDistance 78 | sample_rate: !ref 79 | 80 | dwer_computer: !name:metrics.dwer.DWER 81 | model_hub: !ref 82 | sample_rate: !ref 83 | save_path: !ref 84 | 85 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 86 | model_hub: !ref 87 | save_path: !ref 88 | sample_rate: !ref 89 | 90 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 91 | model_hub: !ref 92 | sample_rate: !ref 93 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 94 | 95 | # Counters, checkpointers, loggers, etc. 96 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 97 | save_file: !ref /train_log.txt 98 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: semanticodec 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | 
save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | num_codebooks: 2 40 | token_rate: 50 41 | vocab_size: 8192 42 | acoustic_vocab_size: 8192 43 | ddim_sample_step: 50 44 | mode: reconstruct 45 | 46 | # Performance metrics parameters 47 | dwer_hub: small 48 | wavlm_sim_hub: microsoft/wavlm-base-sv 49 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 50 | 51 | # Codec 52 | codec: !new:audiocodecs.SemantiCodec 53 | sample_rate: !ref 54 | token_rate: !ref 55 | semantic_vocab_size: !ref 56 | ddim_sample_step: !ref 57 | mode: !ref 58 | 59 | # Performance metrics 60 | utmos_computer: !name:metrics.utmos.UTMOS 61 | sample_rate: !ref 62 | 63 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 64 | sample_rate: !ref 65 | 66 | stoi_computer: !name:metrics.stoi.STOI 67 | sample_rate: !ref 68 | 69 | pesq_computer: !name:metrics.pesq.PESQ 70 | sample_rate: !ref 71 | 72 | meld_computer: !name:metrics.mel_distance.MelDistance 73 | sample_rate: !ref 74 | 75 | stftd_computer: !name:metrics.stft_distance.STFTDistance 76 | sample_rate: !ref 77 | 78 | dwer_computer: !name:metrics.dwer.DWER 79 | model_hub: !ref 80 | sample_rate: !ref 81 | save_path: !ref 82 | 83 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 84 | model_hub: !ref 85 | save_path: !ref 86 | sample_rate: !ref 87 | 88 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 89 | model_hub: !ref 90 | sample_rate: !ref 91 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 92 | 93 | # Counters, checkpointers, loggers, etc. 
94 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 95 | save_file: !ref /train_log.txt 96 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: semanticodec 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | num_codebooks: 2 39 | token_rate: 50 40 | vocab_size: 8192 41 | acoustic_vocab_size: 8192 42 | ddim_sample_step: 50 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.SemantiCodec 52 | sample_rate: !ref 53 | token_rate: !ref 54 | semantic_vocab_size: !ref 55 | ddim_sample_step: !ref 56 | mode: !ref 57 | 58 | # Performance metrics 59 | utmos_computer: !name:metrics.utmos.UTMOS 60 | sample_rate: !ref 61 | 62 | 
dnsmos_computer: !name:metrics.dnsmos.DNSMOS 63 | sample_rate: !ref 64 | 65 | stoi_computer: !name:metrics.stoi.STOI 66 | sample_rate: !ref 67 | 68 | pesq_computer: !name:metrics.pesq.PESQ 69 | sample_rate: !ref 70 | 71 | meld_computer: !name:metrics.mel_distance.MelDistance 72 | sample_rate: !ref 73 | 74 | stftd_computer: !name:metrics.stft_distance.STFTDistance 75 | sample_rate: !ref 76 | 77 | dwer_computer: !name:metrics.dwer.DWER 78 | model_hub: !ref 79 | sample_rate: !ref 80 | save_path: !ref 81 | 82 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 83 | model_hub: !ref 84 | save_path: !ref 85 | sample_rate: !ref 86 | 87 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 88 | model_hub: !ref 89 | sample_rate: !ref 90 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 91 | 92 | # Counters, checkpointers, loggers, etc. 93 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 94 | save_file: !ref /train_log.txt 95 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: semanticodec 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | 
save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | num_codebooks: 2 39 | token_rate: 50 40 | vocab_size: 8192 41 | acoustic_vocab_size: 8192 42 | ddim_sample_step: 50 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.SemantiCodec 52 | sample_rate: !ref 53 | token_rate: !ref 54 | semantic_vocab_size: !ref 55 | ddim_sample_step: !ref 56 | mode: !ref 57 | 58 | # Performance metrics 59 | utmos_computer: !name:metrics.utmos.UTMOS 60 | sample_rate: !ref 61 | 62 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 63 | sample_rate: !ref 64 | 65 | stoi_computer: !name:metrics.stoi.STOI 66 | sample_rate: !ref 67 | 68 | pesq_computer: !name:metrics.pesq.PESQ 69 | sample_rate: !ref 70 | 71 | meld_computer: !name:metrics.mel_distance.MelDistance 72 | sample_rate: !ref 73 | 74 | stftd_computer: !name:metrics.stft_distance.STFTDistance 75 | sample_rate: !ref 76 | 77 | dwer_computer: !name:metrics.dwer.DWER 78 | model_hub: !ref 79 | sample_rate: !ref 80 | save_path: !ref 81 | 82 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 83 | model_hub: !ref 84 | save_path: !ref 85 | sample_rate: !ref 86 | 87 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 88 | model_hub: !ref 89 | sample_rate: !ref 90 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 91 | 92 | # Counters, checkpointers, loggers, etc. 
93 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 94 | save_file: !ref /train_log.txt 95 | -------------------------------------------------------------------------------- /downstream/hparams/vc/VCTK/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: semanticodec 6 | dataset: VCTK 7 | task: vc 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /train.csv 16 | splits: [train] 17 | num_valid_speakers: 0 18 | num_test_speakers: 0 19 | 20 | # Output folders 21 | output_folder: !ref results//// 22 | save_folder: !ref /save 23 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 24 | 25 | # Save options 26 | compute_metrics: True 27 | save_audios: False 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | num_codebooks: 2 40 | token_rate: 50 41 | vocab_size: 8192 42 | acoustic_vocab_size: 8192 43 | ddim_sample_step: 50 44 | mode: reconstruct 45 | 46 | # Performance metrics parameters 47 | dwer_hub: small 48 | wavlm_sim_hub: microsoft/wavlm-base-sv 49 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 50 | 51 | # Codec 52 | codec: !new:audiocodecs.SemantiCodec 53 | sample_rate: !ref 54 | token_rate: !ref 55 | semantic_vocab_size: !ref 56 | ddim_sample_step: !ref 57 | mode: !ref 58 | 59 | # Performance metrics 60 | utmos_computer: !name:metrics.utmos.UTMOS 61 | sample_rate: !ref 62 | 63 | 
dnsmos_computer: !name:metrics.dnsmos.DNSMOS 64 | sample_rate: !ref 65 | 66 | stoi_computer: !name:metrics.stoi.STOI 67 | sample_rate: !ref 68 | 69 | pesq_computer: !name:metrics.pesq.PESQ 70 | sample_rate: !ref 71 | 72 | meld_computer: !name:metrics.mel_distance.MelDistance 73 | sample_rate: !ref 74 | 75 | stftd_computer: !name:metrics.stft_distance.STFTDistance 76 | sample_rate: !ref 77 | 78 | dwer_computer: !name:metrics.dwer.DWER 79 | model_hub: !ref 80 | sample_rate: !ref 81 | save_path: !ref 82 | 83 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 84 | model_hub: !ref 85 | save_path: !ref 86 | sample_rate: !ref 87 | 88 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 89 | model_hub: !ref 90 | sample_rate: !ref 91 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 92 | 93 | # Counters, checkpointers, loggers, etc. 94 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 95 | save_file: !ref /train_log.txt 96 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/semanticodec.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: semanticodec 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 
26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | num_codebooks: 2 42 | token_rate: 50 43 | vocab_size: 8192 44 | acoustic_vocab_size: 8192 45 | ddim_sample_step: 50 46 | mode: reconstruct 47 | 48 | # Performance metrics parameters 49 | dwer_hub: small 50 | wavlm_sim_hub: microsoft/wavlm-base-sv 51 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 52 | 53 | # Codec 54 | codec: !new:audiocodecs.SemantiCodec 55 | sample_rate: !ref 56 | token_rate: !ref 57 | semantic_vocab_size: !ref 58 | ddim_sample_step: !ref 59 | mode: !ref 60 | 61 | # Performance metrics 62 | utmos_computer: !name:metrics.utmos.UTMOS 63 | sample_rate: !ref 64 | 65 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 66 | sample_rate: !ref 67 | 68 | stoi_computer: !name:metrics.stoi.STOI 69 | sample_rate: !ref 70 | 71 | pesq_computer: !name:metrics.pesq.PESQ 72 | sample_rate: !ref 73 | 74 | meld_computer: !name:metrics.mel_distance.MelDistance 75 | sample_rate: !ref 76 | 77 | stftd_computer: !name:metrics.stft_distance.STFTDistance 78 | sample_rate: !ref 79 | 80 | dwer_computer: !name:metrics.dwer.DWER 81 | model_hub: !ref 82 | sample_rate: !ref 83 | save_path: !ref 84 | 85 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 86 | model_hub: !ref 87 | save_path: !ref 88 | sample_rate: !ref 89 | 90 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 91 | model_hub: !ref 92 | sample_rate: !ref 93 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 94 | 95 | # Counters, checkpointers, loggers, etc. 
96 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 97 | save_file: !ref /train_log.txt 98 | -------------------------------------------------------------------------------- /downstream/hparams/sr/MLS/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavtokenizer 6 | dataset: MLS 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | languages: null 18 | 19 | # Output folders 20 | output_folder: !ref results//// 21 | save_folder: !ref /save 22 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 23 | 24 | # Save options 25 | compute_metrics: True 26 | save_audios: False 27 | use_profiler: True 28 | 29 | # Preprocessing parameters 30 | test_remove_if_longer: 60.0 # Seconds 31 | test_remove_if_shorter: 0.0 # Seconds 32 | 33 | # Test parameters 34 | test_batch_size: 1 35 | dataloader_workers: 4 36 | 37 | # Codec parameters 38 | sample_rate: 16000 39 | vocab_size: 4096 40 | num_codebooks: 1 41 | source: novateur/WavTokenizer-large-unify-40token 42 | config: wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml 43 | checkpoint: wavtokenizer_large_unify_600_24k.ckpt 44 | mode: reconstruct 45 | 46 | # Performance metrics parameters 47 | dwer_hub: small 48 | wavlm_sim_hub: microsoft/wavlm-base-sv 49 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 50 | 51 | # Codec 52 | codec: !new:audiocodecs.WavTokenizer 53 | sample_rate: !ref 54 | mode: !ref 55 | source: !ref 56 | config: !ref 57 | checkpoint: !ref 58 | 59 | # Performance 
metrics 60 | utmos_computer: !name:metrics.utmos.UTMOS 61 | sample_rate: !ref 62 | 63 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 64 | sample_rate: !ref 65 | 66 | stoi_computer: !name:metrics.stoi.STOI 67 | sample_rate: !ref 68 | 69 | pesq_computer: !name:metrics.pesq.PESQ 70 | sample_rate: !ref 71 | 72 | meld_computer: !name:metrics.mel_distance.MelDistance 73 | sample_rate: !ref 74 | 75 | stftd_computer: !name:metrics.stft_distance.STFTDistance 76 | sample_rate: !ref 77 | 78 | dwer_computer: !name:metrics.dwer.DWER 79 | model_hub: !ref 80 | sample_rate: !ref 81 | save_path: !ref 82 | 83 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 84 | model_hub: !ref 85 | save_path: !ref 86 | sample_rate: !ref 87 | 88 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 89 | model_hub: !ref 90 | sample_rate: !ref 91 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 92 | 93 | # Counters, checkpointers, loggers, etc. 94 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 95 | save_file: !ref /train_log.txt 96 | -------------------------------------------------------------------------------- /downstream/hparams/sr/VoiceBank/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavtokenizer 6 | dataset: VoiceBank 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /testset_wav.csv 16 | splits: [testset_wav] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: 
!name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 4096 39 | num_codebooks: 1 40 | source: novateur/WavTokenizer-large-unify-40token 41 | config: wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml 42 | checkpoint: wavtokenizer_large_unify_600_24k.ckpt 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.WavTokenizer 52 | sample_rate: !ref 53 | mode: !ref 54 | source: !ref 55 | config: !ref 56 | checkpoint: !ref 57 | 58 | # Performance metrics 59 | utmos_computer: !name:metrics.utmos.UTMOS 60 | sample_rate: !ref 61 | 62 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 63 | sample_rate: !ref 64 | 65 | stoi_computer: !name:metrics.stoi.STOI 66 | sample_rate: !ref 67 | 68 | pesq_computer: !name:metrics.pesq.PESQ 69 | sample_rate: !ref 70 | 71 | meld_computer: !name:metrics.mel_distance.MelDistance 72 | sample_rate: !ref 73 | 74 | stftd_computer: !name:metrics.stft_distance.STFTDistance 75 | sample_rate: !ref 76 | 77 | dwer_computer: !name:metrics.dwer.DWER 78 | model_hub: !ref 79 | sample_rate: !ref 80 | save_path: !ref 81 | 82 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 83 | model_hub: !ref 84 | save_path: !ref 85 | sample_rate: !ref 86 | 87 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 88 | model_hub: !ref 89 | sample_rate: !ref 90 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 91 | 92 | # Counters, 
checkpointers, loggers, etc. 93 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 94 | save_file: !ref /train_log.txt 95 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriSpeech/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavtokenizer 6 | dataset: LibriSpeech 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test-clean.csv 16 | splits: [test-clean] 17 | 18 | # Output folders 19 | output_folder: !ref results//// 20 | save_folder: !ref /save 21 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 22 | 23 | # Save options 24 | compute_metrics: True 25 | save_audios: False 26 | use_profiler: True 27 | 28 | # Preprocessing parameters 29 | test_remove_if_longer: 60.0 # Seconds 30 | test_remove_if_shorter: 0.0 # Seconds 31 | 32 | # Test parameters 33 | test_batch_size: 1 34 | dataloader_workers: 4 35 | 36 | # Codec parameters 37 | sample_rate: 16000 38 | vocab_size: 4096 39 | num_codebooks: 1 40 | source: novateur/WavTokenizer-large-unify-40token 41 | config: wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml 42 | checkpoint: wavtokenizer_large_unify_600_24k.ckpt 43 | mode: reconstruct 44 | 45 | # Performance metrics parameters 46 | dwer_hub: small 47 | wavlm_sim_hub: microsoft/wavlm-base-sv 48 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 49 | 50 | # Codec 51 | codec: !new:audiocodecs.WavTokenizer 52 | sample_rate: !ref 53 | mode: !ref 54 | source: !ref 55 | config: !ref 56 | 
checkpoint: !ref 57 | 58 | # Performance metrics 59 | utmos_computer: !name:metrics.utmos.UTMOS 60 | sample_rate: !ref 61 | 62 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 63 | sample_rate: !ref 64 | 65 | stoi_computer: !name:metrics.stoi.STOI 66 | sample_rate: !ref 67 | 68 | pesq_computer: !name:metrics.pesq.PESQ 69 | sample_rate: !ref 70 | 71 | meld_computer: !name:metrics.mel_distance.MelDistance 72 | sample_rate: !ref 73 | 74 | stftd_computer: !name:metrics.stft_distance.STFTDistance 75 | sample_rate: !ref 76 | 77 | dwer_computer: !name:metrics.dwer.DWER 78 | model_hub: !ref 79 | sample_rate: !ref 80 | save_path: !ref 81 | 82 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 83 | model_hub: !ref 84 | save_path: !ref 85 | sample_rate: !ref 86 | 87 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 88 | model_hub: !ref 89 | sample_rate: !ref 90 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 91 | 92 | # Counters, checkpointers, loggers, etc. 
93 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 94 | save_file: !ref /train_log.txt 95 | -------------------------------------------------------------------------------- /downstream/hparams/sr/LibriMix/wavtokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ########################################################################################### 2 | # Authors: Luca Della Libera 2025 3 | # ########################################################################################### 4 | 5 | experiment_name: wavtokenizer 6 | dataset: LibriMix 7 | task: sr 8 | 9 | # Seed needs to be set at top of YAML 10 | seed: 0 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | 13 | # Data preparation 14 | data_folder: !PLACEHOLDER 15 | test_csv: !ref /test.csv 16 | splits: [test] 17 | num_speakers: 1 18 | add_noise: True 19 | version: wav16k/min 20 | 21 | # Output folders 22 | output_folder: !ref results//// 23 | save_folder: !ref /save 24 | cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE 25 | 26 | # Save options 27 | compute_metrics: True 28 | save_audios: False 29 | use_profiler: True 30 | 31 | # Preprocessing parameters 32 | test_remove_if_longer: 60.0 # Seconds 33 | test_remove_if_shorter: 0.0 # Seconds 34 | 35 | # Test parameters 36 | test_batch_size: 1 37 | dataloader_workers: 4 38 | 39 | # Codec parameters 40 | sample_rate: 16000 41 | vocab_size: 4096 42 | num_codebooks: 1 43 | source: novateur/WavTokenizer-large-unify-40token 44 | config: wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml 45 | checkpoint: wavtokenizer_large_unify_600_24k.ckpt 46 | mode: reconstruct 47 | 48 | # Performance metrics parameters 49 | dwer_hub: small 50 | wavlm_sim_hub: microsoft/wavlm-base-sv 51 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 52 | 53 | # Codec 54 | codec: !new:audiocodecs.WavTokenizer 55 | sample_rate: !ref 56 | mode: !ref 57 | source: !ref 58 | config: 
!ref 59 | checkpoint: !ref 60 | 61 | # Performance metrics 62 | utmos_computer: !name:metrics.utmos.UTMOS 63 | sample_rate: !ref 64 | 65 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 66 | sample_rate: !ref 67 | 68 | stoi_computer: !name:metrics.stoi.STOI 69 | sample_rate: !ref 70 | 71 | pesq_computer: !name:metrics.pesq.PESQ 72 | sample_rate: !ref 73 | 74 | meld_computer: !name:metrics.mel_distance.MelDistance 75 | sample_rate: !ref 76 | 77 | stftd_computer: !name:metrics.stft_distance.STFTDistance 78 | sample_rate: !ref 79 | 80 | dwer_computer: !name:metrics.dwer.DWER 81 | model_hub: !ref 82 | sample_rate: !ref 83 | save_path: !ref 84 | 85 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 86 | model_hub: !ref 87 | save_path: !ref 88 | sample_rate: !ref 89 | 90 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 91 | model_hub: !ref 92 | sample_rate: !ref 93 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 94 | 95 | # Counters, checkpointers, loggers, etc. 96 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 97 | save_file: !ref /train_log.txt 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Setup script."""

import os

from setuptools import find_packages, setup


_ROOT_DIR = os.path.dirname(os.path.realpath(__file__))

# Read the package version without importing the package (avoids importing
# heavy dependencies at build time)
_VERSION = {}
with open(
    os.path.join(_ROOT_DIR, "audiocodecs", "version.py"), encoding="utf-8"
) as f:
    exec(f.read(), _VERSION)

with open(os.path.join(_ROOT_DIR, "README.md"), encoding="utf-8") as f:
    _README = f.read()

_REQUIREMENTS_SETUP = ["setuptools", "wheel"]

_REQUIREMENTS_BASE = ["huggingface_hub", "torch", "torchaudio"]

# Keep only real requirement specifiers: skip blank lines, `#` comments, and
# pip option lines (e.g. `-e ...`, `-r ...`), none of which are valid entries
# for `extras_require` (this repo's requirements files do contain such lines)
with open(os.path.join(_ROOT_DIR, "requirements.txt"), encoding="utf-8") as f:
    _REQUIREMENTS_ALL = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith(("#", "-"))
    ]


setup(
    name="audiocodecs",
    version=_VERSION["VERSION"],
    description="A collection of audio codecs with a standardized API",
    long_description=_README,
    long_description_content_type="text/markdown",
    author="Luca Della Libera",
    author_email="luca.dellalib@gmail.com",
    url="https://github.com/lucadellalib/audiocodecs",
    license="Apache License 2.0",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Environment :: Console",
        "Environment :: GPU :: NVIDIA CUDA",
        "Intended Audience :: Developers",
        "Intended Audience :: Information Technology",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Sound/Audio :: Speech",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    keywords=[
        "Neural Audio Codecs",
        "PyTorch",
        "Speech Tokenization",
        "Speech Processing",
        "Speech Compression",
    ],
    packages=find_packages(include=["audiocodecs"]),
    include_package_data=True,
    install_requires=_REQUIREMENTS_BASE,
    extras_require={"all": _REQUIREMENTS_ALL},
    setup_requires=_REQUIREMENTS_SETUP,
)
wavtokenizer_large_unify_600_24k.ckpt 44 | mode: reconstruct 45 | 46 | # k-NN 47 | topk: 4 48 | num_splits: 1 49 | 50 | # Performance metrics parameters 51 | dwer_hub: small 52 | wavlm_sim_hub: microsoft/wavlm-base-sv 53 | ecapatdnn_sim_hub: speechbrain/spkrec-ecapa-voxceleb 54 | 55 | # Codec 56 | codec: !new:audiocodecs.WavTokenizer 57 | sample_rate: !ref 58 | mode: !ref 59 | source: !ref 60 | config: !ref 61 | checkpoint: !ref 62 | 63 | # Performance metrics 64 | utmos_computer: !name:metrics.utmos.UTMOS 65 | sample_rate: !ref 66 | 67 | dnsmos_computer: !name:metrics.dnsmos.DNSMOS 68 | sample_rate: !ref 69 | 70 | stoi_computer: !name:metrics.stoi.STOI 71 | sample_rate: !ref 72 | 73 | pesq_computer: !name:metrics.pesq.PESQ 74 | sample_rate: !ref 75 | 76 | meld_computer: !name:metrics.mel_distance.MelDistance 77 | sample_rate: !ref 78 | 79 | stftd_computer: !name:metrics.stft_distance.STFTDistance 80 | sample_rate: !ref 81 | 82 | dwer_computer: !name:metrics.dwer.DWER 83 | model_hub: !ref 84 | sample_rate: !ref 85 | save_path: !ref 86 | 87 | wavlm_sim_computer: !name:metrics.speaker_similarity.SpkSimWavLM 88 | model_hub: !ref 89 | save_path: !ref 90 | sample_rate: !ref 91 | 92 | ecapatdnn_sim_computer: !name:metrics.speaker_similarity.SpkSimECAPATDNN 93 | model_hub: !ref 94 | sample_rate: !ref 95 | save_path: !apply:os.path.join [!ref , models--speechbrain--spkrec-ecapa-voxceleb] 96 | 97 | # Counters, checkpointers, loggers, etc. 98 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 99 | save_file: !ref /train_log.txt 100 | -------------------------------------------------------------------------------- /audiocodecs/codec.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 
# B: batch size
# T: sequence length in the time domain
# N: sequence length in the token domain
# C: vocabulary size (assuming that each codebook has the same number of tokens)
# K: number of codebooks
class Codec(torch.nn.Module, ABC):
    """Common interface for neural audio codecs.

    Concrete codecs implement `embs`, `_sig_to_toks`, `_sig_to_feats` and
    `_toks_to_sig`; this base class adds resampling between the caller's
    sample rate (`sample_rate`) and the codec's native one
    (`orig_sample_rate`), default relative lengths, and mode dispatch.
    """

    _MODES = ["encode", "decode", "reconstruct"]

    def __init__(self, sample_rate, orig_sample_rate, mode="reconstruct"):
        super().__init__()
        if mode not in self._MODES:
            raise ValueError(f"`mode` ({mode}) must be one of {self._MODES}")
        self.sample_rate = sample_rate
        self.orig_sample_rate = orig_sample_rate
        self.mode = mode
        self._dist = None

    def forward(self, input, length=None):
        """Encode, decode, or round-trip `input` depending on `self.mode`."""
        if self.mode == "encode":
            return self.sig_to_toks(input, length)
        if self.mode == "decode":
            return self.toks_to_sig(input, length)
        if self.mode == "reconstruct":
            return self.toks_to_sig(self.sig_to_toks(input, length), length)

    def sig_to_toks(self, sig, length=None):
        # sig: [B, T]
        resampled = torchaudio.functional.resample(
            sig,
            self.sample_rate,
            self.orig_sample_rate,
        )
        if length is None:
            # Default: every signal is full-length (relative length 1.0)
            length = torch.ones(len(resampled), device=resampled.device)
        return self._sig_to_toks(resampled, length)

    def sig_to_feats(self, sig, length=None):
        # sig: [B, T]
        resampled = torchaudio.functional.resample(
            sig,
            self.sample_rate,
            self.orig_sample_rate,
        )
        if length is None:
            length = torch.ones(len(resampled), device=resampled.device)
        return self._sig_to_feats(resampled, length)

    def toks_to_sig(self, toks, length=None):
        # toks: [B, N, K]
        if length is None:
            length = torch.ones(len(toks), device=toks.device)
        decoded = self._toks_to_sig(toks, length)
        # Bring the waveform back to the caller's sample rate
        return torchaudio.functional.resample(
            decoded,
            self.orig_sample_rate,
            self.sample_rate,
        )

    @abstractmethod
    def embs(self):
        """Return the codebook embedding table(s)."""
        raise NotImplementedError

    @abstractmethod
    def _sig_to_toks(self, sig, length):
        # sig: [B, T]
        raise NotImplementedError

    @abstractmethod
    def _sig_to_feats(self, sig, length):
        # sig: [B, T]
        raise NotImplementedError

    @abstractmethod
    def _toks_to_sig(self, toks, length):
        # toks: [B, N, K]
        raise NotImplementedError
class MultiHeadEmbedding(torch.nn.Embedding):
    """Embedding shared across K codebooks, stored as one flat table.

    Token `t` of codebook `k` is looked up at row `offsets[k] + t`.
    `vocab_size` may be a single int (all codebooks equal) or a list/tuple of
    per-codebook sizes. When `padding_idx=True`, one extra row is appended
    and any input equal to the codebook's vocabulary size maps to it.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        num_codebooks,
        padding_idx=False,
        **kwargs,
    ):
        if isinstance(vocab_size, (list, tuple)):
            # Per-codebook vocabulary sizes
            assert len(vocab_size) == num_codebooks, [len(vocab_size), num_codebooks]
            total = torch.tensor(vocab_size).sum().item()
            offsets = torch.tensor([0] + vocab_size[:-1]).cumsum(dim=-1)
        else:
            total = vocab_size * num_codebooks
            offsets = torch.arange(0, total, vocab_size)
        self.offsets = offsets
        if padding_idx:
            # Reserve the last (extra) row as the shared padding slot
            padding_idx = total
            total += 1
        else:
            padding_idx = None
        super().__init__(total, embedding_dim, padding_idx, **kwargs)
        self.vocab_size = vocab_size
        self.num_codebooks = num_codebooks

    def forward(self, input):
        # Shift each codebook's tokens into its own slice of the flat table
        shifted = input + self.offsets.to(input)
        if self.padding_idx is not None:
            shifted[input == self.vocab_size] = self.padding_idx
        # Call the functional form directly (JIT compilable)
        return torch.nn.functional.embedding(
            shifted,
            self.weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )


class MultiHeadLinear(torch.nn.Linear):
    """Linear layer producing K per-codebook outputs from a single matmul.

    Output shape is [..., num_codebooks, out_features].
    NOTE(review): with a list of unequal `out_features` the final reshape
    assumes equal splits — confirm intended behavior for that case.
    """

    def __init__(
        self,
        in_features,
        out_features,
        num_codebooks,
        **kwargs,
    ):
        if isinstance(out_features, (list, tuple)):
            assert len(out_features) == num_codebooks, [
                len(out_features),
                num_codebooks,
            ]
            total = torch.tensor(out_features).sum().item()
        else:
            total = out_features * num_codebooks
        super().__init__(in_features, total, **kwargs)
        self.num_codebooks = num_codebooks

    def forward(self, input):
        flat = super().forward(input)
        # Split the flat projection into one head per codebook
        return flat.reshape(*input.shape[:-1], self.num_codebooks, -1)
class FocalCodec(Codec):
    """FocalCodec (see https://arxiv.org/abs/2502.04465).

    Wraps the torch.hub FocalCodec model behind the common `Codec` API.
    Single codebook (K=1) with a fixed 8192-entry vocabulary, 16 kHz native.
    """

    CONFIGS = [
        "lucadellalib/focalcodec_50hz",
        "lucadellalib/focalcodec_25hz",
        "lucadellalib/focalcodec_12_5hz",
    ]

    def __init__(
        self,
        sample_rate,
        num_codebooks=1,
        vocab_size=8192,
        mode="reconstruct",
        config="lucadellalib/focalcodec_50hz",
    ):
        # Availability check only; the hub model imports safetensors itself
        try:
            import safetensors

        except ImportError:
            raise ImportError("`pip install safetensors` to use this module")

        super().__init__(sample_rate, 16000, mode)
        # Only the default configuration is supported
        assert num_codebooks == 1, num_codebooks
        assert vocab_size == 8192, vocab_size
        self.num_codebooks = num_codebooks
        self.vocab_size = vocab_size

        self.model = torch.hub.load(
            "lucadellalib/focalcodec", "focalcodec", config=config
        )

        # Drop the unused half of the model to save memory
        if mode == "encode":
            self.model.decompressor = None
            self.model.decoder = None
        elif mode == "decode":
            self.model.encoder = None
            self.model.compressor = None

    # override
    @torch.no_grad()
    def embs(self):
        return self.model.codebook

    # override
    def _sig_to_toks(self, sig, length):
        # sig: [B, T] -> toks: [B, N, 1]
        return self.model.sig_to_toks(sig, length)[..., None]

    # override
    def _sig_to_feats(self, sig, length):
        # sig: [B, T] -> feats: [B, N, H]
        return self.model.sig_to_lats(sig, length)

    # override
    def _toks_to_sig(self, toks, length):
        # toks: [B, N, K=1] -> sig: [B, T]
        return self.model.toks_to_sig(toks[..., 0])
class WavLMKmeans(Codec):
    """WavLM + K-means codec (see https://arxiv.org/abs/2312.09747).

    Discretizes WavLM-large features with K-means (512 clusters per selected
    layer) and reconstructs audio with a vocoder. Native rate is 16 kHz.
    """

    LAYER_IDS = [(6,), (1, 3, 6)]

    def __init__(self, sample_rate, mode="reconstruct", layer_ids=(6,)):
        # Availability check: the hub model requires this speechbrain fork
        try:
            import speechbrain
        except ImportError:
            raise ImportError(
                "`pip install git+https://github.com/lucadellalib/speechbrain@50ffdc772c0d977390025ee7787735db9b92488c#egg=speechbrain` to use this module"
            )

        super().__init__(sample_rate, 16000, mode)
        self.layer_ids = layer_ids
        self.vocab_size = 512

        self.model = torch.hub.load(
            repo_or_dir="lucadellalib/discrete-wavlm-codec",
            model="discrete_wavlm_large",
            layer_ids=layer_ids,
        )
        # Drop the unused half of the model to save memory
        if mode == "encode":
            self.model.dequantizer = None
            self.model.vocoder = None
        elif mode == "decode":
            self.model.encoder = None

    # override
    @torch.no_grad()
    def embs(self):
        # Build a [C, K] grid of all token IDs and map it through the quantizer
        device = next(iter(self.model.state_dict().values())).device
        tok_grid = torch.arange(self.vocab_size, device=device)[:, None]
        tok_grid = tok_grid.expand(-1, len(self.layer_ids)).clone()  # [C, K]
        qfeats = self.model.toks_to_qfeats(tok_grid)  # [C, H, K]
        return qfeats.movedim(-1, 0)  # [K, C, H]

    # override
    def _sig_to_toks(self, sig, length):
        # sig: [B, T] -> toks: [B, N, K]
        return self.model.feats_to_toks(self.model.sig_to_feats(sig))

    # override
    def _sig_to_feats(self, sig, length):
        # sig: [B, T]; average continuous features over the layer dimension
        return self.model.sig_to_feats(sig).mean(dim=-1)

    # override
    def _toks_to_sig(self, toks, length):
        # toks: [B, N, K] -> sig: [B, T]
        quantized = self.model.toks_to_qfeats(toks)
        dequantized = self.model.qfeats_to_feats(quantized)
        return self.model.feats_to_sig(dequantized)[:, 0]
class DWER(MetricStats):
    """Differential WER/CER between transcripts of two signals.

    Transcribes both the hypothesis and reference waveforms with a Whisper
    model (faster-whisper backend) and accumulates the WER/CER of the
    hypothesis transcripts against the reference transcripts.

    Arguments
    ---------
    model_hub : str
        Whisper model size/name (e.g. "small"); also used to fetch the
        matching `openai/whisper-<model_hub>` tokenizer for normalization.
    sample_rate : int
        Sample rate of the signals passed to `append`.
    save_path : str
        Download/cache directory for the model and tokenizer.
    model : WhisperModel, optional
        Pre-built model to reuse; when None a new one is constructed.
    """

    def __init__(
        self,
        model_hub,
        sample_rate,
        save_path=HUGGINGFACE_HUB_CACHE,
        model=None,
        **kwargs,
    ):
        self.sample_rate = sample_rate
        self.model = model
        if model is None:
            self.model = WhisperModel(model_hub, download_root=save_path, **kwargs)
        self.tokenizer = WhisperTokenizer.from_pretrained(
            f"openai/whisper-{model_hub}", cache_dir=save_path
        )
        self.wer_computer = ErrorRateStats()
        self.cer_computer = ErrorRateStats(split_tokens=True)

    def clear(self):
        """Reset both accumulated error-rate statistics."""
        self.wer_computer.clear()
        self.cer_computer.clear()

    @torch.no_grad()
    def append(self, ids, hyp_sig, ref_sig, lens=None, locales=None):
        """Transcribe `hyp_sig` and `ref_sig` and accumulate WER/CER.

        `hyp_sig`/`ref_sig` are [B, T] waveform batches; `locales` is an
        optional per-utterance language-code list (defaults to English).
        `lens` is accepted for API symmetry but unused here.
        """
        assert hyp_sig.shape == ref_sig.shape
        assert hyp_sig.ndim == 2

        if locales is None:
            locales = ["en"] * len(ids)
        # Duplicate locales to match the hyp+ref concatenation below
        locales = locales * 2

        # Concatenate
        sig = torch.cat([hyp_sig, ref_sig])

        # Move to device
        # NOTE(review): faster-whisper selects its device at construction;
        # assigning `.device` here may not actually move the model — confirm
        self.model.device = sig.device

        # Resample
        # Whisper expects 16 kHz mono float arrays on CPU
        sig = (
            torchaudio.functional.resample(sig, self.sample_rate, SAMPLE_RATE)
            .cpu()
            .numpy()
        )

        texts = []
        for x, locale in zip(sig, locales):
            # Forward
            # `transcribe` returns a lazy generator of segments; consume it
            # fully to obtain the utterance text
            segs, _ = self.model.transcribe(
                x,
                beam_size=1,
                language=locale,
                without_timestamps=True,
                # temperature=0.0,
            )
            text = ""
            for seg in segs:
                text += seg.text
            texts.append(text)

        # Normalize (lowercasing, punctuation, etc.) before tokenizing to words
        texts = [self.tokenizer.normalize(x) for x in texts]
        texts = [x.split(" ") for x in texts]
        # First half of the batch is hypotheses, second half references
        hyp_text = texts[: hyp_sig.shape[0]]
        ref_text = texts[hyp_sig.shape[0] :]

        # Compute WER
        self.wer_computer.append(ids, hyp_text, ref_text)
        self.cer_computer.append(ids, hyp_text, ref_text)

    def summarize(self, field=None):
        """Return the WER summary dict (with "CER"/"error_rate_char" added),
        or a single field of it when `field` is given."""
        wer_summary = self.wer_computer.summarize()
        cer_summary = self.cer_computer.summarize()
        wer_summary["CER"] = wer_summary["error_rate_char"] = cer_summary["error_rate"]
        if field is None:
            return wer_summary
        return wer_summary[field]

    def write_stats(self, filestream, verbose=False):
        """Write detailed WER alignments to `filestream` (WER only)."""
        self.wer_computer.write_stats(filestream)
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | 22 | ############################################################################################################################################################# 23 | 24 | 2. Code in downstream/test_vc.py adapted from: 25 | https://github.com/bshall/knn-vc/blob/848302a262f7299c738af49d74209790ed442a9f/matcher.py#L21 26 | 27 | MIT License 28 | 29 | Copyright (c) 2023 MediaLab, Department of Electrical & Electronic Engineering, Stellenbosch University 30 | 31 | Permission is hereby granted, free of charge, to any person obtaining a copy 32 | of this software and associated documentation files (the "Software"), to deal 33 | in the Software without restriction, including without limitation the rights 34 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 35 | copies of the Software, and to permit persons to whom the Software is 36 | furnished to do so, subject to the following conditions: 37 | 38 | The above copyright notice and this permission notice shall be included in all 39 | copies or substantial portions of the Software. 40 | 41 |
42 | 43 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 44 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 45 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 46 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 47 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 48 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 49 | SOFTWARE. 50 | 51 | ############################################################################################################################################################# 52 | 53 | 3. Code in downstream/metrics/dnsmos.py adapted from: 54 | https://github.com/microsoft/DNS-Challenge/blob/4dfd2f639f737cdf530c61db91af16f7e0aa23e1/DNSMOS/dnsmos_local.py 55 | 56 | MIT License 57 | 58 | Copyright (c) Microsoft Corporation. 59 | 60 | Permission is hereby granted, free of charge, to any person obtaining a copy 61 | of this software and associated documentation files (the "Software"), to deal 62 | in the Software without restriction, including without limitation the rights 63 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 64 | copies of the Software, and to permit persons to whom the Software is 65 | furnished to do so, subject to the following conditions: 66 | 67 | The above copyright notice and this permission notice shall be included in all 68 | copies or substantial portions of the Software. 69 | 70 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 71 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 72 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 73 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 74 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 75 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 76 | SOFTWARE 77 | 78 | ############################################################################################################################################################# 79 | 80 | 4. Code in downstream/models/llama3.py adapted from: 81 | https://github.com/meta-llama/llama3 82 | 83 | META LLAMA 3 COMMUNITY LICENSE AGREEMENT 84 | 85 | Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved. 86 | 87 | ############################################################################################################################################################# -------------------------------------------------------------------------------- /downstream/metrics/speaker_similarity.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# ==============================================================================

"""Cosine similarity between speaker embeddings."""

import torch
import torchaudio
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from speechbrain.dataio.dataio import length_to_mask
from speechbrain.inference.speaker import SpeakerRecognition
from speechbrain.utils.metric_stats import MetricStats
from transformers import AutoModelForAudioXVector


__all__ = ["SpkSimECAPATDNN", "SpkSimWavLM"]


# Sample rate expected by both speaker encoders
SAMPLE_RATE = 16000


class SpkSimECAPATDNN(MetricStats):
    """Speaker similarity computed with a SpeechBrain ECAPA-TDNN speaker encoder."""

    def __init__(
        self, model_hub, sample_rate, save_path=HUGGINGFACE_HUB_CACHE, model=None
    ):
        self.sample_rate = sample_rate
        self.model = model
        if model is None:
            # Load once on CPU; moved to the input device inside `append`
            self.model = SpeakerRecognition.from_hparams(
                model_hub, savedir=save_path
            ).cpu()
        self.clear()

    @torch.no_grad()
    def append(self, ids, hyp_sig, ref_sig, lens=None):
        assert hyp_sig.shape == ref_sig.shape
        assert hyp_sig.ndim == 2

        # Stack hypotheses and references into one batch so both halves
        # share a single forward pass
        batch = torch.cat([hyp_sig, ref_sig])
        if lens is not None:
            lens = torch.cat([lens, lens])

        # Resample to the encoder's expected rate
        batch = torchaudio.functional.resample(batch, self.sample_rate, SAMPLE_RATE)

        self.model.device = hyp_sig.device
        self.model.to(hyp_sig.device)
        self.model.eval()

        # Embed, split back into the two halves, and score pairwise
        embs = self.model.encode_batch(batch, lens, normalize=False)
        hyp_embs, ref_embs = embs.split([len(hyp_sig), len(ref_sig)])
        scores = self.model.similarity(hyp_embs, ref_embs)[:, 0]

        self.ids += ids
        self.scores += scores.cpu().tolist()


class SpkSimWavLM(MetricStats):
    """Speaker similarity computed with a WavLM x-vector model from transformers."""

    def __init__(
        self, model_hub, sample_rate, save_path=HUGGINGFACE_HUB_CACHE, model=None
    ):
        self.sample_rate = sample_rate
        self.model = model
        if model is None:
            self.model = AutoModelForAudioXVector.from_pretrained(
                model_hub, cache_dir=save_path
            )
        self.clear()

    @torch.no_grad()
    def append(self, ids, hyp_sig, ref_sig, lens=None):
        assert hyp_sig.shape == ref_sig.shape
        assert hyp_sig.ndim == 2

        # Stack hypotheses and references into one batch
        batch = torch.cat([hyp_sig, ref_sig])
        if lens is not None:
            lens = torch.cat([lens, lens])

        # Resample, then right-pad by replication to 4880 samples
        # (presumably the minimum input length for the conv front end — TODO confirm)
        batch = torchaudio.functional.resample(batch, self.sample_rate, SAMPLE_RATE)
        if batch.shape[-1] < 4880:
            batch = torch.nn.functional.pad(
                batch, [0, 4880 - batch.shape[-1]], mode="replicate"
            )

        self.model.to(hyp_sig.device)
        self.model.eval()

        # Attention mask from relative lengths (0 marks masked positions)
        attention_mask = None
        if lens is not None:
            abs_length = lens * batch.shape[-1]
            attention_mask = length_to_mask(abs_length.int()).long()

        # Forward pass; keep only the pooled speaker embeddings
        embs = self.model(
            input_values=batch,
            attention_mask=attention_mask,
            output_attentions=False,
        ).embeddings

        hyp_embs, ref_embs = embs.split([len(hyp_sig), len(ref_sig)])
        scores = torch.nn.functional.cosine_similarity(hyp_embs, ref_embs, dim=-1)

        self.ids += ids
        self.scores += scores.cpu().tolist()


if __name__ == "__main__":
    sample_rate = 24000
    ids = ["A", "B"]
    hyp_sig = torch.randn(2, 2 * sample_rate)
    ref_sig = torch.randn(2, 2 * sample_rate)

    spk_sim = SpkSimECAPATDNN("speechbrain/spkrec-ecapa-voxceleb", sample_rate)
    spk_sim.append(ids, hyp_sig, ref_sig)
    print(spk_sim.summarize("average"))

    spk_sim = SpkSimWavLM("microsoft/wavlm-base-sv", sample_rate)
    spk_sim.append(ids, hyp_sig, ref_sig)
    print(spk_sim.summarize("average"))
# ==============================================================================
# Copyright 2025 Luca Della Libera.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""BigCodec (see https://arxiv.org/abs/2409.05377)."""

import os
import sys

import torch
from huggingface_hub import snapshot_download

from audiocodecs.codec import Codec


__all__ = ["BigCodec"]


class BigCodec(Codec):
    """Wrapper around the BigCodec single-codebook neural audio codec.

    Arguments
    ---------
    sample_rate : int
        Sample rate of the input/output signals (conversion to the model's
        16 kHz rate is handled by the `Codec` base class).
    mode : str
        One of "encode", "decode" or "reconstruct"; the unused half of the
        model is dropped to save memory.
    source : str
        Hugging Face repository ID hosting the checkpoint.
    checkpoint : str
        Checkpoint filename within the repository.
    latent : bool
        If True, `embs` and `sig_to_feats` operate in the quantizer's
        projected latent space; otherwise in the encoder's output space.
    """

    SOURCES = ["Alethia/BigCodec"]
    CHECKPOINTS = ["bigcodec.pt"]

    def __init__(
        self,
        sample_rate,
        mode="reconstruct",
        source="Alethia/BigCodec",
        checkpoint="bigcodec.pt",
        latent=True,
    ):
        try:
            # Workaround to avoid name collisions with installed modules
            root_dir = os.path.dirname(os.path.realpath(__file__))
            sys_path = list(sys.path)
            sys.path = [x for x in sys.path if root_dir not in x]
            try:
                import bigcodec
            finally:
                # BUGFIX: restore the search path even if the import raises,
                # so a failed construction does not corrupt sys.path
                sys.path = sys_path
        except ImportError:
            raise ImportError(
                "`pip install git+https://github.com/lucadellalib/BigCodec.git@main` to use this module"
            )

        super().__init__(sample_rate, 16000, mode)
        self.num_codebooks = 1
        self.vocab_size = 8192
        self.latent = latent

        path = snapshot_download(repo_id=source)
        checkpoint_path = os.path.join(path, checkpoint)
        # BUGFIX: use a distinct name so the loaded state dict does not shadow
        # the `checkpoint` argument.
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # trusted checkpoints (consider weights_only=True on recent torch).
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        self.encoder = bigcodec.CodecEncoder()
        self.encoder.load_state_dict(state_dict["CodecEnc"])
        self.decoder = bigcodec.CodecDecoder()
        self.decoder.load_state_dict(state_dict["generator"])
        self.quantizer = self.decoder.quantizer

        # Drop the unused half of the model
        if mode == "encode":
            self.decoder = None
        elif mode == "decode":
            self.encoder = None

    # override
    @torch.no_grad()
    def embs(self):
        """Return the codebook embeddings, shape [K=1, C, H]."""
        embs = self.quantizer.get_emb()[0]
        embs = embs[None]  # [K=1, C, H]
        if not self.latent:
            # Project out of the quantizer's latent space
            embs = self.quantizer.layers[0].out_proj(embs)
        return embs

    # override
    def _sig_to_toks(self, sig, length):
        """Encode a waveform into discrete tokens."""
        # sig: [B, T]
        feats = self.encoder(sig[:, None])
        _, toks, _ = self.quantizer(feats)
        toks = toks[0, :, :, None]  # [B, N, K=1]
        return toks

    # override
    def _sig_to_feats(self, sig, length):
        """Encode a waveform into continuous features."""
        # sig: [B, T]
        feats = self.encoder(sig[:, None])  # [B, H, N]
        feats = feats.movedim(-1, -2)  # [B, N, H]
        if self.latent:
            # Map into the quantizer's latent space
            feats = self.quantizer.layers[0].in_proj(feats)
        return feats

    # override
    def _toks_to_sig(self, toks, length):
        """Decode discrete tokens back into a waveform."""
        # toks: [B, N, K=1]
        qfeats = self.quantizer.vq2emb(toks)
        sig = self.decoder(qfeats.movedim(-1, -2), vq=False)[:, 0]  # [B, T]
        return sig


# Test
if __name__ == "__main__":
    import torchaudio

    device = "cuda" if torch.cuda.is_available() else "cpu"
    sample_rate = 10000
    batch_size = 2

    for mode in ["encode", "decode", "reconstruct"]:
        codec = BigCodec(sample_rate, mode=mode).eval().to(device)
        input = (
            torch.zeros(batch_size, 10, 1).long()
            if mode == "decode"
            else torch.randn(batch_size, sample_rate)
        ).to(device)
        with torch.no_grad():
            output = codec(input)
        print(output.shape)
        embs = codec.embs()
        print(embs.shape)
        if mode in ["encode", "reconstruct"]:
            output = codec.sig_to_feats(input)
            print(output.shape)

    sig, sample_rate = torchaudio.load("example.wav")
    codec = BigCodec(sample_rate).eval()
    with torch.no_grad():
        rec_sig = codec(sig)
    torchaudio.save("reconstruction.wav", rec_sig, sample_rate)


# ==============================================================================
# Copyright 2025 Luca Della Libera.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""WavTokenizer (see https://arxiv.org/abs/2408.16532)."""

import os
import sys

import torch
from huggingface_hub import snapshot_download

from audiocodecs.codec import Codec


__all__ = ["WavTokenizer"]


class WavTokenizer(Codec):
    """Wrapper around the WavTokenizer single-codebook neural audio codec.

    Arguments
    ---------
    sample_rate : int
        Sample rate of the input/output signals (conversion to the model's
        24 kHz rate is handled by the `Codec` base class).
    mode : str
        One of "encode", "decode" or "reconstruct"; the unused half of the
        model is dropped to save memory.
    source : str
        Hugging Face repository ID hosting the checkpoint.
    config : str
        YAML config filename (hosted in the separate "novateur/WavTokenizer"
        repository).
    checkpoint : str
        Checkpoint filename within `source`.
    """

    SOURCES = [
        "novateur/WavTokenizer-large-unify-40token",
        "novateur/WavTokenizer-large-speech-75token",
    ]
    CONFIGS = [
        "wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
        "wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
    ]
    CHECKPOINTS = [
        "wavtokenizer_large_unify_600_24k.ckpt",
        "wavtokenizer_large_speech_320_v2.ckpt",
    ]

    def __init__(
        self,
        sample_rate,
        mode="reconstruct",
        source="novateur/WavTokenizer-large-unify-40token",
        config="wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
        checkpoint="wavtokenizer_large_unify_600_24k.ckpt",
    ):
        try:
            # Workaround to avoid name collisions with installed modules
            root_dir = os.path.dirname(os.path.realpath(__file__))
            sys_path = list(sys.path)
            sys.path = [x for x in sys.path if root_dir not in x]
            try:
                import wavtokenizer
            finally:
                # BUGFIX: restore the search path even if the import raises,
                # so a failed construction does not corrupt sys.path
                sys.path = sys_path
        except ImportError:
            raise ImportError(
                "`pip install git+https://github.com/lucadellalib/WavTokenizer.git@main` to use this module"
            )

        super().__init__(sample_rate, 24000, mode)
        self.num_codebooks = 1
        self.vocab_size = 4096

        path = snapshot_download(repo_id=source)
        checkpoint_path = os.path.join(path, checkpoint)
        # The YAML configs live in a separate repository from the checkpoints
        path = snapshot_download(repo_id="novateur/WavTokenizer")
        config_path = os.path.join(path, config)
        self.model = wavtokenizer.WavTokenizer.from_pretrained0802(
            config_path, checkpoint_path
        )

        # Drop the unused half of the model
        if mode == "encode":
            self.model.feature_extractor.encodec.decoder = None
            self.model.head = None
        elif mode == "decode":
            self.model.feature_extractor.encodec.encoder = None

    # override
    @torch.no_grad()
    def embs(self):
        """Return the codebook embeddings, shape [K=1, C, H]."""
        embs = self.model.feature_extractor.encodec.quantizer.vq.layers[0].codebook
        embs = embs[None]  # [K, C, H]
        return embs

    # override
    def _sig_to_toks(self, sig, length):
        """Encode a waveform into discrete tokens."""
        # sig: [B, T]
        _, toks = self.model.encode(sig, bandwidth_id=0)
        toks = toks.movedim(0, -1)  # [B, N, K]
        return toks

    # override
    def _sig_to_feats(self, sig, length):
        """Encode a waveform into continuous features."""
        # sig: [B, T]
        feats = self.model.feature_extractor.encodec.encoder(sig[:, None])
        # BUGFIX(comment): the result is [B, N, H] (hidden size), not [B, N, K]
        feats = feats.movedim(-1, -2)  # [B, N, H]
        return feats

    # override
    def _toks_to_sig(self, toks, length):
        """Decode discrete tokens back into a waveform."""
        # toks: [B, N, K]
        feats = self.model.codes_to_features(toks.movedim(-1, 0))
        sig = self.model.decode(
            feats, bandwidth_id=torch.tensor(0, device=toks.device)
        )  # [B, T]
        return sig


# Test
if __name__ == "__main__":
    import torchaudio

    device = "cuda" if torch.cuda.is_available() else "cpu"
    sample_rate = 10000
    batch_size = 2

    for mode in ["encode", "decode", "reconstruct"]:
        codec = WavTokenizer(sample_rate, mode=mode).eval().to(device)
        input = (
            torch.zeros(batch_size, 10, 1).long()
            if mode == "decode"
            else torch.randn(batch_size, sample_rate)
        ).to(device)
        with torch.no_grad():
            output = codec(input)
        print(output.shape)
        embs = codec.embs()
        print(embs.shape)
        if mode in ["encode", "reconstruct"]:
            output = codec.sig_to_feats(input)
            print(output.shape)

    sig, sample_rate = torchaudio.load("example.wav")
    codec = WavTokenizer(sample_rate).eval()
    with torch.no_grad():
        rec_sig = codec(sig)
torchaudio.save("reconstruction.wav", rec_sig, sample_rate) 144 | -------------------------------------------------------------------------------- /audiocodecs/speechtokenizer.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # Copyright 2025 Luca Della Libera. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | """SpeechTokenizer (see https://arxiv.org/abs/2308.16692).""" 18 | 19 | import os 20 | import sys 21 | 22 | import torch 23 | from huggingface_hub import snapshot_download 24 | 25 | from audiocodecs.codec import Codec 26 | 27 | 28 | __all__ = ["SpeechTokenizer"] 29 | 30 | 31 | class SpeechTokenizer(Codec): 32 | def __init__( 33 | self, 34 | sample_rate, 35 | mode="reconstruct", 36 | num_codebooks=8, 37 | ): 38 | try: 39 | # Workaround to avoid name collisions with installed modules 40 | root_dir = os.path.dirname(os.path.realpath(__file__)) 41 | sys_path = [x for x in sys.path] 42 | sys.path = [x for x in sys.path if root_dir not in x] 43 | import speechtokenizer 44 | 45 | sys.path = sys_path 46 | except ImportError: 47 | raise ImportError("`pip install speechtokenizer` to use this module") 48 | 49 | super().__init__(sample_rate, 16000, mode) 50 | self.num_codebooks = num_codebooks 51 | 52 | source = 
"fnlp/SpeechTokenizer" 53 | path = snapshot_download(repo_id=source) 54 | config_path = os.path.join(path, "speechtokenizer_hubert_avg", "config.json") 55 | checkpoint_path = os.path.join( 56 | path, "speechtokenizer_hubert_avg", "SpeechTokenizer.pt" 57 | ) 58 | self.model = speechtokenizer.SpeechTokenizer.load_from_checkpoint( 59 | config_path, checkpoint_path 60 | ) 61 | 62 | if mode == "encode": 63 | self.model.decoder = None 64 | elif mode == "decode": 65 | self.model.encoder = None 66 | self.model.transform = None 67 | 68 | # override 69 | @torch.no_grad() 70 | def embs(self): 71 | # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 72 | vocab_size = 1024 73 | device = next(iter(self.model.state_dict().values())).device 74 | toks = torch.arange(vocab_size, device=device) 75 | toks = ( 76 | toks[None, :, None].expand(self.num_codebooks, -1, -1).clone() 77 | ) # [K, C, 1] 78 | embs = [] 79 | for i, indices in enumerate(toks): 80 | layer = self.model.quantizer.vq.layers[i] 81 | quantized = layer.decode(indices) # [C, H, 1] 82 | embs.append(quantized) 83 | assert (self.model.quantizer.decode(toks) == sum(embs)).all() 84 | embs = torch.stack(embs)[..., 0] # [K, C, H] 85 | return embs 86 | 87 | # override 88 | def _sig_to_toks(self, sig, length): 89 | # sig: [B, T] 90 | toks = self.model.encode(sig[:, None])[: self.num_codebooks] # [K, B, N] 91 | toks = toks.movedim(-3, -1) # [B, N, K] 92 | return toks 93 | 94 | # override 95 | def _sig_to_feats(self, sig, length): 96 | # sig: [B, T] 97 | feats = self.model.encoder(sig[:, None]) 98 | feats = feats.movedim(-1, -2) # [B, N, K] 99 | return feats 100 | 101 | # override 102 | def _toks_to_sig(self, toks, length): 103 | # toks: [B, N, K] 104 | toks = toks.movedim(-1, -3) # [K, B, N] 105 | sig = self.model.decode(toks)[:, 0] # [B, T] 106 | return sig 107 | 108 | 109 | # Test 110 | if __name__ == "__main__": 111 | import torchaudio 
112 | 113 | device = "cuda" if torch.cuda.is_available() else "cpu" 114 | sample_rate = 10000 115 | batch_size = 2 116 | num_codebooks = 8 117 | 118 | for mode in ["encode", "decode", "reconstruct"]: 119 | codec = ( 120 | SpeechTokenizer( 121 | sample_rate, 122 | mode=mode, 123 | num_codebooks=num_codebooks, 124 | ) 125 | .eval() 126 | .to(device) 127 | ) 128 | input = ( 129 | torch.zeros(batch_size, 10, num_codebooks).long() 130 | if mode == "decode" 131 | else torch.randn(batch_size, sample_rate) 132 | ).to(device) 133 | with torch.no_grad(): 134 | output = codec(input) 135 | print(output.shape) 136 | embs = codec.embs() 137 | print(embs.shape) 138 | if mode in ["encode", "reconstruct"]: 139 | output = codec.sig_to_feats(input) 140 | print(output.shape) 141 | 142 | sig, sample_rate = torchaudio.load("example.wav") 143 | codec = SpeechTokenizer(sample_rate, num_codebooks=num_codebooks).eval() 144 | with torch.no_grad(): 145 | rec_sig = codec(sig) 146 | torchaudio.save("reconstruction.wav", rec_sig, sample_rate) 147 | --------------------------------------------------------------------------------