├── .env.example ├── .gitignore ├── LICENCE ├── README.md ├── config ├── callbacks │ ├── debugging.yaml │ ├── default_speech.yaml │ ├── none.yaml │ ├── speaker_default.yaml │ └── speaker_early_stopping.yaml ├── data │ ├── dataloader │ │ ├── speaker.yaml │ │ └── speech.yaml │ ├── module │ │ ├── librispeech.yaml │ │ ├── voxceleb1.yaml │ │ ├── voxceleb1_pairs.yaml │ │ ├── voxceleb1_triplets.yaml │ │ ├── voxceleb2.yaml │ │ ├── voxceleb2_pairs.yaml │ │ ├── voxceleb2_test_everyone.yaml │ │ ├── voxceleb2_test_hard.yaml │ │ └── voxceleb2_triplets.yaml │ ├── pipeline │ │ ├── wav2vec_base_pipeline.yaml │ │ ├── wav2vec_full_seq_pipeline.yaml │ │ ├── wav2vec_pair_pipeline.yaml │ │ ├── wav2vec_short_seq_pipeline.yaml │ │ ├── xvector_all_augment_pipeline.yaml │ │ ├── xvector_dropout_augment_pipeline.yaml │ │ ├── xvector_pipeline.yaml │ │ └── xvector_rirs_augment.yaml │ └── shards │ │ ├── shards_librispeech.yaml │ │ └── shards_voxceleb.yaml ├── evaluator │ ├── cosine_distance.yaml │ ├── cosine_distance_with_train_data.yaml │ ├── lda.yaml │ └── plda.yaml ├── experiment │ ├── speaker_dummy.yaml │ ├── speaker_ecapa_tdnn.yaml │ ├── speaker_wav2vec2_aam.yaml │ ├── speaker_wav2vec2_ce.yaml │ ├── speaker_wav2vec2_ctc.yaml │ ├── speaker_wav2vec2_pairs.yaml │ ├── speaker_wav2vec2_triplet.yaml │ ├── speaker_wav2vec2_triplet_ce.yaml │ ├── speaker_xvector.yaml │ └── speech_wav2vec2_ctc.yaml ├── hydra │ └── launcher │ │ └── slurm.yaml ├── network │ ├── dummy.yaml │ ├── ecapa_tdnn.yaml │ ├── wav2spk.yaml │ ├── wav2vec2_fc.yaml │ ├── wav2vec2_fc_letter.yaml │ ├── wav2vec2_paired.yaml │ ├── wav2vec_fc.yaml │ ├── wav2vec_xvector.yaml │ └── xvector.yaml ├── optim │ ├── algo │ │ ├── adam.yaml │ │ └── sgd.yaml │ ├── loss │ │ ├── aam_softmax.yaml │ │ ├── binary_cross_entropy.yaml │ │ ├── cross_entropy.yaml │ │ ├── ctc.yaml │ │ ├── triplet.yaml │ │ └── triplet_ce.yaml │ └── schedule │ │ ├── constant.yaml │ │ ├── cyclic.yaml │ │ ├── exp_decay.yaml │ │ ├── one_cycle.yaml │ │ ├── reduce_on_plateau.yaml │ │ ├── schedule_wav2spk.yaml │ │ ├── schedule_wav2vec_fan_etal.yaml │ │ └── tri_stage.yaml ├── predict.yaml ├── profiler │ ├── advanced.yaml │ └── simple.yaml ├── search │ ├── lr_and_aam_loss.yaml │ ├── lr_and_pooling.yaml │ └── lr_and_schedule_search.yaml ├── tokenizer │ └── default.yaml ├── train_eval.yaml └── trainer │ ├── debug_trainer.yaml │ └── trainer.yaml ├── convert_voxceleb2.sh ├── paper_results ├── auto_lr_find │ ├── ecapa │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631794798.katara.82853.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── grid_search_results.csv │ ├── plot_auto_lr.py │ ├── plot_eer_and_lr_find.py │ ├── plot_eer_and_lr_find_broken.py │ ├── wav2vec2-sv-aam │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631044502.katara.6664.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-bce │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631113238.katara.16035.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-ce │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ 
└── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631043151.katara.6259.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-ctc │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631793388.katara.71473.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ └── run.log │ └── xvector │ │ ├── .hydra │ │ ├── config.yaml │ │ ├── hydra.yaml │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ └── version_0 │ │ │ └── events.out.tfevents.1631794594.katara.80664.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log └── run_tests_pool.py ├── predict.py ├── preparation_scripts ├── download_and_prepare_rirs.sh ├── download_librispeech.sh ├── download_pretrained_models.sh ├── download_voxceleb_meta.sh ├── hydra_bash_complete.sh ├── set_cuda_dependencies.sh ├── validate_scores.py └── voxceleb2_convert_to_wav.py ├── pyproject.toml ├── requirements ├── requirements_cuda101.txt ├── requirements_cuda111.txt └── requirements_py1.9_cuda111.txt ├── run.py └── src ├── __init__.py ├── callbacks ├── __init__.py ├── input_monitor_callback.py ├── memory_monitor.py └── progress_tracker_callback.py ├── config_util.py ├── data ├── __init__.py ├── collating.py ├── common.py ├── modules │ ├── __init__.py │ ├── speaker │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── speaker_data_module.cpython-38.pyc │ │ │ ├── training_batch_speaker.cpython-38.pyc │ │ │ └── voxceleb.cpython-38.pyc │ │ ├── speaker_data_module.py │ │ ├── training_batch_speaker.py │ │ └── voxceleb.py │ └── speech │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── librispeech.cpython-38.pyc │ │ ├── speech_data_module.cpython-38.pyc │ │ └── training_batch_speech.cpython-38.pyc │ │ ├── librispeech.py │ │ ├── speech_data_module.py │ │ └── training_batch_speech.py ├── preprocess │ ├── __init__.py │ ├── audio_features.py │ ├── augment.py │ ├── base.py │ ├── input_normalisation.py │ └── random_chunks.py └── util.py ├── eval_metrics.py ├── evaluation ├── __init__.py ├── speaker │ ├── __init__.py │ ├── cosine_distance.py │ ├── lda.py │ ├── plda.py │ └── speaker_recognition_evaluator.py └── speech │ ├── __init__.py │ └── wer.py ├── hydra_resolvers.py ├── layers ├── __init__.py ├── embedding_masking.py ├── pooling.py └── temporal_gating.py ├── lightning_modules ├── __init__.py ├── base_lightning_module.py ├── multitask │ ├── __init__.py │ └── mt_speech_speaker_module.py ├── speaker │ ├── __init__.py │ ├── dummy.py │ ├── ecapa_tdnn.py │ ├── paired_speaker_recognition_module.py │ ├── speaker_recognition_module.py │ ├── wav2spk.py │ ├── wav2vec2_ctc.py │ ├── wav2vec2_fc.py │ ├── wav2vec2_paired_input.py │ ├── wav2vec_fc.py │ ├── wav2vec_xvector.py │ └── xvector.py └── speech │ ├── __init__.py │ ├── speech_recognition_module.py │ └── wav2vec2_fc_letter.py ├── main.py ├── models ├── __init__.py ├── wav2vec.py └── wav2vec2.py ├── optim ├── __init__.py ├── loss │ ├── __init__.py │ ├── aam_softmax.py │ ├── binary_cross_entropy.py │ ├── cross_entropy.py │ ├── ctc_loss.py │ ├── triplet_ce_loss.py │ └── triplet_loss.py └── schedule │ ├── __init__.py │ └── tri_stage.py ├── predict.py ├── tokenizer ├── __init__.py ├── base.py └── tokenizer_wav2vec2.py └── util.py /.env.example: 
-------------------------------------------------------------------------------- 1 | # folder where dataset(s) and pretrained models are stored 2 | DATA_FOLDER=$PWD 3 | 4 | # folder where results will be logged to 5 | LOG_FOLDER=$PWD/logs 6 | 7 | # folder which can be used for temporary storage 8 | TEMP_FOLDER=$DATA_FOLDER/tmp 9 | 10 | # folder where huggingface library saves model weights 11 | TRANSFORMERS_CACHE=$DATA_FOLDER/pretrained_models 12 | 13 | # default value for using comet ml 14 | USE_COMET_ML=False 15 | 16 | # API key of comet.ml account for experiment tracking 17 | COMET_API_KEY= 18 | 19 | # Default number of GPUs you want to train with 20 | NUM_GPUS=1 21 | 22 | # hydra launcher to use. Set to SLURM on GPU cluster :) 23 | HYDRA_LAUNCHER=basic 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | *.pyc 4 | 5 | dist/ 6 | build/ 7 | *.egg-info/ 8 | 9 | .tox/ 10 | .coverage 11 | 12 | set_environment.sh 13 | 14 | /results 15 | /models 16 | /data 17 | /poetry.lock 18 | /lightning_logs/ 19 | .env 20 | /data/ 21 | /outputs/ 22 | /playground/ 23 | 24 | poetry.toml 25 | .vscode 26 | /.venv/ 27 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Nik Vaessen on behalf of Radboud University 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
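The .env.example shown above lists every environment variable the entry points expect (data, log and temp folders, the HuggingFace cache, comet.ml settings, GPU count, and the Hydra launcher). Below is a minimal sketch of reading those variables in Python, assuming the python-dotenv package; the repository's own loading code is not shown in this section, so treat this purely as an illustration.

import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed

# copy .env.example to .env, fill in the values, then load it
load_dotenv(".env")

# shell-style references such as $PWD may be stored literally by load_dotenv,
# so expand them explicitly before use
data_folder = os.path.expandvars(os.environ.get("DATA_FOLDER", "$PWD"))
log_folder = os.path.expandvars(os.environ.get("LOG_FOLDER", "$PWD/logs"))
use_comet = os.environ.get("USE_COMET_ML", "False") == "True"

print(f"datasets and pretrained models: {data_folder}")
print(f"logs are written to:            {log_folder}")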
8 | -------------------------------------------------------------------------------- /config/callbacks/debugging.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - input_monitor 3 | - lr_monitor 4 | # - gpu_monitor 5 | 6 | # log debug information for a single batch 7 | input_monitor: 8 | _target_: src.callbacks.input_monitor_callback.InputMonitor 9 | 10 | # keep track of learning rate in logger 11 | lr_monitor: 12 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 13 | 14 | gpu_monitor: 15 | _target_: pytorch_lightning.callbacks.GPUStatsMonitor -------------------------------------------------------------------------------- /config/callbacks/default_speech.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | 6 | # keep track of learning rate in logger 7 | lr_monitor: 8 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 9 | 10 | ram_monitor: 11 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 12 | frequency: 100 13 | 14 | # save model checkpoint of weights with best validation performance 15 | checkpoint: 16 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 17 | monitor: val_wer_clean 18 | save_top_k: 1 19 | mode: min 20 | filename: '{epoch}.{step}.{val_wer_clean:.4f}.best' 21 | save_last: true 22 | every_n_val_epochs: 1 23 | 24 | last_checkpoint_pattern: '{epoch}.{step}.{val_wer_clean:.4f}.last' -------------------------------------------------------------------------------- /config/callbacks/none.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - null -------------------------------------------------------------------------------- /config/callbacks/speaker_default.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | 6 | # keep track of learning rate in logger 7 | lr_monitor: 8 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 9 | 10 | ram_monitor: 11 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 12 | frequency: 100 13 | 14 | # save model checkpoint of weights with best validation performance 15 | checkpoint: 16 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 17 | monitor: val_eer 18 | save_top_k: 1 19 | mode: min 20 | filename: '{epoch}.{step}.{val_eer:.4f}.best' 21 | save_last: true 22 | every_n_val_epochs: 1 23 | 24 | last_checkpoint_pattern: '{epoch}.{step}.{val_eer:.4f}.last' -------------------------------------------------------------------------------- /config/callbacks/speaker_early_stopping.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | - early_stopping 6 | 7 | # keep track of learning rate in logger 8 | lr_monitor: 9 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 10 | 11 | ram_monitor: 12 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 13 | frequency: 100 14 | 15 | # save model checkpoint of weights with best validation performance 16 | checkpoint: 17 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 18 | monitor: val_eer 19 | save_top_k: 0 20 | mode: min 21 | filename: '{epoch}.{step}.{val_eer:.4f}.best' 22 | save_last: false 23 | every_n_val_epochs: 1 24 | 25 | last_checkpoint_pattern: '{epoch}.{step}.{val_eer:.4f}.last' 26 
| 27 | # stop when val_eer doesn't improve or diverges 28 | early_stopping: 29 | _target_: pytorch_lightning.callbacks.early_stopping.EarlyStopping 30 | monitor: val_eer 31 | min_delta: 0.00 32 | patience: 4 33 | mode: min 34 | check_finite: True 35 | divergence_threshold: 0.45 -------------------------------------------------------------------------------- /config/data/dataloader/speaker.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.SpeakerDataLoaderConfig 3 | 4 | # settings for data loader 5 | train_batch_size: 32 6 | val_batch_size: ${data.dataloader.train_batch_size} 7 | test_batch_size: 1 8 | num_workers: 5 9 | pin_memory: true -------------------------------------------------------------------------------- /config/data/dataloader/speech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.SpeechDataLoaderConfig 3 | 4 | # settings for data loader 5 | train_max_num_samples: 3_200_000 6 | val_batch_size: 8 7 | test_batch_size: 1 8 | num_workers: 5 9 | pin_memory: true -------------------------------------------------------------------------------- /config/data/module/librispeech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speech.librispeech.LibriSpeechLightningDataModuleConfig 3 | 4 | # select which subset of the training data to use 5 | use_train_clean_100: true 6 | use_train_clean_360: true 7 | use_train_other_500: true 8 | 9 | # paths to training data 10 | train_clean_100_path: ${data_folder}/librispeech/train-clean-100.tar.gz 11 | train_clean_360_path: ${data_folder}/librispeech/train-clean-360.tar.gz 12 | train_other_500_path: ${data_folder}/librispeech/train-other-500.tar.gz 13 | 14 | # paths to validation data 15 | dev_clean_path: ${data_folder}/librispeech/dev-clean.tar.gz 16 | dev_other_path: ${data_folder}/librispeech/dev-other.tar.gz 17 | 18 | # paths to test data 19 | test_clean_path: ${data_folder}/librispeech/test-clean.tar.gz 20 | test_other_path: ${data_folder}/librispeech/test-other.tar.gz 21 | 22 | # folder to write train/val/test shards into 23 | shards_folder: ${data_folder}/librispeech_shards 24 | 25 | # temporary working directory for shard creation process 26 | extraction_folder: ${temp_folder}/librispeech 27 | 28 | # collation strategy 29 | train_collate_fn: default 30 | val_collate_fn: default 31 | test_collate_fn: default 32 | 33 | # add side info (in order to ease debugging data pipeline at the cost of 34 | # slowing down the iter/sec) 35 | add_side_info: False 36 | 37 | # limit the amount of samples to a certain amount - useful for debugging 38 | # whether a model can overfit on a small amount of data. 
39 | # No limit when value is <= 0 40 | limit_samples: -1 -------------------------------------------------------------------------------- /config/data/module/voxceleb1.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb1_pairs.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: pairwise_categorical 74 | 75 | # distribution of pos/neg pairs in batch 76 | pos_neg_training_batch_ratio: 0.5 77 | yield_limit: null -------------------------------------------------------------------------------- /config/data/module/voxceleb1_triplets.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical_triplets -------------------------------------------------------------------------------- /config/data/module/voxceleb2.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_2 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.99 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical 74 | -------------------------------------------------------------------------------- /config/data/module/voxceleb2_pairs.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_2 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | 46 | # The paths to the zipfile containing 47 | # the voxceleb1 training and test data 48 | # Values are ignored if `use_voxceleb1=False` 49 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 50 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 51 | 52 | # The paths to the zipfile containing 53 | # the voxceleb2 training and test data 54 | # Values are ignored if `use_voxceleb2=False` 55 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 56 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 57 | 58 | # collation strategy 59 | train_collate_fn: pad_right 60 | val_collate_fn: default 61 | test_collate_fn: default 62 | 63 | # add side info (in order to ease debugging data pipeline at the cost of 64 | # slowing down the iter/sec) 65 | add_batch_debug_info: False 66 | 67 | # limit the amount of samples to a certain amount - useful for debugging 68 | # whether a model can overfit on a small amount of data. 69 | # No limit when value is <= 0 70 | limit_samples: -1 71 | 72 | # each sample in a batch consists of two audio samples which are either 73 | # from the same speaker or from different speakers. 
74 | batch_processing_mode: pairwise_categorical 75 | 76 | # distribution of pos/neg pairs in batch 77 | pos_neg_training_batch_ratio: 0.5 78 | yield_limit: null -------------------------------------------------------------------------------- /config/data/module/voxceleb2_test_everyone.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/list_test_all2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_test_all_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb2_test_hard.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/list_test_hard2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_test_hard_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb2_triplets.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical_triplets 74 | -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_base_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_train 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_val 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_train: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 3 19 | 20 | selector_val: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | normalizer: 27 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 28 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_full_seq_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | 4 | val_pipeline: 5 | - normalizer 6 | 7 | test_pipeline: 8 | # assume batch size of 1 due to no selector (and therefore tensors have 9 | # different dimensions and cannot be collated without padding 10 | - normalizer 11 | 12 | selector_contiguous: 13 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 14 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 15 | selection_strategy: contiguous 16 | desired_chunk_length_sec: 3 17 | 18 | selector_start: 19 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 20 | # one of 'start', 'end', 'random', 'random_contiguous' 21 | selection_strategy: start 22 | desired_chunk_length_sec: 3 23 | 24 | filterbank: 25 | _target_: src.data.preprocess.audio_features.FilterBank 26 | 27 | normalizer: 28 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 29 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_pair_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_contiguous 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_start 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_contiguous: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 3 19 | 20 | selector_start: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | filterbank: 27 | _target_: src.data.preprocess.audio_features.FilterBank 28 | 29 | normalizer: 30 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 31 | normalize_over_channels: false 
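The pipeline configs above all follow the same pattern: the train_pipeline / val_pipeline / test_pipeline lists name sibling nodes in the same file, and each named node carries a _target_ pointing at a preprocessor class under src.data.preprocess. Below is a rough sketch of how such a file can be resolved with Hydra's instantiate utility; it is an illustration rather than the repository's actual loading code, and it assumes the src package is importable and that each preprocessor is callable on an audio tensor.

from hydra.utils import instantiate
from omegaconf import OmegaConf

# load one of the pipeline configs shown above
cfg = OmegaConf.load("config/data/pipeline/wav2vec_pair_pipeline.yaml")

# every entry in `train_pipeline` refers to a node in the same file whose
# `_target_` names the preprocessor class to build
train_preprocessors = [instantiate(cfg[name]) for name in cfg.train_pipeline]

def run_train_pipeline(audio, preprocessors=train_preprocessors):
    # preprocessors are applied in the listed order (assumed call interface)
    for preprocessor in preprocessors:
        audio = preprocessor(audio)
    return audio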
-------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_short_seq_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_contiguous 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_start 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_contiguous: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 0.4 19 | 20 | selector_start: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | filterbank: 27 | _target_: src.data.preprocess.audio_features.FilterBank 28 | 29 | normalizer: 30 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 31 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/xvector_all_augment_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_drop_time 24 | - augment_drop_freqs 25 | - augment_change_speed 26 | - augment_add_reverb 27 | - augment_add_noise 28 | 29 | # selects a random audio chunk 30 | selector_contiguous: 31 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 32 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 33 | selection_strategy: contiguous 34 | desired_chunk_length_sec: 3 35 | 36 | # selects the first x seconds of audio 37 | selector_start: 38 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 39 | # one of 'start', 'end', 'random', 'random_contiguous' 40 | selection_strategy: start 41 | desired_chunk_length_sec: 3 42 | 43 | # converts wav to mel filterbanks 44 | filterbank: 45 | _target_: src.data.preprocess.audio_features.FilterBank 46 | 47 | # normalizes filterbanks to 0 mean and unit variance 48 | normalizer: 49 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 50 | normalize_over_channels: true 51 | 52 | # augmentation preprocessors to use 53 | augmenter: 54 | _target_: src.data.preprocess.augment.Augmenter 55 | yield_intermediate_augmentations: True 56 | yield_unaugmented: True 57 | stack_augmentations: False 58 | 59 | # randomly drop `x` seconds of audio 60 | augment_drop_time: 61 | _target_: src.data.preprocess.augment.TimeDropoutAugment 62 | sample_rate: 16000 63 | max_dropout_length_seconds: 0.25 64 | min_drop_count: 0 65 | max_drop_count: 5 66 | 67 | # randomly drops certain frequency bands from the audio signal 68 | 
augment_drop_freqs: 69 | _target_: src.data.preprocess.augment.FrequencyDropoutAugment 70 | sample_rate: 16000 71 | min_drop_count: 0 72 | max_drop_count: 5 73 | band_scaling: 1 74 | 75 | # randomly slows down or speeds up the audio 76 | augment_change_speed: 77 | _target_: src.data.preprocess.augment.ChoiceSpeedAugment 78 | sample_rate: 16000 79 | possible_speed_factors: [0.95, 1, 1.05] 80 | 81 | # randomly adds reverb 82 | augment_add_reverb: 83 | _target_: src.data.preprocess.augment.ReverbAugment 84 | sample_rate: 16000 85 | room_scale_min: 0 86 | room_scale_max: 100 87 | 88 | # randomly adds uniform noise to the audio 89 | augment_add_noise: 90 | _target_: src.data.preprocess.augment.ChoiceNoiseAugment 91 | sample_rate: 16000 92 | snr_choices: [15, 20, 100] 93 | -------------------------------------------------------------------------------- /config/data/pipeline/xvector_dropout_augment_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_drop_time 24 | - augment_drop_freqs 25 | - augment_change_speed 26 | 27 | # selects a random audio chunk 28 | selector_contiguous: 29 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 30 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 31 | selection_strategy: contiguous 32 | desired_chunk_length_sec: 3 33 | 34 | # selects the first x seconds of audio 35 | selector_start: 36 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 37 | # one of 'start', 'end', 'random', 'random_contiguous' 38 | selection_strategy: start 39 | desired_chunk_length_sec: 3 40 | 41 | # converts wav to mel filterbanks 42 | filterbank: 43 | _target_: src.data.preprocess.audio_features.FilterBank 44 | 45 | # normalizes filterbanks to 0 mean and unit variance 46 | normalizer: 47 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 48 | normalize_over_channels: true 49 | 50 | # augmentation preprocessors to use 51 | augmenter: 52 | _target_: src.data.preprocess.augment.Augmenter 53 | yield_intermediate_augmentations: True 54 | yield_unaugmented: True 55 | stack_augmentations: False 56 | 57 | # randomly drop `x` seconds of audio 58 | augment_drop_time: 59 | _target_: src.data.preprocess.augment.TimeDropoutAugment 60 | sample_rate: 16000 61 | max_dropout_length_seconds: 0.25 62 | min_drop_count: 0 63 | max_drop_count: 5 64 | 65 | # randomly drops certain frequency bands from the audio signal 66 | augment_drop_freqs: 67 | _target_: src.data.preprocess.augment.FrequencyDropoutAugment 68 | sample_rate: 16000 69 | min_drop_count: 0 70 | max_drop_count: 5 71 | band_scaling: 1 72 | 73 | # randomly slows down or speeds up the audio 74 | augment_change_speed: 75 | _target_: src.data.preprocess.augment.ChoiceSpeedAugment 76 | sample_rate: 16000 77 | possible_speed_factors: [0.95, 1, 1.05] 78 | -------------------------------------------------------------------------------- 
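The x-vector pipelines above all end with the same two steps: a FilterBank preprocessor that turns the waveform into mel filterbank features, and an InputNormalizer2D with normalize_over_channels: true. Below is a rough torchaudio-based stand-in for those two steps, assuming the filterbank is a log-mel spectrogram and that channel normalisation means zero mean and unit variance per mel channel over time (both are assumptions about the repository's implementation); n_mels=40 follows the value set in xvector_pipeline.yaml.

import torch
import torchaudio

# assumed stand-in for src.data.preprocess.audio_features.FilterBank
mel = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=40)

def filterbank_and_normalize(waveform: torch.Tensor) -> torch.Tensor:
    # waveform: [1, num_samples] -> log-mel features: [n_mels, num_frames]
    features = mel(waveform).squeeze(0).clamp(min=1e-10).log()
    # normalize_over_channels=true: per-channel statistics over the time axis
    mean = features.mean(dim=1, keepdim=True)
    std = features.std(dim=1, keepdim=True)
    return (features - mean) / (std + 1e-8)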
/config/data/pipeline/xvector_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - selector_train 3 | - filterbank 4 | - normalizer 5 | 6 | val_pipeline: 7 | - selector_val 8 | - filterbank 9 | - normalizer 10 | 11 | test_pipeline: 12 | # assume batch size of 1 due to no selector (and therefore tensors have 13 | # different dimensions and cannot be collated without padding 14 | - filterbank 15 | - normalizer 16 | 17 | selector_train: 18 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 19 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 20 | selection_strategy: random 21 | desired_chunk_length_sec: 3 22 | 23 | selector_val: 24 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 25 | # one of 'start', 'end', 'random', 'random_contiguous' 26 | selection_strategy: start 27 | desired_chunk_length_sec: 3 28 | 29 | filterbank: 30 | _target_: src.data.preprocess.audio_features.FilterBank 31 | n_mels: 40 32 | 33 | normalizer: 34 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 35 | normalize_over_channels: true -------------------------------------------------------------------------------- /config/data/pipeline/xvector_rirs_augment.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_add_rirs 24 | 25 | # selects a random audio chunk 26 | selector_contiguous: 27 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 28 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 29 | selection_strategy: contiguous 30 | desired_chunk_length_sec: 3 31 | 32 | # selects the first x seconds of audio 33 | selector_start: 34 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 35 | # one of 'start', 'end', 'random', 'random_contiguous' 36 | selection_strategy: start 37 | desired_chunk_length_sec: 3 38 | 39 | # converts wav to mel filterbanks 40 | filterbank: 41 | _target_: src.data.preprocess.audio_features.FilterBank 42 | 43 | # normalizes filterbanks to 0 mean and unit variance 44 | normalizer: 45 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 46 | normalize_over_channels: true 47 | 48 | # augmentation preprocessors to use 49 | augmenter: 50 | _target_: src.data.preprocess.augment.Augmenter 51 | yield_intermediate_augmentations: True 52 | yield_unaugmented: True 53 | stack_augmentations: False 54 | 55 | # randomly drop `x` seconds of audio 56 | augment_add_rirs: 57 | _target_: src.data.preprocess.augment.ChoiceRirsNoiseAugment 58 | sample_rate: 16000 59 | snr_choices: [5] 60 | shards_folder: ${data_folder}/rirs_shards 61 | -------------------------------------------------------------------------------- /config/data/shards/shards_librispeech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | 
_target_: src.data.common.WebDataSetShardConfig 3 | 4 | # amount of training samples stored per shard 5 | samples_per_shard: 155000 6 | 7 | # whether to compress the shards 8 | use_gzip_compression: false 9 | 10 | # whether to use shards in random order 11 | shuffle_shards: True 12 | 13 | # queue from which samples are extracted 14 | # in order to create batches with higher variance 15 | queue_size: 200 -------------------------------------------------------------------------------- /config/data/shards/shards_voxceleb.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.WebDataSetShardConfig 3 | 4 | # amount of training samples stored per shard 5 | samples_per_shard: 5000 6 | 7 | # whether to compress the shards 8 | use_gzip_compression: true 9 | 10 | # whether to use shards in random order 11 | shuffle_shards: True 12 | 13 | # queue from which samples are extracted 14 | # in order to create batches with higher variance 15 | queue_size: 1024 -------------------------------------------------------------------------------- /config/evaluator/cosine_distance.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.speaker.cosine_distance.CosineDistanceEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: False 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: False 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_num_training_samples: 0 -------------------------------------------------------------------------------- /config/evaluator/cosine_distance_with_train_data.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.speaker.cosine_distance.CosineDistanceEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: True 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: True 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_num_training_samples: 1000 -------------------------------------------------------------------------------- /config/evaluator/lda.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.lda.LDAEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: True 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: True 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_training_batches_to_fit: 30000 # exhaust a single training epoch 12 | 13 | # number of PCA components 14 | num_pca_components: 150 15 | 16 | # center the training batches before training LDA model 17 | center_before_fit_training_batches: true -------------------------------------------------------------------------------- /config/evaluator/plda.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.plda.PLDAEvaluator 2 | 3 | # number of PCA components 4 | num_lda_pca_components: 50 5 | 6 | # number of PCA components 7 | num_plda_pca_components: 50 8 | 9 | # 
number of iterations to train PLDA for 10 | max_iterations: 1 11 | 12 | # maximum number of samples to use to fit mean/std parameters 13 | # when `use_centering` is True 14 | max_training_batches_to_fit: 300 # exhaust a single training epoch 15 | -------------------------------------------------------------------------------- /config/experiment/speaker_dummy.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1 5 | - override /data/pipeline: xvector_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: dummy 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: cross_entropy 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 16 19 | 20 | project_name: dummy-network -------------------------------------------------------------------------------- /config/experiment/speaker_ecapa_tdnn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: xvector_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: ecapa_tdnn 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: aam_softmax 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 32 19 | 20 | data: 21 | pipeline: 22 | filterbank: 23 | n_mels: 40 24 | 25 | optim: 26 | loss: 27 | input_features: 192 28 | output_features: 5994 # only on voxceleb2 dev 29 | 30 | 31 | project_name: ecapa-tdnn -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_aam.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: aam_softmax 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-sv-aam -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_ce.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: cross_entropy 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-sv-ce -------------------------------------------------------------------------------- 
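The experiment files above are Hydra config groups: each one overrides parts of the base train_eval.yaml composition and is typically selected on the command line with an override such as `+experiment=speaker_wav2vec2_ce` (the same form that appears in the .hydra/overrides.yaml files further below). As a minimal, illustrative sketch of how such a composition can be inspected programmatically, the Python snippet below uses Hydra's compose API. The `config_path` and the exact API location depend on where the script lives and on the installed Hydra version, and the repository's real entry point (src.main in the run logs) additionally registers custom resolvers such as `random_uuid` and reads the `.env` variables, so treat this purely as a sketch.

# Illustrative only: compose the training config with an experiment override and inspect it.
# `config_path` is assumed to point at the `config` directory shown in this repository.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(config_path="config"):
    cfg = compose(
        config_name="train_eval",
        overrides=["+experiment=speaker_wav2vec2_ce", "optim.algo.lr=5e-5"],
    )
    print(cfg.trainer.max_steps)  # 100000, set by the experiment file
    print(cfg.optim.algo.lr)      # 5e-05, overridden on the "command line"
    # instantiating a config node requires the repository's `src` package to be importable
    loss_fn = instantiate(cfg.optim.loss)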
/config/experiment/speaker_wav2vec2_ctc.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: ctc 12 | - override /trainer: trainer 13 | 14 | network: 15 | stat_pooling_type: none 16 | test_stat_pooling_type: mean+std 17 | 18 | trainer: 19 | max_steps: 100_000 20 | val_check_interval: 5000 21 | precision: 16 22 | 23 | project_name: wav2vec2-sv-ctc -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_pairs.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2_pairs 5 | - override /data/pipeline: wav2vec_pair_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_paired 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: binary_cross_entropy 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | data: 20 | dataloader: 21 | train_batch_size: 32 22 | 23 | project_name: wav2vec2-paired -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_triplet.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1_and_2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: triplet 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-triplet 20 | 21 | data: 22 | module: 23 | enforce_triplets: true -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_triplet_ce.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1_and_2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: triplet_ce 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-triplet-ce 20 | 21 | data: 22 | module: 23 | enforce_triplets: true -------------------------------------------------------------------------------- /config/experiment/speaker_xvector.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: xvector_pipeline 6 | - override 
/data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: xvector 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: cross_entropy 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 32 19 | 20 | data: 21 | pipeline: 22 | filterbank: 23 | n_mels: 40 24 | 25 | project_name: xvector-sv-ce -------------------------------------------------------------------------------- /config/experiment/speech_wav2vec2_ctc.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /callbacks: default_speech 5 | - override /data/module: librispeech 6 | - override /data/pipeline: wav2vec_full_seq_pipeline 7 | - override /data/dataloader: speech 8 | - override /data/shards: shards_librispeech 9 | - override /network: wav2vec2_fc_letter 10 | - override /tokenizer: default 11 | - override /optim/algo: adam 12 | - override /optim/schedule: one_cycle 13 | - override /optim/loss: ctc 14 | - override /trainer: trainer 15 | 16 | trainer: 17 | max_steps: 100_000 18 | precision: 16 19 | 20 | project_name: wav2vec2-librispeech 21 | 22 | optim: 23 | algo: 24 | lr: 1e-4 -------------------------------------------------------------------------------- /config/hydra/launcher/slurm.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.launcher 2 | 3 | _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher 4 | 5 | submitit_folder: ${hydra.sweep.dir}/.submitit/%j 6 | timeout_min: 4320 7 | cpus_per_task: 6 8 | gpus_per_node: ${gpus} 9 | tasks_per_node: 1 10 | mem_gb: 20 11 | nodes: 1 12 | name: ${hydra.job.name} 13 | partition: das 14 | comment: null 15 | constraint: null 16 | exclude: null 17 | signal_delay_s: 120 18 | max_num_timeout: 0 19 | additional_parameters: { "mail-user": "nvaessen", "mail-type": "BEGIN,END,FAIL" } 20 | array_parallelism: 4 21 | -------------------------------------------------------------------------------- /config/network/dummy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.lightning_modules.speaker.dummy.DummyModuleConfig 2 | -------------------------------------------------------------------------------- /config/network/ecapa_tdnn.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.ecapa_tdnn.EcapaTDNNModuleConfig 3 | 4 | input_mel_coefficients: ${data.pipeline.filterbank.n_mels} 5 | lin_neurons: 192 6 | 7 | channels: 8 | - 1024 9 | - 1024 10 | - 1024 11 | - 1024 12 | - 3072 13 | 14 | kernel_sizes: 15 | - 5 16 | - 3 17 | - 3 18 | - 3 19 | - 1 20 | dilations: 21 | - 1 22 | - 2 23 | - 3 24 | - 4 25 | - 1 26 | 27 | attention_channels: 128 28 | res2net_scale: 8 29 | se_channels: 128 30 | global_context: True 31 | 32 | pretrained_weights_path: null 33 | 34 | # optional explicit overwrite of embedding size and/or num speakers 35 | # (e.g if you need to load finetuned weights but want to experiment with another 36 | # pooling type in the evaluation or test on a dataset with different num speakers) 37 | explicit_stat_pool_embedding_size: null 38 | explicit_num_speakers: null 
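Note that several values in the network configs, such as `input_mel_coefficients: ${data.pipeline.filterbank.n_mels}` in the ECAPA-TDNN config above, are OmegaConf interpolations rather than literals, so changing the filterbank settings in the data pipeline automatically propagates into the network config. A small self-contained sketch of that mechanism (made-up literal values, not the repository's code):

# Sketch of OmegaConf interpolation as used by input_mel_coefficients above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "data": {"pipeline": {"filterbank": {"n_mels": 40}}},
        "network": {"input_mel_coefficients": "${data.pipeline.filterbank.n_mels}"},
    }
)
assert cfg.network.input_mel_coefficients == 40  # resolved lazily at access time
cfg.data.pipeline.filterbank.n_mels = 80
assert cfg.network.input_mel_coefficients == 80  # follows the referenced value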
-------------------------------------------------------------------------------- /config/network/wav2spk.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2spk.Wav2SpkModuleConfig 3 | 4 | # whether to use temporal gating after the feature encoder 5 | apply_temporal_gating: true 6 | 7 | # structure of fc head (excluding the last layer, which always has NUM_SPEAKERS 8 | # output nodes 9 | hidden_fc_layers_out: 10 | - 512 11 | - 128 12 | 13 | # Which FC hidden layer to use as speaker embedding for EER evaluation 14 | # should be a valid index from the list `hidden_fc_layers_out`, 15 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 16 | # or -1 when you want to use the stat-pooled wav2vec embeddings 17 | embedding_layer_idx: 0 18 | 19 | # which type of statistical pooling to use ('mean' or 'mean+std') 20 | stat_pooling_type: mean+std -------------------------------------------------------------------------------- /config/network/wav2vec2_fc.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec2_fc.Wav2vec2FCModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # settings related to wav2vec2 architecture 12 | wav2vec_feature_encoder_only: false 13 | 14 | # whether to freeze the feature encoder part 15 | # of the network for the whole training run 16 | completely_freeze_feature_extractor: true 17 | 18 | # initially freeze wav2vec model 19 | wav2vec_initially_frozen: false 20 | 21 | # number of steps before the wav2vec model is unfrozen 22 | # (if initially frozen at all) 23 | # if set to null, wav2vec will never be unfrozen 24 | num_frozen_steps: 10000 25 | 26 | # structure of fc head (excluding the last layer, which is always NUM_SPEAKERS soft max 27 | # classification) 28 | hidden_fc_layers_out: 29 | [] # empty list means we have only 1 fc layer with NUM_SPEAKER (softmax) embeddings 30 | # - 1024 31 | # - 512 32 | 33 | # Which hidden layer to use as speaker embedding for EER evaluation 34 | # should be a valid index from the list `hidden_fc_layers_out`, 35 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 36 | # or -1 when you want to use the stat-pooled wav2vec embeddings 37 | embedding_layer_idx: -1 38 | 39 | # which type of statistical pooling to use ('mean', 'mean+std' or 'attentive') 40 | stat_pooling_type: mean+std 41 | test_stat_pooling_type: ${network.stat_pooling_type} 42 | 43 | # probability of regularization techniques during training 44 | # dropout 45 | activation_dropout: 0.0 # in feed-forward module of transformer layer 46 | attention_dropout: 0.1 # in attention module of transformer layer 47 | feat_proj_dropout: 0.1 # in feature projection module 48 | hidden_dropout: 0.1 # between residual connections in transformer layer 49 | 50 | # layer skip in transformer 51 | layerdrop: 0.05 52 | 53 | # specaugment 54 | # feature 55 | mask_feature_length: 10 56 | mask_feature_prob: 0.0 57 | 58 | # time 59 | mask_time_length: 10 60 | mask_time_prob: 0.05 61 | 62 | # augment on FINAL TOKENS 63 | final_channel_mask_prob: 0 64 | 
final_channel_mask_width: 5 65 | 66 | # optional explicit overwrite of embedding size and/or num speakers 67 | # (e.g if you need to load finetuned weights but want to experiment with another 68 | # pooling type in the evaluation or test on a dataset with different num speakers) 69 | explicit_stat_pool_embedding_size: null 70 | explicit_num_speakers: null 71 | 72 | use_transformers_as_ensembles: False 73 | num_ensembles: 1 -------------------------------------------------------------------------------- /config/network/wav2vec2_fc_letter.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speech.wav2vec2_fc_letter.Wav2vec2FcLetterRecognizerConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # initially freeze wav2vec model 12 | wav2vec_initially_frozen: false 13 | 14 | # whether to freeze the feature encoder part 15 | # of the network for the whole training run 16 | completely_freeze_feature_extractor: true 17 | 18 | # number of steps before the wav2vec model is unfrozen 19 | # (if initially frozen at all) 20 | # if set to null, wav2vec will never be unfrozen 21 | num_frozen_steps: 10000 22 | 23 | # mask (dropout of embedding tensor) settings 24 | timestep_mask_prob: 0 25 | timestep_mask_width: 10 26 | channel_mask_prob: 0 27 | channel_mask_width: 64 28 | -------------------------------------------------------------------------------- /config/network/wav2vec2_paired.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec2_paired_input.Wav2vec2PairedSpeakerModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # initially freeze wav2vec model 12 | wav2vec_initially_frozen: false 13 | 14 | # number of steps before the wav2vec model is unfrozen 15 | # (if initially frozen at all) 16 | # if set to null, wav2vec will never be unfrozen 17 | num_frozen_steps: 10000 18 | 19 | # whether to freeze the feature encoder part 20 | # of the network for the whole training run 21 | completely_freeze_feature_extractor: true 22 | 23 | # whether to freeze the feature projection part 24 | # of the network for the whole training run 25 | completely_freeze_feature_projector: false 26 | 27 | # probability of regularization techniques during training 28 | # dropout 29 | activation_dropout: 0.0 # in feed-forward module of transformer layer 30 | attention_dropout: 0.1 # in attention module of transformer layer 31 | feat_proj_dropout: 0.1 # in feature projection module 32 | hidden_dropout: 0.1 # between residual connections in transformer layer 33 | 34 | # layer skip in transformer 35 | layerdrop: 0.05 36 | 37 | # specaugment 38 | # feature 39 | mask_feature_length: 10 40 | mask_feature_prob: 0.0 41 | 42 | # time 43 | mask_time_length: 10 44 | mask_time_prob: 0.05 45 | 46 | # augment on FINAL TOKENS 47 | final_channel_mask_prob: 0 48 | final_channel_mask_width: 5 -------------------------------------------------------------------------------- 
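The wav2vec2-based network configs above expose `stat_pooling_type` (e.g. `mean+std`), which collapses the per-frame wav2vec2 embeddings into a single utterance-level vector before the FC head; the run logs later in this document list the corresponding module as MeanStdStatPool1D. Below is a minimal, generic sketch of what mean+std pooling computes. It is not the repository's implementation (shape conventions and numerical details may differ), and the 768-dimensional frame size is simply the hidden size of the assumed facebook/wav2vec2-base checkpoint.

# Generic sketch of 'mean+std' statistical pooling over time (not the repository's code).
import torch


def mean_std_pool(frames: torch.Tensor) -> torch.Tensor:
    """Pool [batch, time, channels] frame embeddings into [batch, 2 * channels]."""
    mean = frames.mean(dim=1)
    std = frames.std(dim=1)
    return torch.cat([mean, std], dim=1)


pooled = mean_std_pool(torch.randn(8, 249, 768))
print(pooled.shape)  # torch.Size([8, 1536]); cf. input_features: 1536 in aam_softmax.yaml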
/config/network/wav2vec_fc.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec_fc.Wav2vecFCModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_model_path: ${data_folder}/pretrained_models/wav2vec/wav2vec_large.pt 6 | 7 | # whether to use the aggregation layers in wav2vec model 8 | use_aggregation_layers: true 9 | 10 | # whether to use reset the pretrained weights 11 | # and start from a fresh initialization 12 | reset_weights: false 13 | 14 | # initially freeze wav2vec model 15 | wav2vec_initially_frozen: true 16 | 17 | # number of steps before the wav2vec model is unfrozen 18 | # (if initially frozen at all) 19 | # if set to null, wav2vec will never be unfrozen 20 | num_frozen_steps: 10000 21 | 22 | # structure of fc head (excluding the last layer, which is always NUM_SPEAKERS soft max 23 | # classification) 24 | hidden_fc_layers_out: 25 | - 1024 26 | - 512 27 | # [] # empty list means we have only 1 fc layer with NUM_SPEAKER (softmax) embeddings 28 | 29 | # Which hidden layer to use as speaker embedding for EER evaluation 30 | # should be a valid index from the list `hidden_fc_layers_out`, 31 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 32 | # or -1 when you want to use the stat-pooled wav2vec embeddings 33 | embedding_layer_idx: 1 34 | 35 | # which type of statistical pooling to use ('mean' or 'mean+std') 36 | stat_pooling_type: mean+std -------------------------------------------------------------------------------- /config/network/wav2vec_xvector.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec_xvector.Wav2vecXVectorModuleConfig 3 | 4 | tdnn_blocks: 5 5 | tdnn_channels: [512, 512, 512, 512, 1500] 6 | tdnn_kernel_sizes: [5, 3, 3, 1, 1] 7 | tdnn_dilations: [1, 2, 3, 1, 1] 8 | lin_neurons: 512 9 | in_channels: 512 # wav2vec has 512 features 10 | 11 | # pretrained weights of wav2vec model 12 | wav2vec_model_path: ${data_folder}/pretrained_models/wav2vec/wav2vec_large.pt 13 | 14 | # whether to use the aggregation layers in wav2vec model 15 | use_aggregation_layers: true 16 | 17 | # initially freeze wav2vec model 18 | wav2vec_initially_frozen: true 19 | 20 | # number of steps before the wav2vec model is unfrozen 21 | # (if initially frozen at all) 22 | num_frozen_steps: 10000 -------------------------------------------------------------------------------- /config/network/xvector.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.xvector.XVectorModuleConfig 3 | 4 | tdnn_blocks: 5 5 | tdnn_channels: [512, 512, 512, 512, 1500] 6 | tdnn_kernel_sizes: [5, 3, 3, 1, 1] 7 | tdnn_dilations: [1, 2, 3, 1, 1] 8 | lin_neurons: 512 9 | in_channels: 40 # depends on values in data.pipeline 10 | 11 | # optional explicit overwrite of embedding size and/or num speakers 12 | # (e.g if you need to load finetuned weights but want to experiment with another 13 | # pooling type in the evaluation or test on a dataset with different num speakers) 14 | explicit_stat_pool_embedding_size: null 15 | explicit_num_speakers: null 
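The x-vector style configs above parameterize the TDNN trunk as parallel lists (`tdnn_channels`, `tdnn_kernel_sizes`, `tdnn_dilations`). As a rough illustration of how such lists can describe a stack of dilated 1D convolutions, here is a generic sketch; the repository's actual x-vector module (its activations, normalization, padding, and the statistics pooling on top) is not reproduced here.

# Rough sketch: turning the tdnn_* lists above into a stack of dilated Conv1d layers.
import torch
from torch import nn


def build_tdnn(in_channels, channels, kernel_sizes, dilations):
    layers = []
    for out_channels, kernel, dilation in zip(channels, kernel_sizes, dilations):
        layers += [
            nn.Conv1d(in_channels, out_channels, kernel, dilation=dilation),
            nn.ReLU(),
            nn.BatchNorm1d(out_channels),
        ]
        in_channels = out_channels
    return nn.Sequential(*layers)


# values taken from config/network/xvector.yaml; input is [batch, n_mels, frames]
tdnn = build_tdnn(40, [512, 512, 512, 512, 1500], [5, 3, 3, 1, 1], [1, 2, 3, 1, 1])
print(tdnn(torch.randn(4, 40, 300)).shape)  # e.g. torch.Size([4, 1500, 286])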
-------------------------------------------------------------------------------- /config/optim/algo/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | 3 | # learning rate 4 | lr: 1e-4 5 | 6 | # weight decay (l2 regression) 7 | weight_decay: 0 8 | 9 | # beta constants for running mean of gradient and square of gradient 10 | betas: [0.9, 0.999] 11 | 12 | # epsilon term for numerical stability 13 | eps: 1e-8 14 | 15 | # use AMSGRAD version of ADAM 16 | amsgrad: false 17 | -------------------------------------------------------------------------------- /config/optim/algo/sgd.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.SGD 2 | 3 | # learning rate 4 | lr: 3e-3 5 | 6 | # momentum: 7 | momentum: 0.9 8 | 9 | # weight decay (l2 regression) 10 | weight_decay: 0 11 | 12 | # momentum dampening 13 | dampening: 0 14 | 15 | # nesterov 16 | nesterov: True -------------------------------------------------------------------------------- /config/optim/loss/aam_softmax.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.aam_softmax.AngularAdditiveMarginSoftMaxLoss 2 | 3 | input_features: 1536 # for mean+std embeddings 4 | output_features: 5994 # only on voxceleb2 dev 5 | 6 | margin: 0.2 7 | scale: 30 8 | -------------------------------------------------------------------------------- /config/optim/loss/binary_cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.binary_cross_entropy.BinaryCrossEntropyLoss 2 | -------------------------------------------------------------------------------- /config/optim/loss/cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.cross_entropy.CrossEntropyLoss 2 | -------------------------------------------------------------------------------- /config/optim/loss/ctc.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.ctc_loss.CtcLoss -------------------------------------------------------------------------------- /config/optim/loss/triplet.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.triplet_loss.TripletLoss 2 | 3 | margin: 1 4 | -------------------------------------------------------------------------------- /config/optim/loss/triplet_ce.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.TripletCrossEntropyLoss 2 | 3 | # weighting which will be multiplied with cross-entropy loss 4 | c_ce: 1 5 | 6 | # weighting which will be multiplied with triplet loss 7 | c_triplet: 1 8 | -------------------------------------------------------------------------------- /config/optim/schedule/constant.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.StepLR 4 | 5 | # number of epochs between consecutive steps 6 | step_size: 1 7 | 8 | # factor by which to multiply the learning rate every `step_size` epochs 9 | gamma: 1 10 | 11 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 12 | last_epoch: -1 13 | 14 | # print to STDOUT when making a step 15 | verbose: false 16 | 17 | # optional value to track which is fed into the step() call 18 | # only relevant for learning rate schedulers such 19 | # as `reduce on plateau` 20 | monitor: null 21 | 22 | # whether to step every epoch or every step 23 | interval: epoch 24 | 25 | # amount of epochs/steps between consecutive step() calls 26 | frequency: null 27 | 28 | # name to log the learning rate as 29 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/cyclic.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.CyclicLR 4 | 5 | # the lowest lr in the cycle 6 | base_lr: 1e-4 7 | 8 | # the peak lr in the cycle 9 | max_lr: 0.02 10 | 11 | # number of steps to go from base_lr to max_lr 12 | step_size_up: 2500 13 | 14 | # number of steps to go from max+lr to base_lr 15 | step_size_down: 2500 16 | 17 | # Adam doesn't have `momentum` parameter, can only be true with SGD 18 | cycle_momentum: False 19 | 20 | # shape of line (triangular=linearly increasing/decreasing) 21 | mode: triangular 22 | 23 | # optional value to track which is fed into the step() call 24 | # only relevant for learning rate schedulers such 25 | # as `reduce on plateau` 26 | monitor: null 27 | 28 | # whether to step every epoch or every step 29 | interval: step 30 | 31 | # amount of epochs/steps between consecutive step() calls 32 | frequency: null 33 | 34 | # name to log the learning rate as 35 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/exp_decay.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.LambdaLR 4 | 5 | # A function which computes a multiplicative factor given an integer parameter 6 | lr_lambda: 7 | _target_: src.optim.schedule.tri_stage.TriStageLearningRateLambdaLRFunction 8 | max_steps: ${trainer.max_steps} 9 | warmup_stage_ratio: 0 10 | constant_stage_ratio: 0 11 | decay_stage_ratio: 1 12 | initial_lr: ${optim.algo.lr} 13 | base_lr: ${optim.algo.lr} 14 | final_lr: 5e-6 15 | 16 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 17 | last_epoch: -1 18 | 19 | # print to STDOUT when making a step 20 | verbose: false 21 | 22 | # optional value to track which is fed into the step() call 23 | # only relevant for learning rate schedulers such 24 | # as `reduce on plateau` 25 | monitor: null 26 | 27 | # whether to step every epoch or every step 28 | interval: step 29 | 30 | # amount of epochs/steps between consecutive step() calls 31 | frequency: null 32 | 33 | # name to log the learning rate as 34 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/one_cycle.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.OneCycleLR 4 | 5 | # maximum learning rate to reach in the cycle 6 | max_lr: ${optim.algo.lr} 7 | 8 | # the amount of steps in the training 9 | total_steps: ${trainer.max_steps} 10 | 11 | # the initial learning rate is max_lr / div_factor 12 | div_factor: 25 13 | 14 | # optional value to track which is fed into the step() call 15 | # only relevant for learning rate schedulers such 16 | # as `reduce on plateau` 17 | monitor: null 18 | 19 | # whether to step every epoch or every step 20 | interval: step 21 | 22 | # amount of epochs/steps between consecutive step() calls 23 | frequency: null 24 | 25 | # name to log the learning rate as 26 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/reduce_on_plateau.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 4 | 5 | # whether the monitored value is minimized or maximized 6 | mode: min 7 | 8 | # factor by which to reduce the lr when it has plateaued 9 | factor: 0.1 10 | 11 | # number of epochs with no improvement after which learning rate will be reduced. 12 | # Be careful with setting this value when also using early stopping 13 | patience: 3 14 | 15 | # Threshold for measuring the new optimum, to only focus on significant changes 16 | threshold: 1e-2 17 | 18 | # Number of epochs to wait before resuming normal operation after lr has been reduced 19 | cooldown: 0 20 | 21 | # A lower bound on the learning rate 22 | min_lr: 0 23 | 24 | # optional value to track which is fed into the step() call 25 | # only relevant for learning rate schedulers such 26 | # as `reduce on plateau` 27 | monitor: val_eer 28 | 29 | # whether to step every epoch or every step 30 | interval: epoch 31 | 32 | # amount of epochs/steps between consecutive step() calls 33 | frequency: null 34 | 35 | # name to log the learning rate as 36 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/schedule_wav2spk.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.MultiStepLR 4 | 5 | # list of steps at which to decrease LR (assuming batch size 32) 6 | milestones: 7 | - 300_000 8 | - 450_000 9 | - 600_000 10 | - 750_000 11 | 12 | # factor by which to multiply the learning rate every `step_size` epochs 13 | gamma: 0.1 14 | 15 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 16 | last_epoch: -1 17 | 18 | # print to STDOUT when making a step 19 | verbose: false 20 | 21 | # optional value to track which is fed into the step() call 22 | # only relevant for learning rate schedulers such 23 | # as `reduce on plateau` 24 | monitor: null 25 | 26 | # whether to step every epoch or every step 27 | interval: step 28 | 29 | # amount of epochs/steps between consecutive step() calls 30 | frequency: null 31 | 32 | # name to log the learning rate as 33 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/schedule_wav2vec_fan_etal.yaml: -------------------------------------------------------------------------------- 1 | # schedule to train wav2vec + fc layer as described in 2 | # EXPLORING WAV2VEC 2.0 ON SPEAKER VERIFICATION AND LANGUAGE IDENTIFICATION 3 | # https://arxiv.org/abs/2012.06185 4 | 5 | # the scheduler object to use 6 | scheduler: 7 | _target_: torch.optim.lr_scheduler.CyclicLR 8 | 9 | # the lowest lr in the cycle 10 | base_lr: 1e-5 11 | 12 | # the peak lr in the cycle 13 | max_lr: 0.005 14 | 15 | # number of steps to go from base_lr to max_lr 16 | step_size_up: 6000 17 | 18 | # number of steps to go from max+lr to base_lr 19 | step_size_down: 7000 20 | 21 | # Adam doesn't have `momentum` parameter, can only be true with SGD 22 | cycle_momentum: False 23 | 24 | # shape of line (triangular=linearly increasing/decreasing) 25 | mode: triangular 26 | 27 | # optional value to track which is fed into the step() call 28 | # only relevant for learning rate schedulers such 29 | # as `reduce on plateau` 30 | monitor: null 31 | 32 | # whether to step every epoch or every step 33 | interval: step 34 | 35 | # amount of epochs/steps between consecutive step() calls 36 | frequency: null 37 | 38 | # name to log the learning rate as 39 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/tri_stage.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.LambdaLR 4 | 5 | # A function which computes a multiplicative factor given an integer parameter 6 | lr_lambda: 7 | _target_: src.optim.schedule.tri_stage.TriStageLearningRateLambdaLRFunction 8 | max_steps: ${trainer.max_steps} 9 | warmup_stage_ratio: 0.1 10 | constant_stage_ratio: 0.4 11 | decay_stage_ratio: 0.5 12 | initial_lr: 5e-6 13 | base_lr: ${optim.algo.lr} 14 | final_lr: 5e-6 15 | 16 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 17 | last_epoch: -1 18 | 19 | # print to STDOUT when making a step 20 | verbose: false 21 | 22 | # optional value to track which is fed into the step() call 23 | # only relevant for learning rate schedulers such 24 | # as `reduce on plateau` 25 | monitor: null 26 | 27 | # whether to step every epoch or every step 28 | interval: step 29 | 30 | # amount of epochs/steps between consecutive step() calls 31 | frequency: null 32 | 33 | # name to log the learning rate as 34 | name: null -------------------------------------------------------------------------------- /config/predict.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - data/module: voxceleb1 3 | - data/pipeline: xvector_pipeline 4 | - data/shards: shards_voxceleb 5 | - data/dataloader: speaker 6 | - evaluator: cosine_distance 7 | - network: xvector 8 | - optim/loss: cross_entropy 9 | - tokenizer: default 10 | - trainer: trainer 11 | 12 | # root directory with subfolders containing the canonical dataset(s) 13 | data_folder: ${oc.env:DATA_FOLDER} 14 | 15 | # directory for temporary storage 16 | temp_folder: ${oc.env:TEMP_FOLDER} 17 | 18 | # directory for slurm and hydra logs 19 | log_folder: ${oc.env:LOG_FOLDER} 20 | 21 | # random seed used by the experiment 22 | seed: 42133724 23 | 24 | # verify model (run e.g summary and BatchGradientVerification) 25 | verify_model: False 26 | 27 | # whether to load the weight of the networks from a checkpoint 28 | load_network_from_checkpoint: null 29 | 30 | # number of gpus to use 31 | gpus: ${oc.decode:${oc.env:NUM_GPUS}} 32 | 33 | # experiment name 34 | experiment_name: predict_pairs 35 | 36 | # path to folder which contains all files which need to be predicted 37 | predict_folder_path: ${data_folder}/voxsrc2021_val/wav 38 | 39 | # path to text file containing pairs which need to be evaluated 40 | pair_prediction_path: ${data_folder}/voxsrc2021_val/voxsrc2021_val.txt 41 | 42 | # config variables for hydra 43 | hydra: 44 | run: 45 | # set root output directory 46 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 47 | sweep: 48 | # set root output directory 49 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 50 | subdir: ${experiment_name} -------------------------------------------------------------------------------- /config/profiler/advanced.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /trainer: null # override trainer to null so it's not loaded from main config defaults... 
5 | 6 | profiler: 7 | _target_: pytorch_lightning.profiler.AdvancedProfiler 8 | output_filename: advanced_profile.txt 9 | 10 | trainer: 11 | _target_: pytorch_lightning.Trainer 12 | 13 | # set `1` to train on GPU, `0` to train on CPU only 14 | gpus: ${gpus} 15 | 16 | # minimum number of epochs to train for 17 | min_epochs: 1 18 | 19 | # maximum number of epochs to train for 20 | max_epochs: 1 21 | 22 | # do not output a progress bar if rate = 0 23 | progress_bar_refresh_rate: 1 24 | 25 | # potentially limit the number of train batches - set to low value for debugging 26 | limit_train_batches: 200 27 | 28 | # amount of sanity validation steps to take before training starts 29 | num_sanity_val_steps: 0 30 | 31 | callbacks: 32 | to_add: 33 | - gpu_monitor -------------------------------------------------------------------------------- /config/profiler/simple.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | profiler: 4 | _target_: pytorch_lightning.profiler.SimpleProfiler 5 | output_filename: simple_profile.txt 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | 10 | # set `1` to train on GPU, `0` to train on CPU only 11 | gpus: ${gpus} 12 | 13 | # minimum number of epochs to train for 14 | min_epochs: 1 15 | 16 | # maximum number of epochs to train for 17 | max_epochs: 1 18 | 19 | # do not output a progress bar if rate = 0 20 | progress_bar_refresh_rate: 1 21 | 22 | # amount of sanity validation steps to take before training starts 23 | num_sanity_val_steps: 0 24 | 25 | callbacks: 26 | to_add: 27 | - gpu_monitor -------------------------------------------------------------------------------- /config/search/lr_and_aam_loss.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | - override /optim/loss: aam_softmax 8 | 9 | hydra: 10 | sweeper: 11 | sampler: 12 | seed: 123 13 | n_startup_trials: 48 14 | multivariate: true 15 | 16 | direction: minimize 17 | study_name: lr_and_schedule_search 18 | storage: null 19 | n_trials: 128 20 | n_jobs: 8 21 | 22 | search_space: 23 | optim.loss.margin: 24 | type: float 25 | low: 0 26 | high: 10 27 | optim.loss.scale: 28 | type: int 29 | low: 1 30 | high: 50 31 | optim.algo.lr: 32 | type: float 33 | low: 1e-8 34 | high: 1 35 | log: true 36 | optim.algo.weight_decay: 37 | type: categorical 38 | choices: 39 | - 0 40 | - 1e-12 41 | - 1e-11 42 | - 1e-10 43 | - 1e-9 44 | - 1e-8 45 | - 1e-7 46 | - 1e-6 47 | - 1e-5 48 | - 1e-4 49 | - 1e-3 50 | - 1e-2 51 | - 1e-1 52 | -------------------------------------------------------------------------------- /config/search/lr_and_pooling.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | 8 | hydra: 9 | sweeper: 10 | sampler: 11 | seed: 123 12 | n_startup_trials: 48 13 | multivariate: true 14 | 15 | direction: minimize 16 | study_name: lr_and_schedule_search 17 | storage: null 18 | n_trials: 128 19 | n_jobs: ${hydra.launcher.array_parallelism} 20 | 21 | search_space: 22 | network.stat_pooling_type: 23 | type: categorical 24 | choices: 25 | - mean 26 | - mean+std 27 | - attentive 28 | - max 29 | - quantile 30 | - first 31 | optim.algo.lr: 
32 | type: float 33 | low: 1e-8 34 | high: 1 35 | log: true 36 | optim.algo.weight_decay: 37 | type: categorical 38 | choices: 39 | - 0 40 | - 1e-12 41 | - 1e-11 42 | - 1e-10 43 | - 1e-9 44 | - 1e-8 45 | - 1e-7 46 | - 1e-6 47 | - 1e-5 48 | - 1e-4 49 | - 1e-3 50 | - 1e-2 51 | - 1e-1 52 | -------------------------------------------------------------------------------- /config/search/lr_and_schedule_search.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | 8 | hydra: 9 | sweeper: 10 | sampler: 11 | seed: 123 12 | n_startup_trials: 48 13 | multivariate: true 14 | 15 | direction: minimize 16 | study_name: lr_and_schedule_search 17 | storage: null 18 | n_trials: 128 19 | n_jobs: ${hydra.launcher.array_parallelism} 20 | 21 | search_space: 22 | optim/schedule: 23 | type: categorical 24 | choices: 25 | - tri_stage 26 | - one_cycle 27 | optim/algo: 28 | type: categorical 29 | choices: 30 | - sgd 31 | - adam 32 | optim.algo.lr: 33 | type: float 34 | low: 1e-8 35 | high: 1 36 | log: true 37 | optim.algo.weight_decay: 38 | type: categorical 39 | choices: 40 | - 0 41 | - 1e-12 42 | - 1e-11 43 | - 1e-10 44 | - 1e-9 45 | - 1e-8 46 | - 1e-7 47 | - 1e-6 48 | - 1e-5 49 | - 1e-4 50 | - 1e-3 51 | - 1e-2 52 | - 1e-1 53 | -------------------------------------------------------------------------------- /config/tokenizer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.tokenizer.tokenizer_wav2vec2.Wav2vec2TokenizerConfig 2 | 3 | tokenizer_huggingface_id: "facebook/wav2vec2-base-960h" 4 | -------------------------------------------------------------------------------- /config/train_eval.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - callbacks: speaker_default 4 | - data/module: voxceleb1 5 | - data/pipeline: xvector_pipeline 6 | - data/shards: shards_voxceleb 7 | - data/dataloader: speaker 8 | - evaluator: cosine_distance 9 | - network: xvector 10 | - tokenizer: default 11 | - optim/algo: adam 12 | - optim/schedule: constant 13 | - optim/loss: cross_entropy 14 | - trainer: trainer 15 | 16 | # setting a profiler changes the trainer to 1 epoch 17 | # in order to debug performance 18 | - profiler: null 19 | 20 | # root directory with subfolders containing the canonical dataset(s) 21 | data_folder: ${oc.env:DATA_FOLDER} 22 | 23 | # directory for temporary storage 24 | temp_folder: ${oc.env:TEMP_FOLDER} 25 | 26 | # directory for slurm and hydra logs 27 | log_folder: ${oc.env:LOG_FOLDER} 28 | 29 | # random seed used by the experiment 30 | seed: 42133724 31 | 32 | # whether to tune model 33 | tune_model: False 34 | tune_iterations: 1000 35 | 36 | # verify model (run e.g summary and BatchGradientVerification) 37 | verify_model: false 38 | 39 | # whether to fit model 40 | fit_model: True 41 | 42 | # whether to evaluate model 43 | eval_model: True 44 | 45 | # whether to load the weight of the networks from a checkpoint 46 | load_network_from_checkpoint: null 47 | 48 | # whether to log to comet-ml 49 | use_cometml: ${oc.decode:${oc.env:USE_COMET_ML}} 50 | 51 | # number of gpus to use 52 | gpus: ${oc.decode:${oc.env:NUM_GPUS}} 53 | 54 | # project name (useful for giving a name to log directories) 55 | project_name: general 56 | 57 | # experiment name 58 | # (:) indicates it needs to 
be resolved 59 | experiment_name: ${random_uuid:} 60 | 61 | # tag to add to the experiment dashboard for easy filtering 62 | # of certain experiment 63 | tag: ${now:%Y-%m-%d} 64 | 65 | # config variables for hydra 66 | hydra: 67 | run: 68 | # set root output directory 69 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 70 | sweep: 71 | # set root output directory 72 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 73 | subdir: ${experiment_name} -------------------------------------------------------------------------------- /config/trainer/debug_trainer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | trainer: 4 | _target_: pytorch_lightning.Trainer 5 | 6 | # set `1` to train on (1) GPU, `0` to train on CPU only 7 | gpus: ${gpus} 8 | 9 | # useful for debugging 10 | limit_train_batches: 10 11 | limit_val_batches: 0 12 | limit_test_batches: 0 13 | fast_dev_run: False 14 | 15 | max_epochs: 1000 16 | 17 | callbacks: 18 | to_add: 19 | - gpu_monitor 20 | 21 | data: 22 | module: 23 | limit_samples: 320 24 | shards: 25 | initial_fill_buffer_percentage: 0 26 | shard_shuffle_queue_size: 0 27 | pre_batch_shuffle_queue_size: 0 28 | dataloader: 29 | num_workers: 0 30 | train_batch_size: 32 -------------------------------------------------------------------------------- /config/trainer/trainer.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | # set `1` to train on GPU, `0` to train on CPU only 4 | gpus: ${gpus} 5 | 6 | # accelerator: 7 | # - null is 1 gpu training 8 | # - `ddp` is multi-gpu training 9 | accelerator: null 10 | 11 | # how many machines to use for multi-gpu training 12 | num_nodes: 1 13 | 14 | # minimum number of epochs to train for 15 | min_epochs: null 16 | 17 | # maximum number of epochs to train for 18 | max_epochs: null 19 | 20 | # minimum number of steps to train for 21 | min_steps: null 22 | 23 | # maximum number of steps to train for 24 | max_steps: 20000 25 | 26 | # due to training dataset having no length we need 27 | # to manually set the validation epoch interval 28 | val_check_interval: 1000 29 | 30 | # accumulating batches artificially increases 31 | # the batch size by doing multiple 32 | # forward steps before a single backward step 33 | accumulate_grad_batches: 1 # 1300 // 32 34 | 35 | # do not output a progress bar if rate = 0 36 | progress_bar_refresh_rate: 500 37 | 38 | # deterministic CUDA operations - true lead to ~20x decrease in speed :( 39 | deterministic: False 40 | 41 | # potentially limit the number of train batches - set to low value for debugging 42 | limit_train_batches: 1.0 43 | 44 | # potentially limit the number of val batches - set to low value for debugging 45 | limit_val_batches: 1.0 46 | 47 | # potentially limit the number of test batches - set to low value for debugging 48 | limit_test_batches: 1.0 49 | 50 | # fast dev run 51 | # set all three `limit_*_batches to `n` so only `n` batches are used. n=1 if 'true' 52 | fast_dev_run: false 53 | 54 | # either train with 16 (half), 32 (single) or 64 (double) bit precision 55 | precision: 32 56 | 57 | # amount of sanity validation steps to take before training starts 58 | num_sanity_val_steps: 2 59 | 60 | # whether to try auto learning rate finding (this does not actually train the 61 | # model, set tune_model:true, fit_model:false, eval_model:false in `main.yaml`. 
62 | # set this value to `auto_lr_find` to try it out 63 | auto_lr_find: False 64 | 65 | # apply clipping to the global gradient norm to avoid exploding 66 | # gradients. Default value of '0' means no clipping is applied 67 | gradient_clip_val: 0 -------------------------------------------------------------------------------- /convert_voxceleb2.sh: -------------------------------------------------------------------------------- 1 | source .env 2 | 3 | PDIR=$PWD # folder where this README is located 4 | D=$DATA_FOLDER # location of data - should be set in .env file 5 | WORKERS=$(nproc --all) # number of CPUs available 6 | 7 | # extract voxceleb 2 data 8 | cd "$D" || exit 9 | mkdir -p convert_tmp/train convert_tmp/test 10 | 11 | unzip voxceleb_archives/vox2_dev_aac.zip -d convert_tmp/train 12 | unzip voxceleb_archives/vox2_test_aac.zip -d convert_tmp/test 13 | 14 | # run the conversion script 15 | cd "$PDIR" || exit 16 | poetry run python preparation_scripts/voxceleb2_convert_to_wav.py "$D"/convert_tmp --num_workers "$WORKERS" 17 | 18 | # rezip the converted data 19 | cd "$D"/convert_tmp/train || exit 20 | zip "$D"/voxceleb_archives/vox2_dev_wav.zip wav -r 21 | 22 | cd "$D"/convert_tmp/test || exit 23 | zip "$D"/voxceleb_archives/vox2_test_wav.zip wav -r 24 | 25 | # delete the unzipped .m4a files 26 | cd "$D" || exit 27 | rm -r convert_tmp -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_ecapa_tdnn 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/lightning_logs/version_0/events.out.tfevents.1631794798.katara.82853.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/lightning_logs/version_0/events.out.tfevents.1631794798.katara.82853.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 
14:19:57,772][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 14:19:58,541][src.main][INFO] - Using callback <> 3 | [2021-09-16 14:19:58,542][src.main][INFO] - Using callback <> 4 | [2021-09-16 14:19:58,543][src.main][INFO] - Using callback <> 5 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 6 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 7 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 8 | [2021-09-16 14:19:58,651][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 9 | [2021-09-16 14:20:00,832][pytorch_lightning.core.lightning][INFO] - 10 | | Name | Type | Params 11 | ------------------------------------------------------- 12 | 0 | loss_fn | CrossEntropyLoss | 0 13 | 1 | metric_train_acc | Accuracy | 0 14 | 2 | metric_train_loss | AverageMeter | 0 15 | 3 | metric_valid_acc | Accuracy | 0 16 | 4 | feature_extractor | ECAPA_TDNN | 20.8 M 17 | 5 | classifier | Classifier | 232 K 18 | ------------------------------------------------------- 19 | 21.0 M Trainable params 20 | 0 Non-trainable params 21 | 21.0 M Total params 22 | 84.000 Total estimated model params size (MB) 23 | [2021-09-16 14:20:05,425][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 24 | [2021-09-16 14:35:24,725][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-19-55/e77c80b93db94c5d92741f5b1cd3c351/lr_find_temp_model.ckpt 25 | [2021-09-16 14:35:24,810][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-19-55/e77c80b93db94c5d92741f5b1cd3c351/lr_find_temp_model.ckpt 26 | [2021-09-16 14:35:24,816][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.8887917198848208 27 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/grid_search_results.csv: -------------------------------------------------------------------------------- 1 | learning rate,network,eer 2 | 0.00001,wav2vec2-sv-aam,0.02605135925 3 | 0.00005,wav2vec2-sv-aam,0.02063610218 4 | 0.00009,wav2vec2-sv-aam,0.0221702382 5 | 0.0001,wav2vec2-sv-aam,0.02154026181 6 | 0.0002,wav2vec2-sv-aam,0.03840016946 7 | 0.0005,wav2vec2-sv-aam,0.5026879907 8 | 0.001,wav2vec2-sv-aam,0.5 9 | 0.00001,wav2vec2-sv-ce,0.03126163036 10 | 0.00005,wav2vec2-sv-ce,0.02185124159 11 | 0.00009,wav2vec2-sv-ce,0.02090203203 12 | 0.0001,wav2vec2-sv-ce,0.02180619165 13 | 0.0002,wav2vec2-sv-ce,0.04387830943 14 | 0.0005,wav2vec2-sv-ce,0.5 15 | 0.001,wav2vec2-sv-ce,0.5 16 | 0.00003,wav2vec2-sv-bce,0.07767558098 17 | 0.00002,wav2vec2-sv-bce,0.08068290353 18 | 0.00004,wav2vec2-sv-bce,0.08117688447 19 | 0.00001,wav2vec2-sv-bce,0.0848563239 20 | 0.000009,wav2vec2-sv-bce,0.08623531461 21 | 0.000007,wav2vec2-sv-bce,0.08908626437 22 | 0.000005,wav2vec2-sv-bce,0.09471081942 23 | 0.0001,wav2vec2-sv-bce,0.5001855493 24 | 0.00001,wav2vec2-sv-ctc,0.2 25 | 0.00005,wav2vec2-sv-ctc,0.5 26 | 0.0001,wav2vec2-sv-ctc,0.5 27 | 0.0002,wav2vec2-sv-ctc,0.5 28 | 0.0003,wav2vec2-sv-ctc,0.5 29 | 0.0004,wav2vec2-sv-ctc,0.5 30 | 0.0005,wav2vec2-sv-ctc,0.5 31 | 0.001,ecapa,0.1084459126 32 | 0.005,ecapa,0.1220160574 33 | 
0.0009,ecapa,0.113722153 34 | 0.0007,ecapa,0.1097346991 35 | 0.000005,ecapa,0.1525865346 36 | 0.0001,ecapa,0.1180286035 37 | 0.00001,ecapa,0.1362621039 38 | 0.0005,ecapa,0.1127120033 39 | 0.0004,xvector,0.09578768164 40 | 0.0003,xvector,0.09685140103 41 | 0.0008,xvector,0.09695777297 42 | 0.001,xvector,0.09717051685 43 | 0.0002,xvector,0.09988299012 44 | 0.0001,xvector,0.1024509519 45 | 0.00006,xvector,0.1059599146 46 | 0.00001,xvector,0.1311030686 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_auto_lr.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import path 3 | import pathlib 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | # import data 9 | data_path_ce = pathlib.Path( 10 | "wav2vec2-sv-ce/23fb5940c4c94ab39ff4ab74c3852857/lr_find_20210907-215014.json" 11 | ) 12 | data_path_aam = pathlib.Path( 13 | "wav2vec2-sv-aam/06c91df465da4d55bed874caf6fa1da5/lr_find_20210907-221822.json" 14 | ) 15 | data_path_ctc = pathlib.Path() 16 | data_path_bce = pathlib.Path("wav2vec2-sv-bce/65f16f5c0860494187135a30e48097c7/lr_find_20210908-171251.json") 17 | 18 | data_path = data_path_bce 19 | with data_path.open("r") as f: 20 | data = json.load(f) 21 | 22 | x_loss = data["data"]["lr"] 23 | y_loss = data["data"]["loss"] 24 | 25 | # draw graph 26 | fig = plt.figure() 27 | ax1 = fig.add_subplot(1, 1, 1) 28 | 29 | # line plot of learning rate versus loss 30 | (loss_line,) = ax1.plot(x_loss, y_loss, "C1") 31 | 32 | ax1.set_xscale("log") 33 | ax1.set_xlabel("learning rate") 34 | ax1.set_ylabel("loss") 35 | ax1.set_ylim(0.4, 0.9) 36 | 37 | plt.show() 38 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_eer_and_lr_find.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | network_name = "xvector" 8 | zoomed = True 9 | zoom_min = 0 10 | zoom_max = 0.15 11 | 12 | # import data 13 | df = pd.read_csv("grid_search_results.csv", sep=",") 14 | df = df.loc[df['network'] == network_name] 15 | x_eer = df["learning rate"].tolist() 16 | y_eer = df["eer"].tolist() 17 | 18 | data_path = pathlib.Path( 19 | f"{network_name}/data.json" 20 | ) 21 | 22 | with data_path.open("r") as f: 23 | data = json.load(f) 24 | 25 | x_loss = data["data"]["lr"] 26 | y_loss = data["data"]["loss"] 27 | 28 | # draw graph 29 | fig = plt.figure() 30 | ax1 = fig.add_subplot(1, 1, 1) 31 | 32 | # line plot of learning rate versus loss 33 | loss_line, = ax1.plot( 34 | x_loss, 35 | y_loss, 36 | "C1" 37 | ) 38 | 39 | ax1.set_xscale("log") 40 | ax1.set_xlabel("learning rate") 41 | ax1.set_ylabel("loss") 42 | 43 | # scatter plot of EER result at certain LR values 44 | ax2 = plt.twinx() 45 | 46 | eer_scatter = ax2.scatter(x=x_eer, y=y_eer, marker="x") 47 | 48 | ax2.set_ylabel("EER") 49 | if zoomed: 50 | ax2.set_ylim(zoom_min, zoom_max) 51 | else: 52 | ax2.set_ylim(0, 0.6) 53 | 54 | plt.legend([loss_line, eer_scatter], ["loss", "EER"], loc=2) 55 | plt.suptitle(network_name) 56 | 57 | plt.savefig(f'{network_name}/plot_lr_eer{"_zoomed" if zoomed else ""}.png') 58 | plt.show() 59 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_eer_and_lr_find_broken.py: -------------------------------------------------------------------------------- 1 | import 
json 2 | import pathlib 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | # import data 8 | df = pd.read_csv("grid_search_results.csv", sep=";") 9 | x_eer = df["learning rate"].tolist() 10 | y_eer = df["eer"].tolist() 11 | 12 | data_path = pathlib.Path( 13 | "wav2vec2-sv-ce/23fb5940c4c94ab39ff4ab74c3852857/lr_find_20210907-215014.json" 14 | ) 15 | 16 | with data_path.open("r") as f: 17 | data = json.load(f) 18 | 19 | x_loss = data["data"]["lr"] 20 | y_loss = data["data"]["loss"] 21 | 22 | # draw graph 23 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True) 24 | 25 | # scatter plot of EER result at certain LR values 26 | eer_scatter_down = ax1.scatter(x=x_eer, y=y_eer, marker="x") 27 | eer_scatter_up = ax2.scatter(x=x_eer, y=y_eer, marker="x") 28 | 29 | ax1.set_ylabel("EER") 30 | ax2.set_ylabel("EER") 31 | 32 | ax1.set_ylim(0.45, 0.55) 33 | ax2.set_ylim(0, 0.07) 34 | 35 | ax1.set_xscale("log") 36 | ax2.set_xscale("log") 37 | 38 | # line plot of learning rate versus loss 39 | (loss_line,) = ax3.plot(x_loss, y_loss, "C1") 40 | 41 | ax3.set_xscale("log") 42 | ax3.set_xlabel("learning rate") 43 | ax3.set_ylabel("loss") 44 | ax3.set_xlim(1e-6, 5e-2) 45 | 46 | # hide the spines between ax1 and ax2, and ax2 and ax3 47 | ax1.spines.bottom.set_visible(False) 48 | ax1.xaxis.tick_top() 49 | ax1.tick_params(labeltop=False, labelbottom=False) 50 | 51 | ax2.spines.top.set_visible(False) 52 | ax2.spines.bottom.set_visible(False) 53 | ax2.tick_params(bottom=False, top=False, labeltop=False, labelbottom=False) 54 | ax2.xaxis.set_visible(False) 55 | 56 | ax3.spines.top.set_visible(False) 57 | ax3.xaxis.tick_bottom() 58 | ax3.tick_params(labeltop=False) 59 | 60 | # add discontinuity 61 | d = 0.5 # proportion of vertical to horizontal extent of the slanted line 62 | kwargs = dict( 63 | marker=[(-1, -d), (1, d)], 64 | markersize=12, 65 | linestyle="none", 66 | color="k", 67 | mec="k", 68 | mew=1, 69 | clip_on=False, 70 | ) 71 | ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) 72 | ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) 73 | 74 | # create legend 75 | plt.legend([eer_scatter_down, loss_line], ["EER", "loss"], loc=2) 76 | 77 | # show plot 78 | plt.show() 79 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ce 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | - optim/loss=aam_softmax 7 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/lightning_logs/version_0/events.out.tfevents.1631044502.katara.6664.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/lightning_logs/version_0/events.out.tfevents.1631044502.katara.6664.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot.png 
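The grid_search_results.csv above records, for every manually tried learning rate, the network variant and the EER it reached, and the plot_* scripts overlay these points on the loss curve produced by the automatic LR range test. A minimal sketch (not part of the repository) for summarising that grid search, assuming only the comma-separated header shown above (learning rate,network,eer), is to pick the lowest-EER row per network with pandas:

    import pandas as pd

    # load the grid search results and keep the best learning rate per network
    df = pd.read_csv("grid_search_results.csv", sep=",")
    best = df.loc[df.groupby("network")["eer"].idxmin()]
    print(best.sort_values("eer").to_string(index=False))

For the values listed above this would, for example, report 0.00005 as the best grid point for wav2vec2-sv-aam and 0.00009 for wav2vec2-sv-ce.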
-------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-07 21:55:00,574][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-07 21:55:02,628][src.main][INFO] - Using callback <> 3 | [2021-09-07 21:55:02,628][src.main][INFO] - Using callback <> 4 | [2021-09-07 21:55:02,631][src.main][INFO] - Using callback <> 5 | [2021-09-07 21:55:02,636][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-07 21:55:03,125][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-07 21:55:05,880][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ----------------------------------------------------------------------- 13 | 0 | loss_fn | AngularAdditiveMarginSoftMaxLoss | 9.2 M 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | MeanStdStatPool1D | 0 20 | 7 | fc_list | ModuleList | 0 21 | ----------------------------------------------------------------------- 22 | 103 M Trainable params 23 | 0 Non-trainable params 24 | 103 M Total params 25 | 414.314 Total estimated model params size (MB) 26 | [2021-09-07 21:55:10,051][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 27 | [2021-09-07 22:18:22,208][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-54-58/06c91df465da4d55bed874caf6fa1da5/lr_find_temp_model.ckpt 28 | [2021-09-07 22:18:22,620][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-54-58/06c91df465da4d55bed874caf6fa1da5/lr_find_temp_model.ckpt 29 | [2021-09-07 22:18:22,666][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 8.379150674384097e-05 30 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/.hydra/overrides.yaml: 
-------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_pairs 2 | - tune_model=True 3 | - data/module=voxceleb1_pairs 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | - data.dataloader.train_batch_size=8 7 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/lightning_logs/version_0/events.out.tfevents.1631113238.katara.16035.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/lightning_logs/version_0/events.out.tfevents.1631113238.katara.16035.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-08 17:00:36,262][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-08 17:00:38,382][src.main][INFO] - Using callback <> 3 | [2021-09-08 17:00:38,382][src.main][INFO] - Using callback <> 4 | [2021-09-08 17:00:38,383][src.main][INFO] - Using callback <> 5 | [2021-09-08 17:00:38,388][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 
6 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-08 17:00:38,830][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-08 17:00:41,475][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------- 13 | 0 | loss_fn | BinaryCrossEntropyLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | linear | Linear | 769 19 | ------------------------------------------------------------- 20 | 94.4 M Trainable params 21 | 0 Non-trainable params 22 | 94.4 M Total params 23 | 377.490 Total estimated model params size (MB) 24 | [2021-09-08 17:01:02,988][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 25 | [2021-09-08 17:12:50,906][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 4155 steps due to diverging loss. 26 | [2021-09-08 17:12:50,930][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-08_17-00-34/65f16f5c0860494187135a30e48097c7/lr_find_temp_model.ckpt 27 | [2021-09-08 17:12:51,232][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-08_17-00-34/65f16f5c0860494187135a30e48097c7/lr_find_temp_model.ckpt 28 | [2021-09-08 17:12:51,293][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.04429961991003636 29 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ce 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/lightning_logs/version_0/events.out.tfevents.1631043151.katara.6259.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/lightning_logs/version_0/events.out.tfevents.1631043151.katara.6259.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-07 21:32:29,938][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-07 21:32:31,811][src.main][INFO] - Using callback <> 3 | [2021-09-07 21:32:31,812][src.main][INFO] - Using callback <> 4 | [2021-09-07 21:32:31,813][src.main][INFO] - Using callback <> 5 | [2021-09-07 21:32:31,818][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-07 21:32:32,212][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-07 21:32:34,753][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------ 13 | 0 | loss_fn | CrossEntropyLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | MeanStdStatPool1D | 0 20 | 7 | fc_list | ModuleList | 1.9 M 21 | ------------------------------------------------------------ 22 | 96.2 M Trainable params 23 | 0 Non-trainable params 24 | 96.2 M Total params 25 | 384.932 Total estimated model params size (MB) 26 | [2021-09-07 21:32:38,530][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 27 | [2021-09-07 21:50:13,768][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 3950 steps due to diverging loss. 
28 | [2021-09-07 21:50:13,797][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-32-28/23fb5940c4c94ab39ff4ab74c3852857/lr_find_temp_model.ckpt 29 | [2021-09-07 21:50:14,118][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-32-28/23fb5940c4c94ab39ff4ab74c3852857/lr_find_temp_model.ckpt 30 | [2021-09-07 21:50:14,155][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.00016811249744769598 31 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 4 | sweep: 5 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 6 | subdir: ${experiment_name} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | help: 13 | app_name: ${hydra.job.name} 14 | header: '${hydra.help.app_name} is powered by Hydra. 15 | 16 | ' 17 | footer: 'Powered by Hydra (https://hydra.cc) 18 | 19 | Use --hydra-help to view Hydra specific help 20 | 21 | ' 22 | template: '${hydra.help.header} 23 | 24 | == Configuration groups == 25 | 26 | Compose your configuration from those groups (group=option) 27 | 28 | 29 | $APP_CONFIG_GROUPS 30 | 31 | 32 | == Config == 33 | 34 | Override anything in the config (foo.bar=value) 35 | 36 | 37 | $CONFIG 38 | 39 | 40 | ${hydra.help.footer} 41 | 42 | ' 43 | hydra_help: 44 | template: 'Hydra (${hydra.runtime.version}) 45 | 46 | See https://hydra.cc for more info. 47 | 48 | 49 | == Flags == 50 | 51 | $FLAGS_HELP 52 | 53 | 54 | == Configuration groups == 55 | 56 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 57 | to command line) 58 | 59 | 60 | $HYDRA_CONFIG_GROUPS 61 | 62 | 63 | Use ''--cfg hydra'' to Show the Hydra config. 64 | 65 | ' 66 | hydra_help: ??? 
67 | hydra_logging: 68 | version: 1 69 | formatters: 70 | simple: 71 | format: '[%(asctime)s][HYDRA] %(message)s' 72 | handlers: 73 | console: 74 | class: logging.StreamHandler 75 | formatter: simple 76 | stream: ext://sys.stdout 77 | root: 78 | level: INFO 79 | handlers: 80 | - console 81 | loggers: 82 | logging_example: 83 | level: DEBUG 84 | disable_existing_loggers: false 85 | job_logging: 86 | version: 1 87 | formatters: 88 | simple: 89 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 90 | handlers: 91 | console: 92 | class: logging.StreamHandler 93 | formatter: simple 94 | stream: ext://sys.stdout 95 | file: 96 | class: logging.FileHandler 97 | formatter: simple 98 | filename: ${hydra.job.name}.log 99 | root: 100 | level: INFO 101 | handlers: 102 | - console 103 | - file 104 | disable_existing_loggers: false 105 | env: {} 106 | searchpath: [] 107 | callbacks: {} 108 | output_subdir: .hydra 109 | overrides: 110 | hydra: [] 111 | task: 112 | - +experiment=speaker_wav2vec2_ctc 113 | - tune_model=True 114 | - data/module=voxceleb1 115 | - trainer.auto_lr_find=auto_lr_find 116 | - tune_iterations=5000 117 | job: 118 | name: run 119 | override_dirname: +experiment=speaker_wav2vec2_ctc,data/module=voxceleb1,trainer.auto_lr_find=auto_lr_find,tune_iterations=5000,tune_model=True 120 | id: ??? 121 | num: ??? 122 | config_name: train_eval 123 | env_set: {} 124 | env_copy: [] 125 | config: 126 | override_dirname: 127 | kv_sep: '=' 128 | item_sep: ',' 129 | exclude_keys: [] 130 | runtime: 131 | version: 1.1.1 132 | cwd: /home/nik/workspace/phd/repos/wav2vec_speaker_identification 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/nik/workspace/phd/repos/wav2vec_speaker_identification/config 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | choices: 144 | experiment: speaker_wav2vec2_ctc 145 | profiler: null 146 | trainer: trainer 147 | optim/loss: ctc 148 | optim/schedule: one_cycle 149 | optim/algo: adam 150 | tokenizer: default 151 | network: wav2vec2_fc 152 | evaluator: cosine_distance 153 | data/dataloader: speaker 154 | data/shards: shards_voxceleb 155 | data/pipeline: wav2vec_base_pipeline 156 | data/module: voxceleb1 157 | callbacks: speaker_default 158 | hydra/env: default 159 | hydra/callbacks: null 160 | hydra/job_logging: default 161 | hydra/hydra_logging: default 162 | hydra/hydra_help: default 163 | hydra/help: default 164 | hydra/sweeper: basic 165 | hydra/launcher: basic 166 | hydra/output: default 167 | verbose: false 168 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ctc 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/lightning_logs/version_0/events.out.tfevents.1631793388.katara.71473.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/lightning_logs/version_0/events.out.tfevents.1631793388.katara.71473.0 
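Each run directory shown here keeps the Hydra overrides that produced it in .hydra/overrides.yaml, while .hydra/hydra.yaml records the resolved launcher, logging and config-group choices. A small sketch (not part of the repository; it assumes PyYAML is installed and that it is started from the auto_lr_find results directory) to reconstruct the original tuning commands from those files:

    import pathlib

    import yaml

    # each subdirectory holds one tuning run with its recorded Hydra overrides
    for override_file in sorted(pathlib.Path(".").glob("*/.hydra/overrides.yaml")):
        run_name = override_file.parent.parent.name
        overrides = yaml.safe_load(override_file.read_text())
        print(run_name, "->", "python run.py " + " ".join(overrides))

For the wav2vec2-sv-ctc entry above this prints the command python run.py +experiment=speaker_wav2vec2_ctc tune_model=True data/module=voxceleb1 trainer.auto_lr_find=auto_lr_find tune_iterations=5000.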
-------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 13:56:26,445][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 13:56:28,219][src.main][INFO] - Using callback <> 3 | [2021-09-16 13:56:28,219][src.main][INFO] - Using callback <> 4 | [2021-09-16 13:56:28,220][src.main][INFO] - Using callback <> 5 | [2021-09-16 13:56:28,225][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-16 13:56:28,615][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-16 13:56:30,795][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------ 13 | 0 | loss_fn | CtcLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | NoPooling | 0 20 | 7 | test_stat_pooling | MeanStdStatPool1D | 0 21 | 8 | fc_list | ModuleList | 932 K 22 | ------------------------------------------------------------ 23 | 95.3 M Trainable params 24 | 0 Non-trainable params 25 | 95.3 M Total params 26 | 381.215 Total estimated model params size (MB) 27 | [2021-09-16 13:56:34,302][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 28 | [2021-09-16 14:12:17,607][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 3379 steps due to diverging loss. 
29 | [2021-09-16 14:12:17,627][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_13-56-24/05cf7a78d19f46128b4c2d4fadf3eaec/lr_find_temp_model.ckpt 30 | [2021-09-16 14:12:17,947][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_13-56-24/05cf7a78d19f46128b4c2d4fadf3eaec/lr_find_temp_model.ckpt 31 | [2021-09-16 14:12:17,980][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.00029977816715823815 32 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 4 | sweep: 5 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 6 | subdir: ${experiment_name} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | help: 13 | app_name: ${hydra.job.name} 14 | header: '${hydra.help.app_name} is powered by Hydra. 15 | 16 | ' 17 | footer: 'Powered by Hydra (https://hydra.cc) 18 | 19 | Use --hydra-help to view Hydra specific help 20 | 21 | ' 22 | template: '${hydra.help.header} 23 | 24 | == Configuration groups == 25 | 26 | Compose your configuration from those groups (group=option) 27 | 28 | 29 | $APP_CONFIG_GROUPS 30 | 31 | 32 | == Config == 33 | 34 | Override anything in the config (foo.bar=value) 35 | 36 | 37 | $CONFIG 38 | 39 | 40 | ${hydra.help.footer} 41 | 42 | ' 43 | hydra_help: 44 | template: 'Hydra (${hydra.runtime.version}) 45 | 46 | See https://hydra.cc for more info. 47 | 48 | 49 | == Flags == 50 | 51 | $FLAGS_HELP 52 | 53 | 54 | == Configuration groups == 55 | 56 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 57 | to command line) 58 | 59 | 60 | $HYDRA_CONFIG_GROUPS 61 | 62 | 63 | Use ''--cfg hydra'' to Show the Hydra config. 64 | 65 | ' 66 | hydra_help: ??? 
67 | hydra_logging: 68 | version: 1 69 | formatters: 70 | simple: 71 | format: '[%(asctime)s][HYDRA] %(message)s' 72 | handlers: 73 | console: 74 | class: logging.StreamHandler 75 | formatter: simple 76 | stream: ext://sys.stdout 77 | root: 78 | level: INFO 79 | handlers: 80 | - console 81 | loggers: 82 | logging_example: 83 | level: DEBUG 84 | disable_existing_loggers: false 85 | job_logging: 86 | version: 1 87 | formatters: 88 | simple: 89 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 90 | handlers: 91 | console: 92 | class: logging.StreamHandler 93 | formatter: simple 94 | stream: ext://sys.stdout 95 | file: 96 | class: logging.FileHandler 97 | formatter: simple 98 | filename: ${hydra.job.name}.log 99 | root: 100 | level: INFO 101 | handlers: 102 | - console 103 | - file 104 | disable_existing_loggers: false 105 | env: {} 106 | searchpath: [] 107 | callbacks: {} 108 | output_subdir: .hydra 109 | overrides: 110 | hydra: [] 111 | task: 112 | - +experiment=speaker_xvector 113 | - tune_model=True 114 | - data/module=voxceleb1 115 | - trainer.auto_lr_find=auto_lr_find 116 | - tune_iterations=5000 117 | job: 118 | name: run 119 | override_dirname: +experiment=speaker_xvector,data/module=voxceleb1,trainer.auto_lr_find=auto_lr_find,tune_iterations=5000,tune_model=True 120 | id: ??? 121 | num: ??? 122 | config_name: train_eval 123 | env_set: {} 124 | env_copy: [] 125 | config: 126 | override_dirname: 127 | kv_sep: '=' 128 | item_sep: ',' 129 | exclude_keys: [] 130 | runtime: 131 | version: 1.1.1 132 | cwd: /home/nik/workspace/phd/repos/wav2vec_speaker_identification 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/nik/workspace/phd/repos/wav2vec_speaker_identification/config 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | choices: 144 | experiment: speaker_xvector 145 | profiler: null 146 | trainer: trainer 147 | optim/loss: cross_entropy 148 | optim/schedule: one_cycle 149 | optim/algo: adam 150 | tokenizer: default 151 | network: xvector 152 | evaluator: cosine_distance_with_train_data 153 | data/dataloader: speaker 154 | data/shards: shards_voxceleb 155 | data/pipeline: xvector_pipeline 156 | data/module: voxceleb1 157 | callbacks: speaker_default 158 | hydra/env: default 159 | hydra/callbacks: null 160 | hydra/job_logging: default 161 | hydra/hydra_logging: default 162 | hydra/hydra_help: default 163 | hydra/help: default 164 | hydra/sweeper: basic 165 | hydra/launcher: basic 166 | hydra/output: default 167 | verbose: false 168 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_xvector 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/lightning_logs/version_0/events.out.tfevents.1631794594.katara.80664.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/lightning_logs/version_0/events.out.tfevents.1631794594.katara.80664.0 -------------------------------------------------------------------------------- 
/paper_results/auto_lr_find/xvector/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 14:16:33,601][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 14:16:34,332][src.main][INFO] - Using callback <> 3 | [2021-09-16 14:16:34,332][src.main][INFO] - Using callback <> 4 | [2021-09-16 14:16:34,333][src.main][INFO] - Using callback <> 5 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 6 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 7 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 8 | [2021-09-16 14:16:34,366][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 9 | [2021-09-16 14:16:36,535][pytorch_lightning.core.lightning][INFO] - 10 | | Name | Type | Params 11 | ------------------------------------------------------- 12 | 0 | loss_fn | CrossEntropyLoss | 0 13 | 1 | metric_train_acc | Accuracy | 0 14 | 2 | metric_train_loss | AverageMeter | 0 15 | 3 | metric_valid_acc | Accuracy | 0 16 | 4 | feature_extractor | Xvector | 4.3 M 17 | 5 | classifier | Classifier | 885 K 18 | ------------------------------------------------------- 19 | 5.1 M Trainable params 20 | 0 Non-trainable params 21 | 5.1 M Total params 22 | 20.554 Total estimated model params size (MB) 23 | [2021-09-16 14:16:40,831][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 24 | [2021-09-16 14:18:51,476][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 4418 steps due to diverging loss. 
25 | [2021-09-16 14:18:51,479][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-16-31/f47df09ca90443d3b2dd5ffffcc8d60c/lr_find_temp_model.ckpt 26 | [2021-09-16 14:18:51,499][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-16-31/f47df09ca90443d3b2dd5ffffcc8d60c/lr_find_temp_model.ckpt 27 | [2021-09-16 14:18:51,502][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.0021281390459827135 28 | -------------------------------------------------------------------------------- /paper_results/run_tests_pool.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from collections import defaultdict 4 | 5 | root_folder = "/home/nvaessen/data/transfer/paper_n4/aam/ablation/" 6 | experiment_name = "speaker_wav2vec2_aam" 7 | tag_prefix = "ablation" 8 | test_run = False 9 | test_sets_to_use = [ 10 | # "voxceleb2", 11 | "voxceleb2_test_everyone", 12 | # "voxceleb2_test_hard" 13 | ] 14 | 15 | postfix_map = { 16 | "voxceleb2": "o", 17 | "voxceleb2_test_hard": "h", 18 | "voxceleb2_test_everyone": "e", 19 | } 20 | 21 | 22 | path_dict = defaultdict(set) 23 | for ckpt in sorted(pathlib.Path(root_folder).glob("*.ckpt")): 24 | first_underscore = ckpt.stem.find("_") 25 | first_dot = ckpt.stem.find(".") 26 | 27 | ablation_name = ckpt.stem[first_underscore+1:first_dot] 28 | 29 | path_dict[ablation_name].add(ckpt.absolute()) 30 | 31 | assert len(path_dict) == 10 32 | for key, v in path_dict.items(): 33 | assert len(v) == 3 34 | 35 | for test_set in test_sets_to_use: 36 | for ablation_name, ckpt_set in path_dict.items(): 37 | for ckpt in ckpt_set: 38 | command_template = ( 39 | f"python run.py -m +experiment={experiment_name} " 40 | f"data/module={test_set} " 41 | f"fit_model=False " 42 | f"network.stat_pooling_type=first+cls " 43 | f"tag={ablation_name}_eval_{postfix_map[test_set]} " 44 | f"load_network_from_checkpoint={ckpt} " 45 | f"network.explicit_num_speakers=5994 " 46 | f"hydra/launcher=slurm " 47 | ) 48 | 49 | print(f"{command_template} & ") 50 | 51 | if test_run: 52 | exit() 53 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This run script encapsulates making predictions with a particular network 4 | # on data without labels. 
5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import hydra 10 | 11 | from dotenv import load_dotenv 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | from src.hydra_resolvers import ( 15 | division_resolver, 16 | integer_division_resolver, 17 | random_uuid, 18 | ) 19 | 20 | ################################################################################ 21 | # set custom resolvers 22 | 23 | OmegaConf.register_new_resolver("divide", division_resolver) 24 | OmegaConf.register_new_resolver("idivide", integer_division_resolver) 25 | OmegaConf.register_new_resolver("random_uuid", random_uuid) 26 | 27 | ################################################################################ 28 | # wrap around main hydra script 29 | 30 | 31 | @hydra.main(config_path="config", config_name="predict") 32 | def run(cfg: DictConfig): 33 | # we import here such that tab-completion in bash 34 | # does not need to import everything (which slows it down 35 | # significantly) 36 | from src.main import run_predictions 37 | 38 | return run_predictions(cfg) 39 | 40 | 41 | ################################################################################ 42 | # execute hydra application 43 | 44 | if __name__ == "__main__": 45 | load_dotenv() 46 | run() 47 | 48 | -------------------------------------------------------------------------------- /preparation_scripts/download_and_prepare_rirs.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER" 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.openslr.org/resources/28/rirs_noises.zip --output "$DIR"/rirs_noises.zip 12 | 13 | # extract file and remove zip 14 | cd "$DIR" 15 | unzip rirs_noises.zip -d "$DIR" 16 | rm rirs_noises.zip 17 | 18 | # create tar for webdataset compatability 19 | mkdir -p "$DIR"/rirs_shards/ 20 | tar --sort=name -cf rirs_shards/pointsource_noises.tar RIRS_NOISES/pointsource_noises 21 | tar --sort=name -cf rirs_shards/real_rirs_isotropic_noises.tar RIRS_NOISES/real_rirs_isotropic_noises 22 | tar --sort=name -cf rirs_shards/simulated_rirs.tar RIRS_NOISES/simulated_rirs 23 | 24 | # remove extracted dir 25 | rm -r "$DIR"/RIRS_NOISES -------------------------------------------------------------------------------- /preparation_scripts/download_librispeech.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER"/librispeech 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.openslr.org/resources/12/dev-clean.tar.gz --output "$DIR"/dev-clean.tar.gz 12 | curl -C - https://www.openslr.org/resources/12/dev-other.tar.gz --output "$DIR"/dev-other.tar.gz 13 | curl -C - https://www.openslr.org/resources/12/test-clean.tar.gz --output "$DIR"/test-clean.tar.gz 14 | curl -C - https://www.openslr.org/resources/12/test-other.tar.gz --output "$DIR"/test-other.tar.gz 15 | curl -C - https://www.openslr.org/resources/12/train-clean-100.tar.gz --output "$DIR"/train-clean-100.tar.gz 16 | curl -C - https://www.openslr.org/resources/12/train-clean-360.tar.gz --output "$DIR"/train-clean-360.tar.gz 17 | curl -C - 
https://www.openslr.org/resources/12/train-other-500.tar.gz --output "$DIR"/train-other-500.tar.gz 18 | -------------------------------------------------------------------------------- /preparation_scripts/download_pretrained_models.sh: -------------------------------------------------------------------------------- 1 | ### set environment variables 2 | source ../.env 2> /dev/null || source .env 3 | 4 | ### create folder to store models in 5 | PRETRAINED="$DATA_FOLDER"/pretrained_models/wav2vec 6 | mkdir -p "$PRETRAINED" 7 | 8 | ### download pretrained models 9 | 10 | # wav2vec1 large 11 | echo "wav2vec1 - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt" 12 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt --output "$PRETRAINED"/wav2vec_large.pt 13 | 14 | # wav2vec2 small - no ft 15 | echo "# wav2vec2 small - no ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt" 16 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt --output "$PRETRAINED"/wav2vec2_small_noft.pt 17 | 18 | # wav2vec2 small - 10 minutes 19 | echo "# wav2vec2 small - 10m ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_10m.pt" 20 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_10m.pt --output "$PRETRAINED"/wav2vec2_small_ft10m.pt 21 | 22 | # wav2vec2 small - 100 hours 23 | echo "# wav2vec2 small - 100h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_100h.pt" 24 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_100h.pt --output "$PRETRAINED"/wav2vec2_small_ft100h.pt 25 | 26 | # wav2vec2 small - 960h ft 27 | echo "wav2vec2 small - 960h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt" 28 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_960h.pt --output "$PRETRAINED"/wav2vec2_small_ft960h.pt 29 | 30 | # wav2vec2 large - no ft 31 | echo "wav2vec2 large - no ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt" 32 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt --output "$PRETRAINED"/wav2vec2_large_noft.pt 33 | 34 | # wav2vec2 large - 10 minutes 35 | echo "# wav2vec2 base - 10m ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt" 36 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt --output "$PRETRAINED"/wav2vec2_large_10m.pt 37 | 38 | # wav2vec2 large - 100 hours 39 | echo "# wav2vec2 large - 100h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt" 40 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt --output "$PRETRAINED"/wav2vec2_large_100h.pt 41 | 42 | # wav2vec2 large - 960 ft 43 | echo "wav2vec2 large - 960h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt" 44 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt --output "$PRETRAINED"/wav2vec2_large_960h.pt -------------------------------------------------------------------------------- /preparation_scripts/download_voxceleb_meta.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER"/voxceleb_meta 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt --output "$DIR"/iden_split.txt 12 | curl -C - 
https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt --output "$DIR"/veri_test.txt 13 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt --output "$DIR"/veri_test2.txt 14 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard.txt --output "$DIR"/list_test_hard.txt 15 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt --output "$DIR"/list_test_hard2.txt 16 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all.txt --output "$DIR"/list_test_all.txt 17 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt --output "$DIR"/list_test_all2.txt 18 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox1_meta.csv --output "$DIR"/vox1_meta.csv 19 | -------------------------------------------------------------------------------- /preparation_scripts/hydra_bash_complete.sh: -------------------------------------------------------------------------------- 1 | # you need to source this file instead of executing it 2 | eval "$(python run.py -sc install=bash)" 3 | -------------------------------------------------------------------------------- /preparation_scripts/set_cuda_dependencies.sh: -------------------------------------------------------------------------------- 1 | pip install -r requirements/requirements_py1.9_cuda111.txt -------------------------------------------------------------------------------- /preparation_scripts/validate_scores.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This file creates a CLI for validating a score text file given a 4 | # text file with pairs (without gt labels). If validation is successfull a 5 | # zipfile will be created which can be submitted to voxceleb challenge on 6 | # codalab. 7 | # 8 | # pair text file format: 9 | # 'FILEa FILEb\n' 10 | # ... 11 | # 'FILEc FILEd\n' 12 | # 13 | # score text file format: 14 | # 'SCORE_FLOAT FILEa FILEb\n' 15 | # ... 16 | # 'SCORE_FLOAT FILEc FILEd\n' 17 | # 18 | # where SCORE_FLOAT is a string representing a float between 0 and 1. 
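#
# As an illustration (hypothetical utterance ids, not taken from an actual
# VoxCeleb trial list), a pair file could look like
#
#   id00012/abc/00001.wav id00031/xyz/00004.wav
#   id00012/abc/00001.wav id00012/def/00002.wav
#
# and a matching score file that passes validation would then be
#
#   0.13 id00012/abc/00001.wav id00031/xyz/00004.wav
#   0.92 id00012/abc/00001.wav id00012/def/00002.wav
#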
19 | # 20 | # Author(s): Nik Vaessen 21 | ################################################################################ 22 | 23 | import pathlib 24 | import argparse 25 | import tqdm 26 | import zipfile 27 | 28 | from typing import List, Tuple 29 | 30 | ################################################################################ 31 | # validation function 32 | 33 | def _load_pair_file(file: pathlib.Path) -> List[Tuple[str, str]]: 34 | with file.open('r') as f: 35 | lines = f.readlines() 36 | 37 | loaded_list = [] 38 | 39 | for l in lines: 40 | l = l.strip() 41 | 42 | assert l.count(" ") == 1 43 | 44 | split_line = l.split(" ") 45 | assert len(split_line) == 2 46 | 47 | key1, key2 = split_line 48 | loaded_list.append((key1, key2)) 49 | 50 | return loaded_list 51 | 52 | def _load_score_file(file: pathlib.Path) -> List[Tuple[float, str, str]]: 53 | with file.open('r') as f: 54 | lines = f.readlines() 55 | 56 | loaded_list = [] 57 | 58 | for l in lines: 59 | l = l.strip() 60 | 61 | assert l.count(" ") == 2 62 | 63 | split_line = l.split(" ") 64 | assert len(split_line) == 3 65 | 66 | score, key1, key2 = split_line 67 | 68 | try: 69 | score = float(score) 70 | except: 71 | raise ValueError(f"could not convert {score} to float") 72 | 73 | assert isinstance(score, float) 74 | loaded_list.append((score, key1, key2)) 75 | 76 | return loaded_list 77 | 78 | def validate(pair_file: pathlib.Path, score_file: pathlib.Path): 79 | # load data in file 80 | pairs = _load_pair_file(pair_file) 81 | scores = _load_score_file(score_file) 82 | 83 | # ensure each float is between 0 and 1 84 | print("validate each score is valid") 85 | for score_tuple in tqdm.tqdm(scores): 86 | score = score_tuple[0] 87 | 88 | assert score <= 1 89 | assert score >= 0 90 | 91 | # ensure each pair is present 92 | print("validate each pair is present") 93 | for score_tuple in tqdm.tqdm(scores): 94 | pair_tuple = (score_tuple[1], score_tuple[2]) 95 | 96 | assert pair_tuple in pairs 97 | 98 | 99 | 100 | ################################################################################ 101 | # creation of submission file. 
102 | 103 | SCORE_FILE_NAME = 'scores.txt' 104 | ZIPFILE_NAME = 'submission.zip' 105 | 106 | def create_submission(score_file: pathlib.Path): 107 | zipfile_path = score_file.parent / ZIPFILE_NAME 108 | 109 | with zipfile.ZipFile(str(zipfile_path), mode='w') as f: 110 | f.write(str(score_file), SCORE_FILE_NAME) 111 | 112 | ################################################################################ 113 | # entrypoint of CLI 114 | 115 | def main(): 116 | # set CLI arguments 117 | parser = argparse.ArgumentParser() 118 | 119 | parser.add_argument("--score_file", required=True) 120 | parser.add_argument("--pair_file", required=True) 121 | 122 | # load arguments 123 | args = parser.parse_args() 124 | 125 | score_file = pathlib.Path(args.score_file) 126 | pair_file = pathlib.Path(args.pair_file) 127 | 128 | # validate score file 129 | # validate(pair_file, score_file) 130 | 131 | # create submission zipfile 132 | create_submission(score_file) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "wav2vec-speaker-identification" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Nik Vaessen "] 6 | packages = [ 7 | { include = "src" }, 8 | { include = "tests" } 9 | ] 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.8" 13 | tqdm = "^4.27" 14 | scipy = "^1.6.0" 15 | scikit-learn = "^0.24.0" 16 | click = "^7.1.2" 17 | pandas = "^1.2.1" 18 | pytorch-lightning = "1.4.5" 19 | pytorch-model-summary = "^0.1.2" 20 | jupyter = "^1.0.0" 21 | librosa = "^0.8.0" 22 | matplotlib = "^3.3.4" 23 | seaborn = "^0.11.1" 24 | hydra-core = "^1.1.0" 25 | psutil = "^5.8.0" 26 | webdataset = "0.1.58" 27 | yaspin = "2.0.0" 28 | "hurry.filesize" = "^0.9" 29 | python-dotenv = "^0.17.0" 30 | torchaudio = "0.9.0" 31 | speechbrain = "^0.5.5" 32 | comet-ml = "^3.9.0" 33 | lightning-bolts = "^0.3.3" 34 | hydra-submitit-launcher = "^1.1.1" 35 | wavaugment = "^0.2" 36 | jupyterlab = "^3.0.14" 37 | fairseq = "0.10.2" 38 | jiwer = "^2.2.0" 39 | datasets = "^1.8.0" 40 | transformers = "^4.8.2" 41 | hydra-optuna-sweeper = "^1.1.0" 42 | 43 | [tool.poetry.dev-dependencies] 44 | black = "^21.6b0" 45 | pytest = "^6.2.2" 46 | 47 | [build-system] 48 | requires = ["poetry-core>=1.0.0"] 49 | build-backend = "poetry.core.masonry.api" 50 | -------------------------------------------------------------------------------- /requirements/requirements_cuda101.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.8.1+cu101 3 | torchvision==0.9.1+cu101 4 | torchaudio==0.8.1 -------------------------------------------------------------------------------- /requirements/requirements_cuda111.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.8.1+cu111 3 | torchvision==0.9.1+cu111 4 | torchaudio==0.8.1 -------------------------------------------------------------------------------- /requirements/requirements_py1.9_cuda111.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.9.0+cu111 3 | torchvision==0.10.0+cu111 4 | torchaudio==0.9.0 -------------------------------------------------------------------------------- /run.py: 
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This run script encapsulates the training and evaluation of a speaker 4 | # recognition model defined by the hydra configuration. 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import hydra 10 | 11 | from dotenv import load_dotenv 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | from src.hydra_resolvers import ( 15 | division_resolver, 16 | integer_division_resolver, 17 | random_uuid, 18 | ) 19 | 20 | ################################################################################ 21 | # set custom resolvers 22 | 23 | OmegaConf.register_new_resolver("divide", division_resolver) 24 | OmegaConf.register_new_resolver("idivide", integer_division_resolver) 25 | OmegaConf.register_new_resolver("random_uuid", random_uuid) 26 | 27 | ################################################################################ 28 | # wrap around main hydra script 29 | 30 | 31 | @hydra.main(config_path="config", config_name="train_eval") 32 | def run(cfg: DictConfig): 33 | # we import here such that tab-completion in bash 34 | # does not need to import everything (which slows it down 35 | # significantly) 36 | from src.main import run_train_eval_script 37 | 38 | return run_train_eval_script(cfg) 39 | 40 | 41 | ################################################################################ 42 | # execute hydra application 43 | 44 | if __name__ == "__main__": 45 | load_dotenv() 46 | import os 47 | 48 | run() 49 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/__init__.py -------------------------------------------------------------------------------- /src/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/callbacks/__init__.py -------------------------------------------------------------------------------- /src/callbacks/memory_monitor.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This callback will monitor the RAM usage of each worker. 
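#
# A minimal usage sketch (the frequency value below is made up for
# illustration and is not taken from any config in this repository):
#
#   import pytorch_lightning as pl
#   trainer = pl.Trainer(callbacks=[RamMemoryMonitor(frequency=100)])
#
# which logs a "mem_total" metric (in megabytes, summed over the main process
# and its worker children) every `frequency` training batches.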
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | from typing import Any 8 | 9 | import psutil 10 | import os 11 | 12 | import pytorch_lightning as pl 13 | 14 | from pytorch_lightning.utilities import rank_zero_only 15 | from pytorch_lightning.utilities.types import STEP_OUTPUT 16 | 17 | 18 | ################################################################################ 19 | # callback implementation 20 | 21 | 22 | class RamMemoryMonitor(pl.Callback): 23 | def __init__(self, frequency: int): 24 | self.frequency = frequency 25 | 26 | self.batches = 0 27 | 28 | def on_train_batch_end( 29 | self, 30 | trainer: "pl.Trainer", 31 | pl_module: "pl.LightningModule", 32 | outputs: STEP_OUTPUT, 33 | batch: Any, 34 | batch_idx: int, 35 | dataloader_idx: int, 36 | ) -> None: 37 | self.batches += 1 38 | 39 | if self.batches >= self.frequency: 40 | self.batches = 0 41 | 42 | try: 43 | self._monitor(trainer) 44 | except psutil.NoSuchProcess as e: 45 | pass 46 | 47 | @staticmethod 48 | def _monitor(trainer: pl.Trainer): 49 | current_process = psutil.Process(os.getpid()) 50 | children = current_process.children(recursive=True) 51 | 52 | # track main process 53 | current_process_usage = _get_mem_usage_in_mb(current_process) 54 | 55 | # track child processes 56 | children_usage = [_get_mem_usage_in_mb(c) for c in children] 57 | 58 | # total usage 59 | total_usage = current_process_usage + sum(children_usage) 60 | 61 | # track usage 62 | if trainer is not None: 63 | trainer.logger.log_metrics( 64 | { 65 | "mem_total": total_usage, 66 | } 67 | ) 68 | 69 | 70 | def _get_mem_usage_in_mb(p: psutil.Process): 71 | full_info = p.memory_full_info() 72 | 73 | # usage of process in bytes 74 | usage = full_info.uss 75 | 76 | # convert to megabytes 77 | usage = round(usage / float(1 << 20)) 78 | 79 | return usage 80 | -------------------------------------------------------------------------------- /src/config_util.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Provide a dataclass which automatically tries to cast any type-hinted field 4 | # to the type hint. It also as provides an abstract method 5 | # for constructing the object it configures. 
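#
# For illustration (hypothetical config class, not defined in this repository),
# string values coming from a config file are cast to the annotated types when
# the dataclass is constructed:
#
#   @dataclasses.dataclass()
#   class DummyConfig(CastingConfig):
#       num_layers: int
#       dropout: float
#
#   cfg = DummyConfig(num_layers="3", dropout="0.1")
#   assert cfg.num_layers == 3 and cfg.dropout == 0.1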
6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import dataclasses 11 | 12 | from abc import abstractmethod 13 | from enum import Enum 14 | from typing import TypeVar, Generic 15 | 16 | ################################################################################ 17 | # base configuration which supports casting to type hint and provides abstract 18 | # interface for creating an object based on the configuration 19 | 20 | C = TypeVar("C") 21 | 22 | 23 | @dataclasses.dataclass() 24 | class CastingConfig(Generic[C]): 25 | def __post_init__(self): 26 | post_init_type_cast(self) 27 | 28 | 29 | def post_init_type_cast(dataclass): 30 | if not dataclasses.is_dataclass(dataclass): 31 | raise Exception("Can only type-cast dataclass classes.") 32 | 33 | for field in dataclasses.fields(dataclass): 34 | value = getattr(dataclass, field.name) 35 | typehint_cls = field.type 36 | 37 | if value is None: 38 | # no value specified to type-convert 39 | continue 40 | 41 | elif isinstance(value, typehint_cls): 42 | # no need for type-conversion 43 | continue 44 | 45 | elif isinstance(value, dict): 46 | """ 47 | if execution gets here, we know 48 | value is not an instance of typehinted-type but 49 | is a dictionary. It contains the contents 50 | of a nested dataclass 51 | """ 52 | obj = typehint_cls(**value) 53 | 54 | # recursively perform type casting 55 | post_init_type_cast(obj) 56 | 57 | elif issubclass(typehint_cls, Enum): 58 | # enum's have a different init procedure 59 | obj = typehint_cls[value] 60 | 61 | else: 62 | # simply type-cast the object 63 | obj = typehint_cls(value) 64 | 65 | setattr(dataclass, field.name, obj) 66 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/common.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # A collection of common data classes 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | from abc import abstractmethod 11 | from dataclasses import dataclass 12 | from typing import List, Tuple, Optional, Dict, Any 13 | 14 | import torch as t 15 | 16 | ################################################################################ 17 | # 18 | 19 | 20 | @dataclass 21 | class WebDataSetShardConfig: 22 | samples_per_shard: int 23 | use_gzip_compression: bool 24 | shuffle_shards: bool 25 | queue_size: int 26 | 27 | 28 | @dataclass 29 | class SpeakerDataLoaderConfig: 30 | num_workers: int 31 | train_batch_size: int 32 | val_batch_size: int 33 | test_batch_size: int 34 | pin_memory: bool 35 | 36 | 37 | @dataclass 38 | class SpeechDataLoaderConfig: 39 | num_workers: int 40 | train_max_num_samples: int 41 | val_batch_size: int 42 | test_batch_size: int 43 | pin_memory: bool 44 | 45 | 46 | ################################################################################ 47 | # 48 | 49 | 50 | @dataclass 51 | class DebugWriter: 52 | @abstractmethod 53 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 54 | pass 55 | 56 
| 57 | @dataclass 58 | class BatchDebugInfo: 59 | # the original tensor which should be easily converted to 60 | # e.g an image/audio file 61 | original_tensor: t.Tensor 62 | 63 | # a list containing the progression steps from the original_tensor 64 | # to the network_input tensor accompanied with a class which can be 65 | # used to write debug output to a particular folder 66 | pipeline_progress: List[ 67 | Tuple[ 68 | t.Tensor, 69 | DebugWriter, 70 | ] 71 | ] 72 | 73 | # optional (untyped) dataset specific information 74 | # about the data sample 75 | meta: Optional[Dict[Any, Any]] 76 | -------------------------------------------------------------------------------- /src/data/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/speaker_data_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/speaker_data_module.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/training_batch_speaker.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/training_batch_speaker.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/voxceleb.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/voxceleb.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/speaker_data_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Abstract LightningDataModule for speaker recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List 10 | 11 | import pytorch_lightning 12 | 13 | from src.evaluation.speaker.speaker_recognition_evaluator import EvaluationPair 14 | 15 | 16 | 
################################################################################ 17 | # abstract class of a lightning data module for speaker recognition 18 | 19 | 20 | class SpeakerLightningDataModule(pytorch_lightning.LightningDataModule): 21 | @property 22 | @abstractmethod 23 | def num_speakers(self) -> int: 24 | pass 25 | 26 | @property 27 | @abstractmethod 28 | def val_pairs(self) -> List[EvaluationPair]: 29 | pass 30 | 31 | @property 32 | @abstractmethod 33 | def test_pairs(self) -> List[EvaluationPair]: 34 | pass 35 | 36 | @property 37 | @abstractmethod 38 | def summary(self): 39 | pass 40 | -------------------------------------------------------------------------------- /src/data/modules/speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/librispeech.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/librispeech.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/speech_data_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/speech_data_module.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/training_batch_speech.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/training_batch_speech.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/speech_data_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Abstract LightningDataModule for speech recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List 10 | 11 | import pytorch_lightning 12 | 13 | from src.tokenizer.base import BaseTokenizer 14 | 15 | ################################################################################ 16 | # abstract class of a lightning data module for speech recognition 17 | 18 | 19 | class SpeechLightningDataModule(pytorch_lightning.LightningDataModule): 20 | @property 21 | @abstractmethod 22 | def vocabulary(self) -> List[str]: 23 | pass 24 | 25 | @property 26 | @abstractmethod 27 | def
summary(self): 28 | pass 29 | 30 | @property 31 | @abstractmethod 32 | def tokenizer(self) -> BaseTokenizer: 33 | pass 34 | -------------------------------------------------------------------------------- /src/data/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/preprocess/__init__.py -------------------------------------------------------------------------------- /src/data/preprocess/audio_features.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Base API for preprocessors 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | from typing import Union, List 11 | 12 | import librosa 13 | import torch as t 14 | import torchaudio 15 | import seaborn 16 | 17 | from matplotlib import pyplot as plt 18 | from speechbrain.lobes.features import Fbank 19 | 20 | from src.data.common import DebugWriter 21 | from src.data.preprocess.base import Preprocessor 22 | from src.data.modules.speaker.training_batch_speaker import ( 23 | SpeakerClassificationDataSample, 24 | ) 25 | from src.util import debug_tensor_content 26 | 27 | ################################################################################ 28 | # base preprocessor 29 | 30 | 31 | class FilterBankDebugWriter(DebugWriter): 32 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 33 | debug_tensor_content(tensor, f"{idx:03d}_filterbank_features", save_dir) 34 | 35 | # make a plot of the filterbank values 36 | heatmap = seaborn.heatmap(tensor.cpu().numpy()) 37 | fig = heatmap.get_figure() 38 | fig.savefig(str(save_dir / f"{idx:03d}_filterbank_features.png")) 39 | plt.clf() 40 | 41 | # convert back to audio 42 | a1 = tensor.numpy().transpose() 43 | a1 = librosa.db_to_amplitude(a1) 44 | a1 = librosa.feature.inverse.mel_to_audio( 45 | a1, 46 | n_fft=400, 47 | fmin=0, 48 | fmax=8000, 49 | hop_length=160, 50 | win_length=16 * 25, 51 | center=False, 52 | power=1, 53 | n_iter=10, 54 | ) 55 | 56 | torchaudio.save( 57 | save_dir / f"{idx:03d}_filterbank_features.wav", 58 | t.Tensor(a1)[None, :], 59 | 16000, 60 | ) 61 | 62 | 63 | class FilterBank(Preprocessor): 64 | def __init__(self, n_mels: int = 40): 65 | self.fb = Fbank(n_mels=n_mels) 66 | 67 | def process( 68 | self, sample: SpeakerClassificationDataSample 69 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 70 | # expects an audio file of shape [1, NUM_AUDIO_SAMPLES] and converts 71 | # to [1, NUM_FRAMES, N_MELS] which is squeezed to [NUM_FRAMES, N_MELS] 72 | sample.network_input = self.fb(sample.network_input).squeeze() 73 | 74 | if sample.side_info is not None: 75 | sample.side_info.pipeline_progress.append( 76 | (sample.network_input, self.init_debug_writer()) 77 | ) 78 | 79 | return sample 80 | 81 | def init_debug_writer( 82 | self, 83 | ): 84 | return FilterBankDebugWriter() 85 | -------------------------------------------------------------------------------- /src/data/preprocess/base.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Base API for preprocessors 4 | # 5 | # Author(s): Nik Vaessen 6 | 
################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List, Union 10 | 11 | from src.data.common import DebugWriter 12 | from src.data.modules.speaker.training_batch_speaker import ( 13 | SpeakerClassificationDataSample, 14 | ) 15 | 16 | 17 | ################################################################################ 18 | # base preprocessor 19 | 20 | 21 | class Preprocessor: 22 | @abstractmethod 23 | def process( 24 | self, sample: SpeakerClassificationDataSample 25 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 26 | # process a sample in a particular way and generate one or more 27 | # new samples 28 | pass 29 | 30 | @abstractmethod 31 | def init_debug_writer( 32 | self, 33 | ) -> DebugWriter: 34 | pass 35 | -------------------------------------------------------------------------------- /src/data/preprocess/input_normalisation.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Normalization of the network input (e.g. 2D spectrograms) 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | from typing import Union, List 10 | 11 | import torch as t 12 | import seaborn 13 | 14 | from matplotlib import pyplot as plt 15 | 16 | from src.data.common import DebugWriter 17 | from src.data.preprocess.base import Preprocessor 18 | from src.data.modules.speaker.training_batch_speaker import ( 19 | SpeakerClassificationDataSample, 20 | ) 21 | from src.util import debug_tensor_content 22 | 23 | ################################################################################ 24 | # implementation of the input normalizer 25 | 26 | 27 | class InputNormalizerDebugWriter(DebugWriter): 28 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 29 | debug_tensor_content(tensor, f"{idx:03d}_normalized_features", save_dir) 30 | 31 | # make a plot of the normalized values 32 | heatmap = seaborn.heatmap(tensor.cpu().numpy()) 33 | fig = heatmap.get_figure() 34 | fig.savefig(str(save_dir / f"{idx:03d}_normalized_features.png")) 35 | plt.clf() 36 | 37 | 38 | class InputNormalizer2D(Preprocessor): 39 | def __init__( 40 | self, 41 | normalize_over_channels: bool = True, 42 | ): 43 | """ 44 | Normalize 2D spectograms.
45 | 46 | :param normalize_over_channels: whether to normalize over channels 47 | (when True) or over the whole spectogram (when False) 48 | """ 49 | super().__init__() 50 | 51 | self.channel_wise = normalize_over_channels 52 | 53 | @staticmethod 54 | def normalize(spectogram: t.Tensor, channel_wise: bool): 55 | if len(spectogram.shape) != 2: 56 | raise ValueError("expect to normalize over 2D input") 57 | 58 | if channel_wise: 59 | # calculate over last dimension 60 | # (assuming shape [NUM_FRAMES, NUM_FEATURES]) 61 | std, mean = t.std_mean(spectogram, dim=0) 62 | else: 63 | std, mean = t.std_mean(spectogram) 64 | 65 | normalized_spectogram = (spectogram - mean) / (std + 1e-5) 66 | 67 | return normalized_spectogram, mean, std 68 | 69 | def process( 70 | self, sample: SpeakerClassificationDataSample 71 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 72 | x_norm, mean, std = self.normalize(sample.network_input, self.channel_wise) 73 | 74 | sample.network_input = x_norm 75 | 76 | if sample.side_info is not None: 77 | sample.side_info.pipeline_progress.append( 78 | (x_norm, self.init_debug_writer()) 79 | ) 80 | 81 | return sample 82 | 83 | def init_debug_writer(self): 84 | return InputNormalizerDebugWriter() 85 | -------------------------------------------------------------------------------- /src/data/util.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Utility functions related to data i/o 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | import torchaudio 11 | 12 | import numpy as np 13 | import torch as t 14 | 15 | ################################################################################ 16 | # read audio from wav file into a tensor 17 | 18 | 19 | def load_raw_audio(path: pathlib.Path) -> t.Tensor: 20 | """ 21 | Load the raw audio file at the specified path and return it as a tensor 22 | with shape [1, num_samples] with floating values between -1 and 1 23 | 24 | :param path: the path to the audio value 25 | :return: a tensor of shape [1, num_samples] of the raw audio 26 | """ 27 | tensor, sample_rate = torchaudio.load(str(path)) 28 | 29 | if sample_rate != 16000: 30 | raise ValueError( 31 | f"audio file {path} is expected to have a sampling" 32 | f" rate of 16000 while actually being {sample_rate}" 33 | ) 34 | 35 | return tensor 36 | 37 | 38 | ################################################################################ 39 | # read/save tensors 40 | 41 | 42 | def load_tensor(path: pathlib.Path, device=t.device("cpu")) -> t.Tensor: 43 | return t.load(path, map_location=device) 44 | 45 | 46 | def save_tensor(embedding: t.Tensor, save_path: pathlib.Path): 47 | save_path.parent.mkdir(exist_ok=True, parents=True) 48 | t.save(embedding, str(save_path)) 49 | 50 | 51 | ################################################################################ 52 | # hacky way to create a None tensor 53 | 54 | 55 | def create_nan_tensor(): 56 | return t.Tensor([np.nan]) 57 | 58 | 59 | def is_nan_tensor(tensor: t.Tensor): 60 | return t.all(t.isnan(tensor)).item() 61 | 62 | 63 | ################################################################################ 64 | # check if a value can cause nan/inf 65 | 66 | 67 | def tensor_has_inf(tensor: t.Tensor): 68 | return t.any(t.isinf(tensor)).item() 69 | 70 | 71 | def 
tensor_has_nan(tensor: t.Tensor): 72 | return t.any(t.isnan(tensor)).item() 73 | 74 | 75 | def is_invalid_tensor(tensor: t.Tensor): 76 | return tensor_has_inf(tensor) or tensor_has_nan(tensor) 77 | -------------------------------------------------------------------------------- /src/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/speaker/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speaker/lda.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement the LDA evaluation metric and evaluator. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import List, Tuple 9 | 10 | import torch as t 11 | from sklearn.decomposition import PCA 12 | 13 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 14 | 15 | from src.evaluation.speaker.cosine_distance import ( 16 | compute_cosine_scores, 17 | ) 18 | from src.evaluation.speaker.speaker_recognition_evaluator import ( 19 | EmbeddingSample, 20 | compute_mean_std_batch, 21 | center_batch, 22 | SpeakerRecognitionEvaluator, 23 | length_norm_batch, 24 | ) 25 | 26 | 27 | ################################################################################ 28 | # Implement an evaluator based on LDA scoring 29 | 30 | 31 | class LDAEvaluator(SpeakerRecognitionEvaluator): 32 | def __init__( 33 | self, 34 | center_before_scoring: bool, 35 | length_norm_before_scoring: bool, 36 | max_training_batches_to_fit: int, 37 | num_pca_components: int, 38 | center_before_fit_training_batches: bool, 39 | ): 40 | super().__init__( 41 | max_training_batches_to_fit=max_training_batches_to_fit, 42 | ) 43 | 44 | self.center_before_scoring = center_before_scoring 45 | self.length_norm_before_scoring = length_norm_before_scoring 46 | self.num_pca_components = num_pca_components 47 | self.center_before_fit_training_batches = center_before_fit_training_batches 48 | 49 | # set in self#fit_parameters 50 | self._lda_model: LinearDiscriminantAnalysis = None 51 | self._mean: t.Tensor = None 52 | self._std: t.Tensor = None 53 | 54 | def fit_parameters( 55 | self, embedding_tensors: List[t.Tensor], label_tensors: List[t.Tensor] 56 | ): 57 | # create a tensor of shape [BATCH_SIZE*len(embedding_tensors), EMBEDDING_SIZE] 58 | all_tensors = t.cat(embedding_tensors) 59 | 60 | # create a tensor of SHAPE [BATCH_SIZE*len(label_tensors),] 61 | all_labels = t.cat(label_tensors) 62 | 63 | if self.center_before_fit_training_batches: 64 | mean, std = compute_mean_std_batch(all_tensors) 65 | all_tensors = center_batch(all_tensors, mean, std) 66 | 67 | # convert to numpy 68 | all_tensors = all_tensors.detach().cpu().numpy() 69 | all_labels = all_labels.detach().cpu().numpy().tolist() 70 | 71 | # fit the projection model (a whitening PCA with the configured number of components) 72 | self._lda_model = PCA(n_components=self.num_pca_components, whiten=True) 73 | all_tensors_transformed =
self._lda_model.fit_transform(all_tensors, all_labels) 74 | 75 | # compute mean/std in latent space in order to do centering before 76 | # taking length norm 77 | self._mean, self._std = compute_mean_std_batch( 78 | t.Tensor(all_tensors_transformed) 79 | ) 80 | 81 | def reset_parameters(self): 82 | super().reset_parameters() 83 | self._lda_model = None 84 | 85 | def _compute_prediction_scores( 86 | self, pairs: List[Tuple[EmbeddingSample, EmbeddingSample]] 87 | ) -> List[float]: 88 | # get 2 tensors of size [NUM_SAMPLES, EMBEDDING_SIZE], 89 | # where the same row idx corresponds to a pair to score 90 | b1, b2 = self._transform_pairs_to_tensor(pairs) 91 | 92 | # convert to latent dimension 93 | b1 = self._lda_model.transform(b1.detach().cpu().numpy()) 94 | b2 = self._lda_model.transform(b2.detach().cpu().numpy()) 95 | 96 | # convert back to tensors 97 | b1 = t.Tensor(b1) 98 | b2 = t.Tensor(b2) 99 | 100 | if self.center_before_scoring: 101 | b1 = center_batch(b1, self._mean, self._std) 102 | b2 = center_batch(b2, self._mean, self._std) 103 | 104 | if self.length_norm_before_scoring: 105 | b1 = length_norm_batch(b1) 106 | b2 = length_norm_batch(b2) 107 | 108 | # compute scores based on centering, length norming and then 109 | # taking cosine distance 110 | return compute_cosine_scores(t.Tensor(b1), t.Tensor(b2)) 111 | -------------------------------------------------------------------------------- /src/evaluation/speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/speech/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speech/wer.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Calculating word-error-rate 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import List 9 | 10 | from jiwer import wer 11 | 12 | ################################################################################ 13 | # wrapper around jiwer 14 | 15 | 16 | def calculate_wer(transcriptions: List[str], ground_truths: List[str]): 17 | return wer(ground_truths, transcriptions) 18 | -------------------------------------------------------------------------------- /src/hydra_resolvers.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Custom resolvers for hydra configuration 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import uuid 9 | 10 | ################################################################################ 11 | # implement division of 2 digits 12 | 13 | 14 | def _parse_digit(d: str): 15 | try: 16 | d = int(d) 17 | except ValueError: 18 | try: 19 | d = float(d) 20 | except ValueError: 21 | raise ValueError(f"input {d} cannot be parsed as a digit") 22 | 23 | return d 24 | 25 | 26 | def division_resolver(numerator: str, denominator: str): 27 | return _parse_digit(numerator) / _parse_digit(denominator) 28 | 29 | 30 | def integer_division_resolver(numerator: str, denominator: str): 31 | return int(_parse_digit(numerator) // _parse_digit(denominator)) 32 | 33 | 34 | 
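# ----------------------------------------------------------------------------
# Editor's note (not part of the original file): the resolvers above are only
# useful once registered with OmegaConf, so they can be called from Hydra/yaml
# interpolations. A minimal sketch follows, assuming the resolver names
# "divide" and "idivide" (the names actually registered by the project's entry
# point may differ).
if __name__ == "__main__":
    from omegaconf import OmegaConf

    OmegaConf.register_new_resolver("divide", division_resolver)
    OmegaConf.register_new_resolver("idivide", integer_division_resolver)

    cfg = OmegaConf.create(
        {"total": 128, "gpus": 4, "per_gpu": "${idivide:${total},${gpus}}"}
    )
    print(cfg.per_gpu)  # -> 32
# ----------------------------------------------------------------------------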
################################################################################ 35 | # create a random UUID 36 | 37 | 38 | def random_uuid(): 39 | return uuid.uuid4().hex 40 | -------------------------------------------------------------------------------- /src/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/layers/__init__.py -------------------------------------------------------------------------------- /src/layers/embedding_masking.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Apply dropout on time and channel dimensions of wav2vec2 embedding 4 | # as described in https://arxiv.org/abs/2006.11477 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | from typing import List 10 | 11 | import torch as t 12 | import torch.nn as nn 13 | 14 | ################################################################################ 15 | # implementation as nn module 16 | 17 | 18 | class EmbeddingMasker(nn.Module): 19 | def __init__( 20 | self, 21 | timestep_mask_prob: float, 22 | timestep_mask_width: int, 23 | channel_mask_prob: float, 24 | channel_mask_width: int, 25 | time_dim: int = 1, 26 | embedding_dim: int = 2, 27 | ): 28 | if not (0 <= channel_mask_prob <= 1): 29 | raise ValueError( 30 | f"probability channel_mask_prob {channel_mask_prob} expected to " 31 | f"be in range [0,1]" 32 | ) 33 | if not (0 <= timestep_mask_prob <= 1): 34 | raise ValueError( 35 | f"probability timestep_mask_prob {timestep_mask_prob} expected to " 36 | f"be in range [0,1]" 37 | ) 38 | 39 | if time_dim == 0 or embedding_dim == 0: 40 | raise ValueError("dimensions to mask cannot be dim 0 (batch dimension)") 41 | 42 | super().__init__() 43 | 44 | self.timestep_mask_prob = timestep_mask_prob 45 | self.timestep_mask_width = timestep_mask_width 46 | self.channel_mask_prob = channel_mask_prob 47 | self.channel_mask_width = channel_mask_width 48 | 49 | self.time_dim = time_dim 50 | self.embedding_dim = embedding_dim 51 | 52 | def forward(self, embedding_tensor: t.Tensor): 53 | if not self.training or (self.timestep_mask_prob + self.channel_mask_prob == 0): 54 | return embedding_tensor 55 | 56 | assert len(embedding_tensor.shape) == 3 57 | 58 | num_time_steps = embedding_tensor.shape[self.time_dim] 59 | num_channels = embedding_tensor.shape[self.embedding_dim] 60 | 61 | # create mask with the same shape as embedding tensor 62 | m = t.ones(embedding_tensor.shape, device=embedding_tensor.device) 63 | 64 | # determine which time steps to mask 65 | if self.timestep_mask_prob > 0: 66 | time_masked = t.rand((num_time_steps,)) 67 | time_masked = ( 68 | t.where( 69 | time_masked <= self.timestep_mask_prob, t.Tensor([0]), t.Tensor([1]) 70 | ) 71 | .numpy() 72 | .tolist() 73 | ) 74 | 75 | time_masked = self.expand_mask(time_masked, self.timestep_mask_width) 76 | self.insert_into_mask(m, time_masked, 0, self.time_dim) 77 | 78 | # determine which channels to mask 79 | if self.channel_mask_prob > 0: 80 | channel_mask = t.rand((num_channels,)) 81 | channel_mask = ( 82 | t.where( 83 | channel_mask <= self.channel_mask_prob, t.Tensor([0]), t.Tensor([1]) 84 | ) 85 | .numpy() 86 | .tolist() 87 | ) 88 | 89 | channel_mask = self.expand_mask(channel_mask, self.channel_mask_width) 90 |
self.insert_into_mask(m, channel_mask, 0, self.embedding_dim) 91 | 92 | # mask and return the embedding 93 | return m * embedding_tensor 94 | 95 | @staticmethod 96 | def insert_into_mask( 97 | mask_tensor: t.Tensor, mask_list: List[int], mask_value: int, dim: int 98 | ): 99 | mask_idx = [idx for idx, value in enumerate(mask_list) if value == mask_value] 100 | 101 | if dim == 1: 102 | mask_tensor[:, mask_idx, :] = mask_value 103 | else: 104 | mask_tensor[:, :, mask_idx] = mask_value 105 | 106 | return mask_tensor 107 | 108 | @staticmethod 109 | def expand_mask( 110 | mask_list: List[int], mask_width: int, mask_value_to_expand: int = 0 111 | ): 112 | # repeat mask widths 113 | mask_idx = [] 114 | 115 | for idx, mask_value in enumerate(mask_list): 116 | if mask_value == mask_value_to_expand: 117 | mask_idx.append(idx) 118 | 119 | expanded_mask_list = t.Tensor(mask_list) 120 | for idx in mask_idx: 121 | expanded_mask_list[idx : (idx + mask_width)] = mask_value_to_expand 122 | 123 | return expanded_mask_list.numpy().tolist() 124 | -------------------------------------------------------------------------------- /src/layers/temporal_gating.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement temporal gating (squeeze-and-excitation layer) as described in: 4 | # 5 | # Wav2Spk: A Simple DNN Architecture for Learning Speaker Embeddings 6 | # from Waveforms 7 | # 8 | # https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1287.pdf 9 | # 10 | # Author(s): Nik Vaessen 11 | ################################################################################ 12 | 13 | import torch as t 14 | import torch.nn as nn 15 | 16 | ################################################################################ 17 | # pytorch module acting as temporal gate 18 | 19 | 20 | class TemporalGate(nn.Module): 21 | def __init__(self, num_features: int): 22 | super(TemporalGate, self).__init__() 23 | 24 | self.W: nn.Parameter = nn.Parameter( 25 | nn.init.xavier_normal_(t.ones((num_features, num_features))) 26 | ) 27 | self.b: nn.Parameter = nn.Parameter( 28 | nn.init.xavier_normal_(t.ones((num_features, 1))) 29 | ) 30 | 31 | def forward(self, x): 32 | # we expect the input x to have dimensionality 33 | # [BS, NUM_FEATURES, NUM_FRAMES] 34 | # so that W matmul x results in the same shape 35 | mask = t.sigmoid(self.W.matmul(x) + self.b) 36 | 37 | return t.mul(mask, x) 38 | -------------------------------------------------------------------------------- /src/lightning_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/__init__.py -------------------------------------------------------------------------------- /src/lightning_modules/base_lightning_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Define a base lightning module for speech and/or speaker recognition network. 
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import logging 9 | 10 | from abc import abstractmethod 11 | from typing import Callable, Optional 12 | 13 | import torch as t 14 | import torch.nn 15 | import pytorch_lightning as pl 16 | 17 | from omegaconf import DictConfig, OmegaConf 18 | 19 | 20 | ################################################################################ 21 | # Definition of speaker recognition API 22 | 23 | # A logger for this file 24 | 25 | log = logging.getLogger(__name__) 26 | 27 | 28 | class BaseLightningModule(pl.LightningModule): 29 | def __init__( 30 | self, 31 | hyperparameter_config: DictConfig, 32 | loss_fn_constructor: Callable[[], Callable[[t.Tensor, t.Tensor], t.Tensor]], 33 | auto_lr_find: Optional[ 34 | float 35 | ] = None, # will be automatically passed by pytorch-lightning to children 36 | ): 37 | super().__init__() 38 | 39 | # input arguments 40 | self.loss_fn = loss_fn_constructor() 41 | 42 | # created by set_methods 43 | self.optimizer = None 44 | self.schedule = None 45 | self.warmup_optimizer = None 46 | self.warmup_schedule = None 47 | 48 | # flag determining which optimizer/schedule `configure_optimizers` uses 49 | self.warmup_enabled = False 50 | 51 | # auto_lr_find is set when you don't want to train the model 52 | # but want plot a learning rate against loss 53 | self.auto_lr_find = auto_lr_find 54 | 55 | # log hyperparameters 56 | self.save_hyperparameters(OmegaConf.to_container(hyperparameter_config)) 57 | 58 | def set_optimizer(self, optimizer: t.optim.Optimizer): 59 | self.optimizer = optimizer 60 | 61 | def set_lr_schedule(self, schedule: t.optim.lr_scheduler._LRScheduler): 62 | self.schedule = schedule 63 | 64 | @abstractmethod 65 | def generate_example_input( 66 | self, include_batch_dimension: bool, batch_size: Optional[int] 67 | ): 68 | pass 69 | 70 | def configure_optimizers(self): 71 | if self.auto_lr_find: 72 | log.info("USING the `auto_lr_find` learning rate and optimizer!") 73 | return torch.optim.Adam(self.parameters(), lr=self.auto_lr_find) 74 | 75 | return [self.optimizer], [self.schedule] 76 | -------------------------------------------------------------------------------- /src/lightning_modules/multitask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/multitask/__init__.py -------------------------------------------------------------------------------- /src/lightning_modules/speaker/__init__.py: -------------------------------------------------------------------------------- 1 | from .dummy import DummyModule, DummyModuleConfig 2 | from .wav2spk import Wav2SpkModule, Wav2SpkModuleConfig 3 | from .wav2vec2_fc import Wav2vec2FCModule, Wav2vec2FCModuleConfig 4 | from .wav2vec2_paired_input import ( 5 | Wav2vec2PairedSpeakerModule, 6 | Wav2vec2PairedSpeakerModuleConfig, 7 | ) 8 | from .wav2vec_fc import Wav2vecFCModule, Wav2vecFCModuleConfig 9 | from .wav2vec_xvector import Wav2vecXVectorModule, Wav2vecXVectorModuleConfig 10 | from .xvector import XVectorModule, XVectorModuleConfig 11 | from .ecapa_tdnn import EcapaTdnnModule, EcapaTDNNModuleConfig 12 | 13 | from .paired_speaker_recognition_module import PairedSpeakerRecognitionLightningModule 14 | from .speaker_recognition_module import SpeakerRecognitionLightningModule 15 | 
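The BaseLightningModule above expects its optimizer and learning-rate schedule to be injected through `set_optimizer` and `set_lr_schedule` before `Trainer.fit` is called; `configure_optimizers` then simply returns them (or a plain Adam when `auto_lr_find` is set). The following is a minimal sketch of that wiring; the toy subclass and every name in it are hypothetical and not part of the project.

import torch as t

from omegaconf import OmegaConf

from src.lightning_modules.base_lightning_module import BaseLightningModule


class ToyModule(BaseLightningModule):
    # hypothetical subclass, used only to illustrate the optimizer wiring
    def __init__(self):
        super().__init__(OmegaConf.create({}), loss_fn_constructor=t.nn.MSELoss)
        self.layer = t.nn.Linear(4, 1)

    def generate_example_input(self, include_batch_dimension, batch_size=None):
        return t.rand(batch_size or 1, 4) if include_batch_dimension else t.rand(4)


module = ToyModule()
module.set_optimizer(t.optim.Adam(module.parameters(), lr=3e-4))
module.set_lr_schedule(t.optim.lr_scheduler.ExponentialLR(module.optimizer, gamma=0.95))

# configure_optimizers now returns ([optimizer], [schedule]) for the Trainer
optimizers, schedules = module.configure_optimizers()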
-------------------------------------------------------------------------------- /src/lightning_modules/speaker/dummy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implements a dummy module which uses very few parameters to generate 4 | # predictions/embeddings. 5 | # This is useful for debugging training schedules as it removes the 6 | # heavy computation for each step so a full training run can be executed 7 | # fairly quickly. 8 | # 9 | # Author(s): Nik Vaessen 10 | ################################################################################ 11 | 12 | from dataclasses import dataclass 13 | from typing import List, Optional, Callable 14 | 15 | import torch as t 16 | 17 | from omegaconf import DictConfig 18 | 19 | from src.evaluation.speaker.speaker_recognition_evaluator import ( 20 | EvaluationPair, 21 | SpeakerRecognitionEvaluator, 22 | ) 23 | from src.lightning_modules.speaker.speaker_recognition_module import ( 24 | SpeakerRecognitionLightningModule, 25 | ) 26 | 27 | ################################################################################ 28 | # Implementation of a very light-weight neural network 29 | 30 | 31 | @dataclass 32 | class DummyModuleConfig: 33 | pass 34 | 35 | 36 | class DummyModule(SpeakerRecognitionLightningModule): 37 | def __init__( 38 | self, 39 | hyperparameters_to_save: DictConfig, 40 | cfg: DummyModuleConfig, 41 | num_speakers: int, 42 | loss_fn_constructor: Callable[[], Callable[[t.Tensor, t.Tensor], t.Tensor]], 43 | validation_pairs: List[EvaluationPair], 44 | test_pairs: List[EvaluationPair], 45 | evaluator: SpeakerRecognitionEvaluator, 46 | ): 47 | super().__init__( 48 | hyperparameter_config=hyperparameters_to_save, 49 | num_speakers=num_speakers, 50 | embedding_size=2, 51 | loss_fn_constructor=loss_fn_constructor, 52 | validation_pairs=validation_pairs, 53 | test_pairs=test_pairs, 54 | evaluator=evaluator, 55 | embeddings_are_pooled=True 56 | ) 57 | 58 | self.cfg = cfg 59 | 60 | # just create a parameter so optimizer doesn't complain 61 | self.fc1 = t.nn.Linear(in_features=2, out_features=num_speakers) 62 | 63 | def generate_example_input( 64 | self, include_batch_dimension: bool, batch_size: Optional[int] = None 65 | ): 66 | # any input works really 67 | if include_batch_dimension: 68 | # [BATCH_SIZE, NUMBER_OF_WINDOWS, NUMBER_OF_MODEL_COEFFICIENTS] 69 | # the `100` varies depending on length of audio file 70 | # the `40` can be replaced by any other number of mel coefficients 71 | shape = [batch_size, 100, 40] 72 | else: 73 | # [NUMBER_OF_WINDOWS, NUMBER_OF_MODEL_COEFFICIENTS] 74 | # the `100` varies depending on length of audio file 75 | # the `40` can be replaced by any other number of mel coefficients 76 | shape = [100, 40] 77 | 78 | return t.rand(size=shape) 79 | 80 | def compute_speaker_embedding(self, input_tensor: t.Tensor) -> t.Tensor: 81 | std, mean = t.std_mean(input_tensor, dim=(1, 2)) 82 | embedding = t.stack([mean, std]).t() 83 | 84 | return embedding 85 | 86 | def compute_speaker_prediction(self, embedding_tensor: t.Tensor) -> t.Tensor: 87 | prediction = self.fc1(embedding_tensor) 88 | 89 | return prediction 90 | -------------------------------------------------------------------------------- /src/lightning_modules/speech/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/speech/__init__.py -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/wav2vec.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Provide embeddings from raw audio with the wav2vec model from fairseq. 4 | # 5 | # See `download/download_pretrained_models.sh` for links to pretrained weights. 6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import pathlib 11 | 12 | import fairseq 13 | import pytorch_lightning 14 | import torch as t 15 | 16 | from fairseq.models.wav2vec import Wav2VecModel 17 | 18 | from src.util import reset_model 19 | 20 | ################################################################################ 21 | # loading wav2vec with fairseq 22 | 23 | 24 | def load_wav2vec_model( 25 | model_path: pathlib.Path, device: t.cuda.Device = t.device("cpu") 26 | ) -> Wav2VecModel: 27 | """ 28 | Load the wav2vec model. 29 | 30 | :param model_path: path to the ".pt" file of the model 31 | :param device: the device on which the model should be loaded 32 | :return: the wav2vec2 model on the specified device 33 | """ 34 | checkpoint = t.load(model_path) 35 | 36 | model = fairseq.models.wav2vec.Wav2VecModel.build_model(checkpoint["args"], None) 37 | model.load_state_dict(checkpoint["model"]) 38 | 39 | return model.to(device) 40 | 41 | 42 | ################################################################################ 43 | # computation of embedding 44 | 45 | 46 | def wav2vec_embed_raw_audio( 47 | input_tensor: t.Tensor, model: Wav2VecModel, aggregate: bool = False 48 | ) -> t.Tensor: 49 | """ 50 | Calculate a [1, 512, num_frames] embedding of a given [1, num_samples] audio file 51 | by using the Wav2Vec model. 52 | 53 | :param input_tensor: a raw audio input (between -1 and 1) with a sampling rate of 16000 Hz 54 | :param model: the wav2vec model 55 | :param aggregate whether to apply an aggregation to the initial features 56 | :return: The embedding with shape [1, 512, num_frames], where num_frames < num_samples. 
57 | """ 58 | z = model.feature_extractor(input_tensor) 59 | 60 | if not aggregate: 61 | return z 62 | else: 63 | return model.feature_aggregator(z) 64 | 65 | 66 | ################################################################################ 67 | # wrap the wav2vec model 68 | 69 | 70 | class Wav2VecWrapperModule(pytorch_lightning.LightningModule): 71 | def __init__( 72 | self, 73 | wav2vec_model_path: pathlib.Path, 74 | wav2vec_aggregation: bool = False, 75 | reset_weights: bool = False, 76 | ): 77 | super().__init__() 78 | 79 | self.model = load_wav2vec_model(wav2vec_model_path) 80 | self.use_aggregator = wav2vec_aggregation 81 | self.num_features = 512 82 | 83 | if reset_weights: 84 | reset_model(self.model) 85 | 86 | @property 87 | def num_embedding_features(self): 88 | return self.num_features 89 | 90 | def forward(self, wav_input: t.Tensor): 91 | # wav_input has shape [BATCH_SIZE, NUM_SAMPLES] 92 | embedding = wav2vec_embed_raw_audio(wav_input, self.model, self.use_aggregator) 93 | 94 | # return an embedding with shape [BATCH_SIZE, NUM_FEATURES, NUM_FRAMES] 95 | return embedding 96 | -------------------------------------------------------------------------------- /src/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/optim/__init__.py -------------------------------------------------------------------------------- /src/optim/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .aam_softmax import AngularAdditiveMarginSoftMaxLoss 2 | from .cross_entropy import CrossEntropyLoss 3 | from .triplet_loss import TripletLoss 4 | from .triplet_ce_loss import TripletCrossEntropyLoss 5 | from .ctc_loss import CtcLoss -------------------------------------------------------------------------------- /src/optim/loss/aam_softmax.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implementation of angular additive margin softmax loss. 
4 | # 5 | # Adapted from: https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py 6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import torch 11 | 12 | import torch as t 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | import math 17 | 18 | ################################################################################ 19 | # wrap around aam-loss implementation 20 | 21 | 22 | class AngularAdditiveMarginSoftMaxLoss(t.nn.Module): 23 | def __init__( 24 | self, 25 | input_features, 26 | output_features, 27 | margin=0.3, 28 | scale=15, 29 | easy_margin=False, 30 | ): 31 | super(AngularAdditiveMarginSoftMaxLoss, self).__init__() 32 | 33 | self.margin = margin 34 | self.scale = scale 35 | self.input_features = input_features 36 | self.fc_weights = torch.nn.Parameter( 37 | torch.FloatTensor(output_features, input_features), requires_grad=True 38 | ) 39 | self.ce = nn.CrossEntropyLoss() 40 | nn.init.xavier_normal_(self.fc_weights, gain=1) 41 | 42 | self.easy_margin = easy_margin 43 | self.cos_m = math.cos(self.margin) 44 | self.sin_m = math.sin(self.margin) 45 | 46 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 47 | self.th = math.cos(math.pi - self.margin) 48 | self.mm = math.sin(math.pi - self.margin) * self.margin 49 | 50 | def forward(self, x, label=None): 51 | assert x.size()[0] == label.size()[0] 52 | assert x.size()[1] == self.input_features 53 | 54 | # cos(theta) 55 | cosine = F.linear(F.normalize(x), F.normalize(self.fc_weights)) 56 | # cos(theta + m) 57 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 58 | phi = cosine * self.cos_m - sine * self.sin_m 59 | 60 | if self.easy_margin: 61 | phi = torch.where(cosine > 0, phi, cosine) 62 | else: 63 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 64 | 65 | # one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 66 | one_hot = torch.zeros_like(cosine) 67 | one_hot.scatter_(1, label.view(-1, 1), 1) 68 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 69 | output = output * self.scale 70 | 71 | loss = self.ce(output, label) 72 | prediction = F.softmax(output, dim=1) 73 | 74 | return loss, prediction 75 | -------------------------------------------------------------------------------- /src/optim/loss/binary_cross_entropy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Wrap around implementation of binary cross-entropy loss. 
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | ################################################################################ 13 | # wrap around cross-entropy loss of PyTorch 14 | 15 | 16 | class BinaryCrossEntropyLoss(t.nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | 20 | self.softmax = nn.LogSoftmax(dim=1) 21 | 22 | def forward(self, logits: t.Tensor, label_indexes: t.Tensor): 23 | return self._bce_loss(logits, label_indexes) 24 | 25 | def _bce_loss(self, logits: t.Tensor, label_indexes: t.Tensor): 26 | # logits (unnormalized quantities on which sigmoid is applied) 27 | # with shape [BATCH_SIZE, 1] and 28 | # label indexes (integers in {0, 1}) with shape [BATCH SIZE] 29 | logits = logits.squeeze().to(t.float32) 30 | label_indexes = label_indexes.squeeze().to(t.float32) 31 | 32 | loss = F.binary_cross_entropy_with_logits(logits, label_indexes) 33 | 34 | with t.no_grad(): 35 | # put predictions into [0, 1] range for later calculation of accuracy 36 | prediction = t.sigmoid(logits).detach() 37 | 38 | return loss, prediction 39 | -------------------------------------------------------------------------------- /src/optim/loss/cross_entropy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implementation of Cross-entropy loss. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | import torch.nn.functional as F 10 | 11 | ################################################################################ 12 | # wrap around PyTorch cross-entropy loss implementation 13 | 14 | 15 | class CrossEntropyLoss(t.nn.Module): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def forward(self, logits: t.Tensor, label_indexes: t.Tensor): 20 | return self._ce_loss(logits, label_indexes) 21 | 22 | def _ce_loss(self, logits: t.Tensor, label_indexes: t.Tensor): 23 | # logits (unnormalized quantities on which softmax is applied) 24 | # with shape [BATCH_SIZE, NUM_SPEAKERS] and 25 | # label indexes (integers in range [0, NUM_SPEAKERS-1]) 26 | # with shape [BATCH SIZE] 27 | loss = F.cross_entropy(logits, label_indexes) 28 | 29 | with t.no_grad(): 30 | # put predictions into [0, 1] range for later calculation of accuracy 31 | prediction = F.softmax(logits, dim=1).detach() 32 | 33 | return loss, prediction 34 | -------------------------------------------------------------------------------- /src/optim/loss/ctc_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # CTC loss for speech recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | ################################################################################ 14 | # wrapper around ctc loss of pytorch 15 | 16 | 17 | class CtcLoss(nn.Module): 18 | def __init__(self, blank_idx: int = 0): 19 | super().__init__() 20 | 21 | self.blank_idx = blank_idx 22 | 23 | def forward( 24 | self, 25 | predictions: t.Tensor, 26 | prediction_lengths: 
t.Tensor, 27 | ground_truths: t.Tensor, 28 | ground_truth_lengths: t.Tensor, 29 | ): 30 | original_device = predictions.device 31 | assert original_device == predictions.device == ground_truths.device 32 | 33 | # predictions will be shape [BATCH_SIZE, MAX_INPUT_SEQUENCE_LENGTH, CLASSES] 34 | # expected to be [MAX_INPUT_SEQUENCE_LENGTH, BATCH_SIZE, CLASSES] for 35 | # loss function 36 | predictions = t.transpose(predictions, 0, 1) 37 | 38 | # they also need to be log probabilities 39 | predictions = F.log_softmax(predictions, dim=2) 40 | 41 | # prediction lengths will be shape [BATCH_SIZE] 42 | pass # already OK 43 | 44 | # ground truths will be shape [BATCH_SIZE, MAX_TARGET_SEQUENCE_LENGTH] 45 | pass # already OK 46 | 47 | # ground_truth_lengths will be shape [BATCH_SIZE] 48 | pass # already OK 49 | 50 | # ctc loss expects every tensor to be on CPU 51 | return F.ctc_loss( 52 | log_probs=predictions.to("cpu"), 53 | targets=ground_truths.to("cpu"), 54 | input_lengths=prediction_lengths.to("cpu"), 55 | target_lengths=ground_truth_lengths.to("cpu"), 56 | blank=self.blank_idx, 57 | zero_infinity=True, # prevents any weird crashes 58 | ).to(original_device) 59 | -------------------------------------------------------------------------------- /src/optim/loss/triplet_ce_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a wrapper around triplet loss and cross-entropy loss 4 | # for speaker recognition embeddings 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import torch as t 10 | 11 | from src.optim.loss.cross_entropy import CrossEntropyLoss 12 | from src.optim.loss.triplet_loss import TripletLoss 13 | 14 | ################################################################################ 15 | # wrapper combining cross-entropy and triplet loss 16 | 17 | 18 | class TripletCrossEntropyLoss(TripletLoss, CrossEntropyLoss): 19 | def __init__(self, c_ce: float = 1, c_triplet: float = 1): 20 | super().__init__() 21 | 22 | if c_ce < 1 or c_triplet < 1: 23 | raise ValueError( 24 | f"constants need to be natural numbers, while" f"{c_ce=}, {c_triplet=}" 25 | ) 26 | 27 | self.c_ce = c_ce 28 | self.c_triplet = c_triplet 29 | 30 | def forward(self, embeddings: t.Tensor, logits: t.Tensor, label_indexes: t.Tensor): 31 | ce_loss, prediction = self._ce_loss(logits, label_indexes) 32 | triplet_loss = self._triplet_loss(embeddings, label_indexes) 33 | 34 | loss = self.c_ce * ce_loss + self.c_triplet * triplet_loss 35 | 36 | return loss, prediction 37 | -------------------------------------------------------------------------------- /src/optim/loss/triplet_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a wrapper around triplet loss for speaker recognition embeddings 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import random 9 | 10 | from typing import List 11 | 12 | import torch 13 | import torch as t 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | ################################################################################ 18 | # wrapper of triplet loss 19 | 20 | 21 | class TripletLoss(nn.Module): 22 | def __init__(self, margin: float = 
1): 23 | super().__init__() 24 | 25 | self.margin = margin 26 | 27 | def forward(self, embeddings: t.Tensor, label_indexes: t.Tensor): 28 | return self._triplet_loss(embeddings, label_indexes) 29 | 30 | def _triplet_loss(self, embeddings: t.Tensor, label_indexes: t.Tensor): 31 | # embeddings with shape [BATCH_SIZE, EMBEDDING_SIZE] and 32 | # label indexes (integers in range [0, NUM_SPEAKERS-1]) 33 | # with shape [BATCH SIZE] 34 | 35 | # make sure we can generate triplets for each label 36 | with torch.no_grad(): 37 | label_list: List[int] = label_indexes.detach().cpu().numpy().tolist() 38 | 39 | self.verify_labels(label_list) 40 | 41 | # generate a triplet for each batch dimension 42 | anchors = [] 43 | positives = [] 44 | negatives = [] 45 | 46 | for batch_dim in range(embeddings.shape[0]): 47 | # get anchor 48 | label = label_indexes[batch_dim] 49 | anchor = embeddings[batch_dim].squeeze() 50 | 51 | # find positive 52 | positive = self._find_positive( 53 | embeddings=embeddings, 54 | label_list=label_list, 55 | label=label, 56 | exclude_idx=batch_dim, 57 | ) 58 | 59 | # find negative 60 | negative = self._find_negative( 61 | embeddings=embeddings, label_list=label_list, label=label 62 | ) 63 | 64 | # save anchor, positive, negative tuple 65 | anchors.append(anchor) 66 | positives.append(positive) 67 | negatives.append(negative) 68 | 69 | return F.triplet_margin_loss( 70 | anchor=t.stack(anchors), 71 | positive=t.stack(positives), 72 | negative=t.stack(negatives), 73 | margin=self.margin, 74 | ) 75 | 76 | @staticmethod 77 | def cosine_distance(a: t.Tensor, b: t.Tensor): 78 | return 1 - t.div(F.cosine_similarity(a, b) + 1, 2) 79 | 80 | @staticmethod 81 | def _find_positive( 82 | embeddings: t.Tensor, label_list: List[int], label: int, exclude_idx: int 83 | ) -> t.Tensor: 84 | candidate_indexes = [ 85 | idx for idx, l in enumerate(label_list) if label == l and exclude_idx != idx 86 | ] 87 | 88 | idx = random.choice(candidate_indexes) 89 | 90 | return embeddings[idx].squeeze() 91 | 92 | @staticmethod 93 | def _find_negative( 94 | embeddings: t.Tensor, label_list: List[int], label: int 95 | ) -> t.Tensor: 96 | candidate_indexes = [idx for idx, l in enumerate(label_list) if label != l] 97 | 98 | idx = random.choice(candidate_indexes) 99 | 100 | return embeddings[idx].squeeze() 101 | 102 | @staticmethod 103 | def verify_labels(label_list: List[int]): 104 | unique_labels = set(label_list) 105 | 106 | for label in unique_labels: 107 | assert label_list.count(label) >= 2 108 | -------------------------------------------------------------------------------- /src/optim/schedule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/optim/schedule/__init__.py -------------------------------------------------------------------------------- /src/optim/schedule/tri_stage.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a class which can be given as function to `LambdaLR` to act 4 | # as a tri-stage learning rate with: 5 | # 1. a linear warmup phase from `initial_lr` to `base_lr` 6 | # 2. a constant phase of `base_lr` 7 | # 3. 
an exponential decay phase from `base_lr` to `final_lr` 8 | # 9 | # The learning rate of the optimizer is expected to be set to `base_lr` and 10 | # you should use `steps` and not `epochs` as update interval. 11 | # 12 | # Author(s): Nik Vaessen 13 | ################################################################################ 14 | 15 | import math 16 | import torch as t 17 | 18 | ################################################################################ 19 | # implementation of tri-stage LambdaLR function 20 | 21 | 22 | class TriStageLearningRateLambdaLRFunction: 23 | @staticmethod 24 | def is_valid_ratio(ratio: float): 25 | return 0 <= ratio <= 1 26 | 27 | def __init__( 28 | self, 29 | max_steps: int, 30 | warmup_stage_ratio: float, 31 | constant_stage_ratio: float, 32 | decay_stage_ratio: float, 33 | initial_lr: float, 34 | base_lr: float, 35 | final_lr: float, 36 | ): 37 | if not ( 38 | self.is_valid_ratio(warmup_stage_ratio) 39 | and self.is_valid_ratio(constant_stage_ratio) 40 | and self.is_valid_ratio(decay_stage_ratio) 41 | ): 42 | raise ValueError() 43 | 44 | if ( 45 | abs((warmup_stage_ratio + constant_stage_ratio + decay_stage_ratio) - 1) 46 | >= 1e-9 47 | ): 48 | raise ValueError("stage ratio's need to add up to 1") 49 | 50 | # stage computation 51 | self.max_steps = max_steps 52 | 53 | if self.max_steps is None: 54 | raise ValueError( 55 | "TriStage learning rate schedule requires setting `max_steps` " 56 | "in the trainer" 57 | ) 58 | 59 | self.warmup_stage_steps = math.floor(self.max_steps * warmup_stage_ratio) 60 | self.constant_stage_steps = math.floor(self.max_steps * constant_stage_ratio) 61 | self.decay_stage_steps = math.floor(self.max_steps * decay_stage_ratio) 62 | 63 | self.initial_lr = initial_lr 64 | self.base_lr = base_lr 65 | self.final_lr = final_lr 66 | 67 | # warmup_stage lin_space 68 | self.warmup_stage_space = ( 69 | t.linspace(self.initial_lr, self.base_lr, steps=self.warmup_stage_steps) 70 | .cpu() 71 | .numpy() 72 | .tolist() 73 | ) 74 | self.decay_stage_space = ( 75 | t.logspace( 76 | math.log(self.base_lr), 77 | math.log(self.final_lr), 78 | steps=self.decay_stage_steps + 2, 79 | base=math.e, 80 | ) 81 | .cpu() 82 | .numpy() 83 | .tolist() 84 | ) 85 | 86 | def __call__(self, step_count: int): 87 | if step_count < self.warmup_stage_steps: 88 | desired_lr = self.warmup_stage_space[step_count] 89 | elif step_count <= self.warmup_stage_steps + self.constant_stage_steps: 90 | desired_lr = self.base_lr 91 | elif step_count <= self.max_steps: 92 | desired_lr = self.decay_stage_space[ 93 | step_count - (self.warmup_stage_steps + self.constant_stage_steps) 94 | ] 95 | else: 96 | desired_lr = self.final_lr 97 | 98 | factor = desired_lr / self.base_lr 99 | return factor 100 | -------------------------------------------------------------------------------- /src/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/tokenizer/__init__.py -------------------------------------------------------------------------------- /src/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # base API for a tokenizer 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from 
typing import Dict 10 | 11 | import torch as t 12 | 13 | ################################################################################ 14 | # base API 15 | 16 | 17 | class BaseTokenizer: 18 | @abstractmethod 19 | def encode_string(self, string: str) -> t.Tensor: 20 | pass 21 | 22 | @abstractmethod 23 | def decode_tensor(self, token_tensor: t.Tensor): 24 | pass 25 | 26 | @abstractmethod 27 | def vocabulary_dictionary(self) -> Dict[str, int]: 28 | pass 29 | 30 | @abstractmethod 31 | def vocabulary_size(self) -> int: 32 | pass 33 | 34 | @abstractmethod 35 | def special_tokens_dictionary(self) -> Dict[str, int]: 36 | pass 37 | 38 | @abstractmethod 39 | def blank_token_id(self) -> int: 40 | pass 41 | -------------------------------------------------------------------------------- /src/tokenizer/tokenizer_wav2vec2.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Tokenizer for the wav2vec2 network. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import Dict, List 9 | 10 | from attr import dataclass 11 | from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer 12 | 13 | import torch as t 14 | 15 | from src.tokenizer.base import BaseTokenizer 16 | 17 | ################################################################################ 18 | # wrapper around huggingfacae tokenizer 19 | 20 | 21 | @dataclass 22 | class Wav2vec2TokenizerConfig: 23 | tokenizer_huggingface_id: str 24 | 25 | 26 | class Wav2vec2Tokenizer(BaseTokenizer): 27 | def __init__(self, cfg: Wav2vec2TokenizerConfig): 28 | self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained( 29 | cfg.tokenizer_huggingface_id 30 | ) 31 | 32 | def encode_string(self, string: str) -> t.Tensor: 33 | return t.IntTensor(self.tokenizer(string).input_ids) 34 | 35 | def decode_tensor(self, token_tensor: t.Tensor) -> str: 36 | assert len(token_tensor.shape) == 1 37 | 38 | decoded_str = self.tokenizer.decode(token_tensor) 39 | 40 | return decoded_str 41 | 42 | def vocabulary_dictionary(self) -> Dict[str, int]: 43 | return self.tokenizer.get_vocab() 44 | 45 | def vocabulary_size(self) -> int: 46 | return self.tokenizer.vocab_size 47 | 48 | def special_tokens_dictionary(self) -> Dict[str, int]: 49 | return self.tokenizer.special_tokens_map 50 | 51 | def blank_token_id(self) -> int: 52 | return 0 53 | --------------------------------------------------------------------------------
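To close, a minimal usage sketch for the tokenizer wrapper above. The Hugging Face checkpoint id is an assumption made for illustration; the id used by the project's tokenizer configuration may differ.

from src.tokenizer.tokenizer_wav2vec2 import (
    Wav2vec2Tokenizer,
    Wav2vec2TokenizerConfig,
)

# "facebook/wav2vec2-base-960h" is an assumption for this sketch
tokenizer = Wav2vec2Tokenizer(
    Wav2vec2TokenizerConfig(tokenizer_huggingface_id="facebook/wav2vec2-base-960h")
)

ids = tokenizer.encode_string("HELLO WORLD")  # IntTensor of character token ids
text = tokenizer.decode_tensor(ids)           # back to "HELLO WORLD"
blank = tokenizer.blank_token_id()            # id used as the blank symbol by CtcLoss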