├── .env.example ├── .gitignore ├── LICENCE ├── README.md ├── config ├── callbacks │ ├── debugging.yaml │ ├── default_speech.yaml │ ├── none.yaml │ ├── speaker_default.yaml │ └── speaker_early_stopping.yaml ├── data │ ├── dataloader │ │ ├── speaker.yaml │ │ └── speech.yaml │ ├── module │ │ ├── librispeech.yaml │ │ ├── voxceleb1.yaml │ │ ├── voxceleb1_pairs.yaml │ │ ├── voxceleb1_triplets.yaml │ │ ├── voxceleb2.yaml │ │ ├── voxceleb2_pairs.yaml │ │ ├── voxceleb2_test_everyone.yaml │ │ ├── voxceleb2_test_hard.yaml │ │ └── voxceleb2_triplets.yaml │ ├── pipeline │ │ ├── wav2vec_base_pipeline.yaml │ │ ├── wav2vec_full_seq_pipeline.yaml │ │ ├── wav2vec_pair_pipeline.yaml │ │ ├── wav2vec_short_seq_pipeline.yaml │ │ ├── xvector_all_augment_pipeline.yaml │ │ ├── xvector_dropout_augment_pipeline.yaml │ │ ├── xvector_pipeline.yaml │ │ └── xvector_rirs_augment.yaml │ └── shards │ │ ├── shards_librispeech.yaml │ │ └── shards_voxceleb.yaml ├── evaluator │ ├── cosine_distance.yaml │ ├── cosine_distance_with_train_data.yaml │ ├── lda.yaml │ └── plda.yaml ├── experiment │ ├── speaker_dummy.yaml │ ├── speaker_ecapa_tdnn.yaml │ ├── speaker_wav2vec2_aam.yaml │ ├── speaker_wav2vec2_ce.yaml │ ├── speaker_wav2vec2_ctc.yaml │ ├── speaker_wav2vec2_pairs.yaml │ ├── speaker_wav2vec2_triplet.yaml │ ├── speaker_wav2vec2_triplet_ce.yaml │ ├── speaker_xvector.yaml │ └── speech_wav2vec2_ctc.yaml ├── hydra │ └── launcher │ │ └── slurm.yaml ├── network │ ├── dummy.yaml │ ├── ecapa_tdnn.yaml │ ├── wav2spk.yaml │ ├── wav2vec2_fc.yaml │ ├── wav2vec2_fc_letter.yaml │ ├── wav2vec2_paired.yaml │ ├── wav2vec_fc.yaml │ ├── wav2vec_xvector.yaml │ └── xvector.yaml ├── optim │ ├── algo │ │ ├── adam.yaml │ │ └── sgd.yaml │ ├── loss │ │ ├── aam_softmax.yaml │ │ ├── binary_cross_entropy.yaml │ │ ├── cross_entropy.yaml │ │ ├── ctc.yaml │ │ ├── triplet.yaml │ │ └── triplet_ce.yaml │ └── schedule │ │ ├── constant.yaml │ │ ├── cyclic.yaml │ │ ├── exp_decay.yaml │ │ ├── one_cycle.yaml │ │ ├── reduce_on_plateau.yaml │ │ ├── schedule_wav2spk.yaml │ │ ├── schedule_wav2vec_fan_etal.yaml │ │ └── tri_stage.yaml ├── predict.yaml ├── profiler │ ├── advanced.yaml │ └── simple.yaml ├── search │ ├── lr_and_aam_loss.yaml │ ├── lr_and_pooling.yaml │ └── lr_and_schedule_search.yaml ├── tokenizer │ └── default.yaml ├── train_eval.yaml └── trainer │ ├── debug_trainer.yaml │ └── trainer.yaml ├── convert_voxceleb2.sh ├── paper_results ├── auto_lr_find │ ├── ecapa │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631794798.katara.82853.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── grid_search_results.csv │ ├── plot_auto_lr.py │ ├── plot_eer_and_lr_find.py │ ├── plot_eer_and_lr_find_broken.py │ ├── wav2vec2-sv-aam │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631044502.katara.6664.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-bce │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631113238.katara.16035.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-ce │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ 
└── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631043151.katara.6259.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log │ ├── wav2vec2-sv-ctc │ │ ├── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ │ └── version_0 │ │ │ │ └── events.out.tfevents.1631793388.katara.71473.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ └── run.log │ └── xvector │ │ ├── .hydra │ │ ├── config.yaml │ │ ├── hydra.yaml │ │ └── overrides.yaml │ │ ├── data.json │ │ ├── lightning_logs │ │ └── version_0 │ │ │ └── events.out.tfevents.1631794594.katara.80664.0 │ │ ├── plot.png │ │ ├── plot_lr_eer.png │ │ ├── plot_lr_eer_zoomed.png │ │ └── run.log └── run_tests_pool.py ├── predict.py ├── preparation_scripts ├── download_and_prepare_rirs.sh ├── download_librispeech.sh ├── download_pretrained_models.sh ├── download_voxceleb_meta.sh ├── hydra_bash_complete.sh ├── set_cuda_dependencies.sh ├── validate_scores.py └── voxceleb2_convert_to_wav.py ├── pyproject.toml ├── requirements ├── requirements_cuda101.txt ├── requirements_cuda111.txt └── requirements_py1.9_cuda111.txt ├── run.py └── src ├── __init__.py ├── callbacks ├── __init__.py ├── input_monitor_callback.py ├── memory_monitor.py └── progress_tracker_callback.py ├── config_util.py ├── data ├── __init__.py ├── collating.py ├── common.py ├── modules │ ├── __init__.py │ ├── speaker │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── speaker_data_module.cpython-38.pyc │ │ │ ├── training_batch_speaker.cpython-38.pyc │ │ │ └── voxceleb.cpython-38.pyc │ │ ├── speaker_data_module.py │ │ ├── training_batch_speaker.py │ │ └── voxceleb.py │ └── speech │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── librispeech.cpython-38.pyc │ │ ├── speech_data_module.cpython-38.pyc │ │ └── training_batch_speech.cpython-38.pyc │ │ ├── librispeech.py │ │ ├── speech_data_module.py │ │ └── training_batch_speech.py ├── preprocess │ ├── __init__.py │ ├── audio_features.py │ ├── augment.py │ ├── base.py │ ├── input_normalisation.py │ └── random_chunks.py └── util.py ├── eval_metrics.py ├── evaluation ├── __init__.py ├── speaker │ ├── __init__.py │ ├── cosine_distance.py │ ├── lda.py │ ├── plda.py │ └── speaker_recognition_evaluator.py └── speech │ ├── __init__.py │ └── wer.py ├── hydra_resolvers.py ├── layers ├── __init__.py ├── embedding_masking.py ├── pooling.py └── temporal_gating.py ├── lightning_modules ├── __init__.py ├── base_lightning_module.py ├── multitask │ ├── __init__.py │ └── mt_speech_speaker_module.py ├── speaker │ ├── __init__.py │ ├── dummy.py │ ├── ecapa_tdnn.py │ ├── paired_speaker_recognition_module.py │ ├── speaker_recognition_module.py │ ├── wav2spk.py │ ├── wav2vec2_ctc.py │ ├── wav2vec2_fc.py │ ├── wav2vec2_paired_input.py │ ├── wav2vec_fc.py │ ├── wav2vec_xvector.py │ └── xvector.py └── speech │ ├── __init__.py │ ├── speech_recognition_module.py │ └── wav2vec2_fc_letter.py ├── main.py ├── models ├── __init__.py ├── wav2vec.py └── wav2vec2.py ├── optim ├── __init__.py ├── loss │ ├── __init__.py │ ├── aam_softmax.py │ ├── binary_cross_entropy.py │ ├── cross_entropy.py │ ├── ctc_loss.py │ ├── triplet_ce_loss.py │ └── triplet_loss.py └── schedule │ ├── __init__.py │ └── tri_stage.py ├── predict.py ├── tokenizer ├── __init__.py ├── base.py └── tokenizer_wav2vec2.py └── util.py /.env.example: 
-------------------------------------------------------------------------------- 1 | # folder where dataset(s) and pretrained models are stored 2 | DATA_FOLDER=$PWD 3 | 4 | # folder where results will be logged to 5 | LOG_FOLDER=$PWD/logs 6 | 7 | # folder which can be used for temporary storage 8 | TEMP_FOLDER=$DATA_FOLDER/tmp 9 | 10 | # folder where huggingface library saves model weights 11 | TRANSFORMERS_CACHE=$DATA_FOLDER/pretrained_models 12 | 13 | # default value for using comet ml 14 | USE_COMET_ML=False 15 | 16 | # API key of comet.ml account for experiment tracking 17 | COMET_API_KEY= 18 | 19 | # Default number of GPUs you want to train with 20 | NUM_GPUS=1 21 | 22 | # hydra launcher to use. Set to SLURM on GPU cluster :) 23 | HYDRA_LAUNCHER=basic 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | *.pyc 4 | 5 | dist/ 6 | build/ 7 | *.egg-info/ 8 | 9 | .tox/ 10 | .coverage 11 | 12 | set_environment.sh 13 | 14 | /results 15 | /models 16 | /data 17 | /poetry.lock 18 | /lightning_logs/ 19 | .env 20 | /data/ 21 | /outputs/ 22 | /playground/ 23 | 24 | poetry.toml 25 | .vscode 26 | /.venv/ 27 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Nik Vaessen on behalf of Radboud University 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
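The .env.example shown above lists every environment variable the entry points expect (data, log and temp folders, the HuggingFace cache, comet.ml settings, GPU count, and the Hydra launcher). Below is a minimal sketch of reading those variables in Python, assuming the python-dotenv package; the repository's own loading code is not shown in this section, so treat this purely as an illustration.

import os

from dotenv import load_dotenv  # assumption: python-dotenv is installed

# copy .env.example to .env, fill in the values, then load it
load_dotenv(".env")

# shell-style references such as $PWD may be stored literally by load_dotenv,
# so expand them explicitly before use
data_folder = os.path.expandvars(os.environ.get("DATA_FOLDER", "$PWD"))
log_folder = os.path.expandvars(os.environ.get("LOG_FOLDER", "$PWD/logs"))
use_comet = os.environ.get("USE_COMET_ML", "False") == "True"

print(f"datasets and pretrained models: {data_folder}")
print(f"logs are written to:            {log_folder}")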
8 | -------------------------------------------------------------------------------- /config/callbacks/debugging.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - input_monitor 3 | - lr_monitor 4 | # - gpu_monitor 5 | 6 | # log debug information for a single batch 7 | input_monitor: 8 | _target_: src.callbacks.input_monitor_callback.InputMonitor 9 | 10 | # keep track of learning rate in logger 11 | lr_monitor: 12 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 13 | 14 | gpu_monitor: 15 | _target_: pytorch_lightning.callbacks.GPUStatsMonitor -------------------------------------------------------------------------------- /config/callbacks/default_speech.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | 6 | # keep track of learning rate in logger 7 | lr_monitor: 8 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 9 | 10 | ram_monitor: 11 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 12 | frequency: 100 13 | 14 | # save model checkpoint of weights with best validation performance 15 | checkpoint: 16 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 17 | monitor: val_wer_clean 18 | save_top_k: 1 19 | mode: min 20 | filename: '{epoch}.{step}.{val_wer_clean:.4f}.best' 21 | save_last: true 22 | every_n_val_epochs: 1 23 | 24 | last_checkpoint_pattern: '{epoch}.{step}.{val_wer_clean:.4f}.last' -------------------------------------------------------------------------------- /config/callbacks/none.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - null -------------------------------------------------------------------------------- /config/callbacks/speaker_default.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | 6 | # keep track of learning rate in logger 7 | lr_monitor: 8 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 9 | 10 | ram_monitor: 11 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 12 | frequency: 100 13 | 14 | # save model checkpoint of weights with best validation performance 15 | checkpoint: 16 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 17 | monitor: val_eer 18 | save_top_k: 1 19 | mode: min 20 | filename: '{epoch}.{step}.{val_eer:.4f}.best' 21 | save_last: true 22 | every_n_val_epochs: 1 23 | 24 | last_checkpoint_pattern: '{epoch}.{step}.{val_eer:.4f}.last' -------------------------------------------------------------------------------- /config/callbacks/speaker_early_stopping.yaml: -------------------------------------------------------------------------------- 1 | to_add: 2 | - lr_monitor 3 | - ram_monitor 4 | - checkpoint 5 | - early_stopping 6 | 7 | # keep track of learning rate in logger 8 | lr_monitor: 9 | _target_: pytorch_lightning.callbacks.LearningRateMonitor 10 | 11 | ram_monitor: 12 | _target_: src.callbacks.memory_monitor.RamMemoryMonitor 13 | frequency: 100 14 | 15 | # save model checkpoint of weights with best validation performance 16 | checkpoint: 17 | _target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 18 | monitor: val_eer 19 | save_top_k: 0 20 | mode: min 21 | filename: '{epoch}.{step}.{val_eer:.4f}.best' 22 | save_last: false 23 | every_n_val_epochs: 1 24 | 25 | last_checkpoint_pattern: '{epoch}.{step}.{val_eer:.4f}.last' 26 
| 27 | # stop when val_eer doesn't improve or diverges 28 | early_stopping: 29 | _target_: pytorch_lightning.callbacks.early_stopping.EarlyStopping 30 | monitor: val_eer 31 | min_delta: 0.00 32 | patience: 4 33 | mode: min 34 | check_finite: True 35 | divergence_threshold: 0.45 -------------------------------------------------------------------------------- /config/data/dataloader/speaker.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.SpeakerDataLoaderConfig 3 | 4 | # settings for data loader 5 | train_batch_size: 32 6 | val_batch_size: ${data.dataloader.train_batch_size} 7 | test_batch_size: 1 8 | num_workers: 5 9 | pin_memory: true -------------------------------------------------------------------------------- /config/data/dataloader/speech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.SpeechDataLoaderConfig 3 | 4 | # settings for data loader 5 | train_max_num_samples: 3_200_000 6 | val_batch_size: 8 7 | test_batch_size: 1 8 | num_workers: 5 9 | pin_memory: true -------------------------------------------------------------------------------- /config/data/module/librispeech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speech.librispeech.LibriSpeechLightningDataModuleConfig 3 | 4 | # select which subset of the training data to use 5 | use_train_clean_100: true 6 | use_train_clean_360: true 7 | use_train_other_500: true 8 | 9 | # paths to training data 10 | train_clean_100_path: ${data_folder}/librispeech/train-clean-100.tar.gz 11 | train_clean_360_path: ${data_folder}/librispeech/train-clean-360.tar.gz 12 | train_other_500_path: ${data_folder}/librispeech/train-other-500.tar.gz 13 | 14 | # paths to validation data 15 | dev_clean_path: ${data_folder}/librispeech/dev-clean.tar.gz 16 | dev_other_path: ${data_folder}/librispeech/dev-other.tar.gz 17 | 18 | # paths to test data 19 | test_clean_path: ${data_folder}/librispeech/test-clean.tar.gz 20 | test_other_path: ${data_folder}/librispeech/test-other.tar.gz 21 | 22 | # folder to write train/val/test shards into 23 | shards_folder: ${data_folder}/librispeech_shards 24 | 25 | # temporary working directory for shard creation process 26 | extraction_folder: ${temp_folder}/librispeech 27 | 28 | # collation strategy 29 | train_collate_fn: default 30 | val_collate_fn: default 31 | test_collate_fn: default 32 | 33 | # add side info (in order to ease debugging data pipeline at the cost of 34 | # slowing down the iter/sec) 35 | add_side_info: False 36 | 37 | # limit the amount of samples to a certain amount - useful for debugging 38 | # whether a model can overfit on a small amount of data. 
39 | # No limit when value is <= 0 40 | limit_samples: -1 -------------------------------------------------------------------------------- /config/data/module/voxceleb1.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb1_pairs.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: pairwise_categorical 74 | 75 | # distribution of pos/neg pairs in batch 76 | pos_neg_training_batch_ratio: 0.5 77 | yield_limit: null -------------------------------------------------------------------------------- /config/data/module/voxceleb1_triplets.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: false 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: false 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical_triplets -------------------------------------------------------------------------------- /config/data/module/voxceleb2.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_2 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.99 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical 74 | -------------------------------------------------------------------------------- /config/data/module/voxceleb2_pairs.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_2 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | 46 | # The paths to the zipfile containing 47 | # the voxceleb1 training and test data 48 | # Values are ignored if `use_voxceleb1=False` 49 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 50 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 51 | 52 | # The paths to the zipfile containing 53 | # the voxceleb2 training and test data 54 | # Values are ignored if `use_voxceleb2=False` 55 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 56 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 57 | 58 | # collation strategy 59 | train_collate_fn: pad_right 60 | val_collate_fn: default 61 | test_collate_fn: default 62 | 63 | # add side info (in order to ease debugging data pipeline at the cost of 64 | # slowing down the iter/sec) 65 | add_batch_debug_info: False 66 | 67 | # limit the amount of samples to a certain amount - useful for debugging 68 | # whether a model can overfit on a small amount of data. 69 | # No limit when value is <= 0 70 | limit_samples: -1 71 | 72 | # each sample in a batch consists of two audio samples which are either 73 | # from the same speaker or from different speakers. 
74 | batch_processing_mode: pairwise_categorical 75 | 76 | # distribution of pos/neg pairs in batch 77 | pos_neg_training_batch_ratio: 0.5 78 | yield_limit: null -------------------------------------------------------------------------------- /config/data/module/voxceleb2_test_everyone.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/list_test_all2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_test_all_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb2_test_hard.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/list_test_hard2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb1_test_hard_shards 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: equal # one of 'equal`, `different` 34 | train_val_ratio: 0.97 35 | num_val_speakers: -1 # not used because split_mode=equal 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 1 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 500 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical -------------------------------------------------------------------------------- /config/data/module/voxceleb2_triplets.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the data module config object 2 | _target_: src.data.modules.speaker.voxceleb.VoxCelebDataModuleConfig 3 | 4 | # select which dataset(s) should be used during training 5 | # note that in practise only voxceleb2 is optional because 6 | # the test set of voxceleb2 is often all data in voxceleb1) 7 | use_voxceleb1_dev: true 8 | use_voxceleb1_test: true 9 | use_voxceleb2_dev: true 10 | use_voxceleb2_test: false 11 | all_voxceleb1_is_test_set: true 12 | 13 | # define data kind 14 | has_train: true 15 | has_val: true 16 | has_test: true 17 | 18 | # path to identity file for test set 19 | # Warning: changing the test set while shards 20 | # are already written has no effect and would require 21 | # overwriting existing shards. 22 | test_split_file_path: ${data_folder}/voxceleb_meta/veri_test2.txt 23 | 24 | # folder to write train/val/test shards into 25 | shards_folder: ${data_folder}/voxceleb2_shards_pairs 26 | 27 | # temporary working directory for shard creation process 28 | extraction_folder: ${temp_folder}/voxceleb_1 29 | 30 | # determine train/val split 31 | # `equal` mode means each speaker is in both train and val split 32 | # `different` mode means intersection of speakers in train and val is empty 33 | split_mode: different # one of 'equal`, `different` 34 | train_val_ratio: -1 # not used because split_mode=different 35 | num_val_speakers: 41 36 | 37 | # number of pairs of validation samples to calculate EER on during training 38 | eer_validation_pairs: 10_000 39 | 40 | # settings related to how data is written to shards 41 | sequential_same_speaker_samples: 4 # num back-to-back samples from same speaker 42 | min_unique_speakers_per_shard: 50 43 | discard_partial_shards: true 44 | 45 | # The paths to the zipfile containing 46 | # the voxceleb1 training and test data 47 | # Values are ignored if `use_voxceleb1=False` 48 | voxceleb1_train_zip_path: ${data_folder}/voxceleb_archives/vox1_dev_wav.zip 49 | voxceleb1_test_zip_path: ${data_folder}/voxceleb_archives/vox1_test_wav.zip 50 | 51 | # The paths to the zipfile containing 52 | # the voxceleb2 training and test data 53 | # Values are ignored if `use_voxceleb2=False` 54 | voxceleb2_train_zip_path: ${data_folder}/voxceleb_archives/vox2_dev_wav.zip 55 | voxceleb2_test_zip_path: ${data_folder}/voxceleb_archives/vox2_test_wav.zip 56 | 57 | # collation strategy 58 | train_collate_fn: pad_right 59 | val_collate_fn: default 60 | test_collate_fn: default 61 | 62 | # add side info (in order to ease debugging data pipeline at the cost of 63 | # slowing down the iter/sec) 64 | add_batch_debug_info: False 65 | 66 | # limit the amount of samples to a certain amount - useful for debugging 67 | # whether a model can overfit on a small amount of data. 68 | # No limit when value is <= 0 69 | limit_samples: -1 70 | 71 | # each sample in a batch consists of two audio samples which are either 72 | # from the same speaker or from different speakers. 
73 | batch_processing_mode: categorical_triplets 74 | -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_base_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_train 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_val 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_train: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 3 19 | 20 | selector_val: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | normalizer: 27 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 28 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_full_seq_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | 4 | val_pipeline: 5 | - normalizer 6 | 7 | test_pipeline: 8 | # assume batch size of 1 due to no selector (and therefore tensors have 9 | # different dimensions and cannot be collated without padding 10 | - normalizer 11 | 12 | selector_contiguous: 13 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 14 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 15 | selection_strategy: contiguous 16 | desired_chunk_length_sec: 3 17 | 18 | selector_start: 19 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 20 | # one of 'start', 'end', 'random', 'random_contiguous' 21 | selection_strategy: start 22 | desired_chunk_length_sec: 3 23 | 24 | filterbank: 25 | _target_: src.data.preprocess.audio_features.FilterBank 26 | 27 | normalizer: 28 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 29 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_pair_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_contiguous 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_start 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_contiguous: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 3 19 | 20 | selector_start: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | filterbank: 27 | _target_: src.data.preprocess.audio_features.FilterBank 28 | 29 | normalizer: 30 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 31 | normalize_over_channels: false 
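The pipeline configs above all follow the same pattern: the train_pipeline / val_pipeline / test_pipeline lists name sibling nodes in the same file, and each named node carries a _target_ pointing at a preprocessor class under src.data.preprocess. Below is a rough sketch of how such a file can be resolved with Hydra's instantiate utility; it is an illustration rather than the repository's actual loading code, and it assumes the src package is importable and that each preprocessor is callable on an audio tensor.

from hydra.utils import instantiate
from omegaconf import OmegaConf

# load one of the pipeline configs shown above
cfg = OmegaConf.load("config/data/pipeline/wav2vec_pair_pipeline.yaml")

# every entry in `train_pipeline` refers to a node in the same file whose
# `_target_` names the preprocessor class to build
train_preprocessors = [instantiate(cfg[name]) for name in cfg.train_pipeline]

def run_train_pipeline(audio, preprocessors=train_preprocessors):
    # preprocessors are applied in the listed order (assumed call interface)
    for preprocessor in preprocessors:
        audio = preprocessor(audio)
    return audio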
-------------------------------------------------------------------------------- /config/data/pipeline/wav2vec_short_seq_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - normalizer 3 | - selector_contiguous 4 | 5 | val_pipeline: 6 | - normalizer 7 | - selector_start 8 | 9 | test_pipeline: 10 | # assume batch size of 1 due to no selector (and therefore tensors have 11 | # different dimensions and cannot be collated without padding 12 | - normalizer 13 | 14 | selector_contiguous: 15 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 16 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 17 | selection_strategy: random 18 | desired_chunk_length_sec: 0.4 19 | 20 | selector_start: 21 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 22 | # one of 'start', 'end', 'random', 'random_contiguous' 23 | selection_strategy: start 24 | desired_chunk_length_sec: 3 25 | 26 | filterbank: 27 | _target_: src.data.preprocess.audio_features.FilterBank 28 | 29 | normalizer: 30 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 31 | normalize_over_channels: false -------------------------------------------------------------------------------- /config/data/pipeline/xvector_all_augment_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_drop_time 24 | - augment_drop_freqs 25 | - augment_change_speed 26 | - augment_add_reverb 27 | - augment_add_noise 28 | 29 | # selects a random audio chunk 30 | selector_contiguous: 31 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 32 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 33 | selection_strategy: contiguous 34 | desired_chunk_length_sec: 3 35 | 36 | # selects the first x seconds of audio 37 | selector_start: 38 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 39 | # one of 'start', 'end', 'random', 'random_contiguous' 40 | selection_strategy: start 41 | desired_chunk_length_sec: 3 42 | 43 | # converts wav to mel filterbanks 44 | filterbank: 45 | _target_: src.data.preprocess.audio_features.FilterBank 46 | 47 | # normalizes filterbanks to 0 mean and unit variance 48 | normalizer: 49 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 50 | normalize_over_channels: true 51 | 52 | # augmentation preprocessors to use 53 | augmenter: 54 | _target_: src.data.preprocess.augment.Augmenter 55 | yield_intermediate_augmentations: True 56 | yield_unaugmented: True 57 | stack_augmentations: False 58 | 59 | # randomly drop `x` seconds of audio 60 | augment_drop_time: 61 | _target_: src.data.preprocess.augment.TimeDropoutAugment 62 | sample_rate: 16000 63 | max_dropout_length_seconds: 0.25 64 | min_drop_count: 0 65 | max_drop_count: 5 66 | 67 | # randomly drops certain frequency bands from the audio signal 68 | 
augment_drop_freqs: 69 | _target_: src.data.preprocess.augment.FrequencyDropoutAugment 70 | sample_rate: 16000 71 | min_drop_count: 0 72 | max_drop_count: 5 73 | band_scaling: 1 74 | 75 | # randomly slows down or speeds up the audio 76 | augment_change_speed: 77 | _target_: src.data.preprocess.augment.ChoiceSpeedAugment 78 | sample_rate: 16000 79 | possible_speed_factors: [0.95, 1, 1.05] 80 | 81 | # randomly adds reverb 82 | augment_add_reverb: 83 | _target_: src.data.preprocess.augment.ReverbAugment 84 | sample_rate: 16000 85 | room_scale_min: 0 86 | room_scale_max: 100 87 | 88 | # randomly adds uniform noise to the audio 89 | augment_add_noise: 90 | _target_: src.data.preprocess.augment.ChoiceNoiseAugment 91 | sample_rate: 16000 92 | snr_choices: [15, 20, 100] 93 | -------------------------------------------------------------------------------- /config/data/pipeline/xvector_dropout_augment_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_drop_time 24 | - augment_drop_freqs 25 | - augment_change_speed 26 | 27 | # selects a random audio chunk 28 | selector_contiguous: 29 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 30 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 31 | selection_strategy: contiguous 32 | desired_chunk_length_sec: 3 33 | 34 | # selects the first x seconds of audio 35 | selector_start: 36 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 37 | # one of 'start', 'end', 'random', 'random_contiguous' 38 | selection_strategy: start 39 | desired_chunk_length_sec: 3 40 | 41 | # converts wav to mel filterbanks 42 | filterbank: 43 | _target_: src.data.preprocess.audio_features.FilterBank 44 | 45 | # normalizes filterbanks to 0 mean and unit variance 46 | normalizer: 47 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 48 | normalize_over_channels: true 49 | 50 | # augmentation preprocessors to use 51 | augmenter: 52 | _target_: src.data.preprocess.augment.Augmenter 53 | yield_intermediate_augmentations: True 54 | yield_unaugmented: True 55 | stack_augmentations: False 56 | 57 | # randomly drop `x` seconds of audio 58 | augment_drop_time: 59 | _target_: src.data.preprocess.augment.TimeDropoutAugment 60 | sample_rate: 16000 61 | max_dropout_length_seconds: 0.25 62 | min_drop_count: 0 63 | max_drop_count: 5 64 | 65 | # randomly drops certain frequency bands from the audio signal 66 | augment_drop_freqs: 67 | _target_: src.data.preprocess.augment.FrequencyDropoutAugment 68 | sample_rate: 16000 69 | min_drop_count: 0 70 | max_drop_count: 5 71 | band_scaling: 1 72 | 73 | # randomly slows down or speeds up the audio 74 | augment_change_speed: 75 | _target_: src.data.preprocess.augment.ChoiceSpeedAugment 76 | sample_rate: 16000 77 | possible_speed_factors: [0.95, 1, 1.05] 78 | -------------------------------------------------------------------------------- 
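The x-vector pipelines above all end with the same two steps: a FilterBank preprocessor that turns the waveform into mel filterbank features, and an InputNormalizer2D with normalize_over_channels: true. Below is a rough torchaudio-based stand-in for those two steps, assuming the filterbank is a log-mel spectrogram and that channel normalisation means zero mean and unit variance per mel channel over time (both are assumptions about the repository's implementation); n_mels=40 follows the value set in xvector_pipeline.yaml.

import torch
import torchaudio

# assumed stand-in for src.data.preprocess.audio_features.FilterBank
mel = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=40)

def filterbank_and_normalize(waveform: torch.Tensor) -> torch.Tensor:
    # waveform: [1, num_samples] -> log-mel features: [n_mels, num_frames]
    features = mel(waveform).squeeze(0).clamp(min=1e-10).log()
    # normalize_over_channels=true: per-channel statistics over the time axis
    mean = features.mean(dim=1, keepdim=True)
    std = features.std(dim=1, keepdim=True)
    return (features - mean) / (std + 1e-8)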
/config/data/pipeline/xvector_pipeline.yaml: -------------------------------------------------------------------------------- 1 | train_pipeline: 2 | - selector_train 3 | - filterbank 4 | - normalizer 5 | 6 | val_pipeline: 7 | - selector_val 8 | - filterbank 9 | - normalizer 10 | 11 | test_pipeline: 12 | # assume batch size of 1 due to no selector (and therefore tensors have 13 | # different dimensions and cannot be collated without padding 14 | - filterbank 15 | - normalizer 16 | 17 | selector_train: 18 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 19 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 20 | selection_strategy: random 21 | desired_chunk_length_sec: 3 22 | 23 | selector_val: 24 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 25 | # one of 'start', 'end', 'random', 'random_contiguous' 26 | selection_strategy: start 27 | desired_chunk_length_sec: 3 28 | 29 | filterbank: 30 | _target_: src.data.preprocess.audio_features.FilterBank 31 | n_mels: 40 32 | 33 | normalizer: 34 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 35 | normalize_over_channels: true -------------------------------------------------------------------------------- /config/data/pipeline/xvector_rirs_augment.yaml: -------------------------------------------------------------------------------- 1 | # preprocessors to apply to training data 2 | train_pipeline: 3 | - selector_contiguous 4 | - augmenter 5 | - filterbank 6 | - normalizer 7 | 8 | # preprocessors to apply to validation data 9 | val_pipeline: 10 | - selector_start 11 | - filterbank 12 | - normalizer 13 | 14 | # preprocessors to apply to test data 15 | test_pipeline: 16 | # assume batch size of 1 due to no selector (and therefore tensors have 17 | # different dimensions and cannot be collated without padding 18 | - filterbank 19 | - normalizer 20 | 21 | # define all the augmentations to add to the `augmenter` 22 | augmentations: 23 | - augment_add_rirs 24 | 25 | # selects a random audio chunk 26 | selector_contiguous: 27 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 28 | # one of 'start', 'end', 'random', 'random_contiguous', 'contiguous' 29 | selection_strategy: contiguous 30 | desired_chunk_length_sec: 3 31 | 32 | # selects the first x seconds of audio 33 | selector_start: 34 | _target_: src.data.preprocess.random_chunks.AudioChunkSelector 35 | # one of 'start', 'end', 'random', 'random_contiguous' 36 | selection_strategy: start 37 | desired_chunk_length_sec: 3 38 | 39 | # converts wav to mel filterbanks 40 | filterbank: 41 | _target_: src.data.preprocess.audio_features.FilterBank 42 | 43 | # normalizes filterbanks to 0 mean and unit variance 44 | normalizer: 45 | _target_: src.data.preprocess.input_normalisation.InputNormalizer2D 46 | normalize_over_channels: true 47 | 48 | # augmentation preprocessors to use 49 | augmenter: 50 | _target_: src.data.preprocess.augment.Augmenter 51 | yield_intermediate_augmentations: True 52 | yield_unaugmented: True 53 | stack_augmentations: False 54 | 55 | # randomly drop `x` seconds of audio 56 | augment_add_rirs: 57 | _target_: src.data.preprocess.augment.ChoiceRirsNoiseAugment 58 | sample_rate: 16000 59 | snr_choices: [5] 60 | shards_folder: ${data_folder}/rirs_shards 61 | -------------------------------------------------------------------------------- /config/data/shards/shards_librispeech.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | 
_target_: src.data.common.WebDataSetShardConfig 3 | 4 | # amount of training samples stored per shard 5 | samples_per_shard: 155000 6 | 7 | # whether to compress the shards 8 | use_gzip_compression: false 9 | 10 | # whether to use shards in random order 11 | shuffle_shards: True 12 | 13 | # queue from which samples are extracted 14 | # in order to create batches with higher variance 15 | queue_size: 200 -------------------------------------------------------------------------------- /config/data/shards/shards_voxceleb.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the config object 2 | _target_: src.data.common.WebDataSetShardConfig 3 | 4 | # amount of training samples stored per shard 5 | samples_per_shard: 5000 6 | 7 | # whether to compress the shards 8 | use_gzip_compression: true 9 | 10 | # whether to use shards in random order 11 | shuffle_shards: True 12 | 13 | # queue from which samples are extracted 14 | # in order to create batches with higher variance 15 | queue_size: 1024 -------------------------------------------------------------------------------- /config/evaluator/cosine_distance.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.speaker.cosine_distance.CosineDistanceEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: False 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: False 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_num_training_samples: 0 -------------------------------------------------------------------------------- /config/evaluator/cosine_distance_with_train_data.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.speaker.cosine_distance.CosineDistanceEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: True 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: True 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_num_training_samples: 1000 -------------------------------------------------------------------------------- /config/evaluator/lda.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.lda.LDAEvaluator 2 | 3 | # whether to center embeddings before calculating cosine score 4 | center_before_scoring: True 5 | 6 | # whether to length-normalize embeddings before calculating cosine score 7 | length_norm_before_scoring: True 8 | 9 | # maximum number of samples to use to fit mean/std parameters 10 | # when `use_centering` is True 11 | max_training_batches_to_fit: 30000 # exhaust a single training epoch 12 | 13 | # number of PCA components 14 | num_pca_components: 150 15 | 16 | # center the training batches before training LDA model 17 | center_before_fit_training_batches: true -------------------------------------------------------------------------------- /config/evaluator/plda.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.evaluation.plda.PLDAEvaluator 2 | 3 | # number of PCA components 4 | num_lda_pca_components: 50 5 | 6 | # number of PCA components 7 | num_plda_pca_components: 50 8 | 9 | # 
number of iterations to train PLDA for 10 | max_iterations: 1 11 | 12 | # maximum number of samples to use to fit mean/std parameters 13 | # when `use_centering` is True 14 | max_training_batches_to_fit: 300 # exhaust a single training epoch 15 | -------------------------------------------------------------------------------- /config/experiment/speaker_dummy.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1 5 | - override /data/pipeline: xvector_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: dummy 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: cross_entropy 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 16 19 | 20 | project_name: dummy-network -------------------------------------------------------------------------------- /config/experiment/speaker_ecapa_tdnn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: xvector_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: ecapa_tdnn 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: aam_softmax 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 32 19 | 20 | data: 21 | pipeline: 22 | filterbank: 23 | n_mels: 40 24 | 25 | optim: 26 | loss: 27 | input_features: 192 28 | output_features: 5994 # only on voxceleb2 dev 29 | 30 | 31 | project_name: ecapa-tdnn -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_aam.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: aam_softmax 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-sv-aam -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_ce.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: cross_entropy 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-sv-ce -------------------------------------------------------------------------------- 
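The experiment files above are Hydra config groups: each one overrides parts of the base train_eval.yaml composition and is typically selected on the command line with an override such as `+experiment=speaker_wav2vec2_ce` (the same form that appears in the .hydra/overrides.yaml files further below). As a minimal, illustrative sketch of how such a composition can be inspected programmatically, the Python snippet below uses Hydra's compose API. The `config_path` and the exact API location depend on where the script lives and on the installed Hydra version, and the repository's real entry point (src.main in the run logs) additionally registers custom resolvers such as `random_uuid` and reads the `.env` variables, so treat this purely as a sketch.

# Illustrative only: compose the training config with an experiment override and inspect it.
# `config_path` is assumed to point at the `config` directory shown in this repository.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(config_path="config"):
    cfg = compose(
        config_name="train_eval",
        overrides=["+experiment=speaker_wav2vec2_ce", "optim.algo.lr=5e-5"],
    )
    print(cfg.trainer.max_steps)  # 100000, set by the experiment file
    print(cfg.optim.algo.lr)      # 5e-05, overridden on the "command line"
    # instantiating a config node requires the repository's `src` package to be importable
    loss_fn = instantiate(cfg.optim.loss)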
/config/experiment/speaker_wav2vec2_ctc.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: ctc 12 | - override /trainer: trainer 13 | 14 | network: 15 | stat_pooling_type: none 16 | test_stat_pooling_type: mean+std 17 | 18 | trainer: 19 | max_steps: 100_000 20 | val_check_interval: 5000 21 | precision: 16 22 | 23 | project_name: wav2vec2-sv-ctc -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_pairs.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2_pairs 5 | - override /data/pipeline: wav2vec_pair_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_paired 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: binary_cross_entropy 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | data: 20 | dataloader: 21 | train_batch_size: 32 22 | 23 | project_name: wav2vec2-paired -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_triplet.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1_and_2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: triplet 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-triplet 20 | 21 | data: 22 | module: 23 | enforce_triplets: true -------------------------------------------------------------------------------- /config/experiment/speaker_wav2vec2_triplet_ce.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb1_and_2 5 | - override /data/pipeline: wav2vec_base_pipeline 6 | - override /data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /network: wav2vec2_fc 9 | - override /optim/algo: adam 10 | - override /optim/schedule: one_cycle 11 | - override /optim/loss: triplet_ce 12 | - override /trainer: trainer 13 | 14 | trainer: 15 | max_steps: 100_000 16 | val_check_interval: 5000 17 | precision: 16 18 | 19 | project_name: wav2vec2-triplet-ce 20 | 21 | data: 22 | module: 23 | enforce_triplets: true -------------------------------------------------------------------------------- /config/experiment/speaker_xvector.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /data/module: voxceleb2 5 | - override /data/pipeline: xvector_pipeline 6 | - override 
/data/dataloader: speaker 7 | - override /data/shards: shards_voxceleb 8 | - override /evaluator: cosine_distance 9 | - override /network: xvector 10 | - override /optim/algo: adam 11 | - override /optim/schedule: one_cycle 12 | - override /optim/loss: cross_entropy 13 | - override /trainer: trainer 14 | 15 | trainer: 16 | max_steps: 100_000 17 | val_check_interval: 5000 18 | precision: 32 19 | 20 | data: 21 | pipeline: 22 | filterbank: 23 | n_mels: 40 24 | 25 | project_name: xvector-sv-ce -------------------------------------------------------------------------------- /config/experiment/speech_wav2vec2_ctc.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /callbacks: default_speech 5 | - override /data/module: librispeech 6 | - override /data/pipeline: wav2vec_full_seq_pipeline 7 | - override /data/dataloader: speech 8 | - override /data/shards: shards_librispeech 9 | - override /network: wav2vec2_fc_letter 10 | - override /tokenizer: default 11 | - override /optim/algo: adam 12 | - override /optim/schedule: one_cycle 13 | - override /optim/loss: ctc 14 | - override /trainer: trainer 15 | 16 | trainer: 17 | max_steps: 100_000 18 | precision: 16 19 | 20 | project_name: wav2vec2-librispeech 21 | 22 | optim: 23 | algo: 24 | lr: 1e-4 -------------------------------------------------------------------------------- /config/hydra/launcher/slurm.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.launcher 2 | 3 | _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher 4 | 5 | submitit_folder: ${hydra.sweep.dir}/.submitit/%j 6 | timeout_min: 4320 7 | cpus_per_task: 6 8 | gpus_per_node: ${gpus} 9 | tasks_per_node: 1 10 | mem_gb: 20 11 | nodes: 1 12 | name: ${hydra.job.name} 13 | partition: das 14 | comment: null 15 | constraint: null 16 | exclude: null 17 | signal_delay_s: 120 18 | max_num_timeout: 0 19 | additional_parameters: { "mail-user": "nvaessen", "mail-type": "BEGIN,END,FAIL" } 20 | array_parallelism: 4 21 | -------------------------------------------------------------------------------- /config/network/dummy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.lightning_modules.speaker.dummy.DummyModuleConfig 2 | -------------------------------------------------------------------------------- /config/network/ecapa_tdnn.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.ecapa_tdnn.EcapaTDNNModuleConfig 3 | 4 | input_mel_coefficients: ${data.pipeline.filterbank.n_mels} 5 | lin_neurons: 192 6 | 7 | channels: 8 | - 1024 9 | - 1024 10 | - 1024 11 | - 1024 12 | - 3072 13 | 14 | kernel_sizes: 15 | - 5 16 | - 3 17 | - 3 18 | - 3 19 | - 1 20 | dilations: 21 | - 1 22 | - 2 23 | - 3 24 | - 4 25 | - 1 26 | 27 | attention_channels: 128 28 | res2net_scale: 8 29 | se_channels: 128 30 | global_context: True 31 | 32 | pretrained_weights_path: null 33 | 34 | # optional explicit overwrite of embedding size and/or num speakers 35 | # (e.g if you need to load finetuned weights but want to experiment with another 36 | # pooling type in the evaluation or test on a dataset with different num speakers) 37 | explicit_stat_pool_embedding_size: null 38 | explicit_num_speakers: null 
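Note that several values in the network configs, such as `input_mel_coefficients: ${data.pipeline.filterbank.n_mels}` in the ECAPA-TDNN config above, are OmegaConf interpolations rather than literals, so changing the filterbank settings in the data pipeline automatically propagates into the network config. A small self-contained sketch of that mechanism (made-up literal values, not the repository's code):

# Sketch of OmegaConf interpolation as used by input_mel_coefficients above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "data": {"pipeline": {"filterbank": {"n_mels": 40}}},
        "network": {"input_mel_coefficients": "${data.pipeline.filterbank.n_mels}"},
    }
)
assert cfg.network.input_mel_coefficients == 40  # resolved lazily at access time
cfg.data.pipeline.filterbank.n_mels = 80
assert cfg.network.input_mel_coefficients == 80  # follows the referenced value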
-------------------------------------------------------------------------------- /config/network/wav2spk.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2spk.Wav2SpkModuleConfig 3 | 4 | # whether to use temporal gating after the feature encoder 5 | apply_temporal_gating: true 6 | 7 | # structure of fc head (excluding the last layer, which always has NUM_SPEAKERS 8 | # output nodes 9 | hidden_fc_layers_out: 10 | - 512 11 | - 128 12 | 13 | # Which FC hidden layer to use as speaker embedding for EER evaluation 14 | # should be a valid index from the list `hidden_fc_layers_out`, 15 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 16 | # or -1 when you want to use the stat-pooled wav2vec embeddings 17 | embedding_layer_idx: 0 18 | 19 | # which type of statistical pooling to use ('mean' or 'mean+std') 20 | stat_pooling_type: mean+std -------------------------------------------------------------------------------- /config/network/wav2vec2_fc.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec2_fc.Wav2vec2FCModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # settings related to wav2vec2 architecture 12 | wav2vec_feature_encoder_only: false 13 | 14 | # whether to freeze the feature encoder part 15 | # of the network for the whole training run 16 | completely_freeze_feature_extractor: true 17 | 18 | # initially freeze wav2vec model 19 | wav2vec_initially_frozen: false 20 | 21 | # number of steps before the wav2vec model is unfrozen 22 | # (if initially frozen at all) 23 | # if set to null, wav2vec will never be unfrozen 24 | num_frozen_steps: 10000 25 | 26 | # structure of fc head (excluding the last layer, which is always NUM_SPEAKERS soft max 27 | # classification) 28 | hidden_fc_layers_out: 29 | [] # empty list means we have only 1 fc layer with NUM_SPEAKER (softmax) embeddings 30 | # - 1024 31 | # - 512 32 | 33 | # Which hidden layer to use as speaker embedding for EER evaluation 34 | # should be a valid index from the list `hidden_fc_layers_out`, 35 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 36 | # or -1 when you want to use the stat-pooled wav2vec embeddings 37 | embedding_layer_idx: -1 38 | 39 | # which type of statistical pooling to use ('mean', 'mean+std' or 'attentive') 40 | stat_pooling_type: mean+std 41 | test_stat_pooling_type: ${network.stat_pooling_type} 42 | 43 | # probability of regularization techniques during training 44 | # dropout 45 | activation_dropout: 0.0 # in feed-forward module of transformer layer 46 | attention_dropout: 0.1 # in attention module of transformer layer 47 | feat_proj_dropout: 0.1 # in feature projection module 48 | hidden_dropout: 0.1 # between residual connections in transformer layer 49 | 50 | # layer skip in transformer 51 | layerdrop: 0.05 52 | 53 | # specaugment 54 | # feature 55 | mask_feature_length: 10 56 | mask_feature_prob: 0.0 57 | 58 | # time 59 | mask_time_length: 10 60 | mask_time_prob: 0.05 61 | 62 | # augment on FINAL TOKENS 63 | final_channel_mask_prob: 0 64 | 
final_channel_mask_width: 5 65 | 66 | # optional explicit overwrite of embedding size and/or num speakers 67 | # (e.g if you need to load finetuned weights but want to experiment with another 68 | # pooling type in the evaluation or test on a dataset with different num speakers) 69 | explicit_stat_pool_embedding_size: null 70 | explicit_num_speakers: null 71 | 72 | use_transformers_as_ensembles: False 73 | num_ensembles: 1 -------------------------------------------------------------------------------- /config/network/wav2vec2_fc_letter.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speech.wav2vec2_fc_letter.Wav2vec2FcLetterRecognizerConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # initially freeze wav2vec model 12 | wav2vec_initially_frozen: false 13 | 14 | # whether to freeze the feature encoder part 15 | # of the network for the whole training run 16 | completely_freeze_feature_extractor: true 17 | 18 | # number of steps before the wav2vec model is unfrozen 19 | # (if initially frozen at all) 20 | # if set to null, wav2vec will never be unfrozen 21 | num_frozen_steps: 10000 22 | 23 | # mask (dropout of embedding tensor) settings 24 | timestep_mask_prob: 0 25 | timestep_mask_width: 10 26 | channel_mask_prob: 0 27 | channel_mask_width: 64 28 | -------------------------------------------------------------------------------- /config/network/wav2vec2_paired.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec2_paired_input.Wav2vec2PairedSpeakerModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_hunggingface_id: "facebook/wav2vec2-base" 6 | 7 | # whether to use reset the pretrained weights 8 | # and start from a fresh initialization 9 | reset_weights: false 10 | 11 | # initially freeze wav2vec model 12 | wav2vec_initially_frozen: false 13 | 14 | # number of steps before the wav2vec model is unfrozen 15 | # (if initially frozen at all) 16 | # if set to null, wav2vec will never be unfrozen 17 | num_frozen_steps: 10000 18 | 19 | # whether to freeze the feature encoder part 20 | # of the network for the whole training run 21 | completely_freeze_feature_extractor: true 22 | 23 | # whether to freeze the feature projection part 24 | # of the network for the whole training run 25 | completely_freeze_feature_projector: false 26 | 27 | # probability of regularization techniques during training 28 | # dropout 29 | activation_dropout: 0.0 # in feed-forward module of transformer layer 30 | attention_dropout: 0.1 # in attention module of transformer layer 31 | feat_proj_dropout: 0.1 # in feature projection module 32 | hidden_dropout: 0.1 # between residual connections in transformer layer 33 | 34 | # layer skip in transformer 35 | layerdrop: 0.05 36 | 37 | # specaugment 38 | # feature 39 | mask_feature_length: 10 40 | mask_feature_prob: 0.0 41 | 42 | # time 43 | mask_time_length: 10 44 | mask_time_prob: 0.05 45 | 46 | # augment on FINAL TOKENS 47 | final_channel_mask_prob: 0 48 | final_channel_mask_width: 5 -------------------------------------------------------------------------------- 
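The wav2vec2-based network configs above expose `stat_pooling_type` (e.g. `mean+std`), which collapses the per-frame wav2vec2 embeddings into a single utterance-level vector before the FC head; the run logs later in this document list the corresponding module as MeanStdStatPool1D. Below is a minimal, generic sketch of what mean+std pooling computes. It is not the repository's implementation (shape conventions and numerical details may differ), and the 768-dimensional frame size is simply the hidden size of the assumed facebook/wav2vec2-base checkpoint.

# Generic sketch of 'mean+std' statistical pooling over time (not the repository's code).
import torch


def mean_std_pool(frames: torch.Tensor) -> torch.Tensor:
    """Pool [batch, time, channels] frame embeddings into [batch, 2 * channels]."""
    mean = frames.mean(dim=1)
    std = frames.std(dim=1)
    return torch.cat([mean, std], dim=1)


pooled = mean_std_pool(torch.randn(8, 249, 768))
print(pooled.shape)  # torch.Size([8, 1536]); cf. input_features: 1536 in aam_softmax.yaml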
/config/network/wav2vec_fc.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec_fc.Wav2vecFCModuleConfig 3 | 4 | # pretrained weights of wav2vec model 5 | wav2vec_model_path: ${data_folder}/pretrained_models/wav2vec/wav2vec_large.pt 6 | 7 | # whether to use the aggregation layers in wav2vec model 8 | use_aggregation_layers: true 9 | 10 | # whether to use reset the pretrained weights 11 | # and start from a fresh initialization 12 | reset_weights: false 13 | 14 | # initially freeze wav2vec model 15 | wav2vec_initially_frozen: true 16 | 17 | # number of steps before the wav2vec model is unfrozen 18 | # (if initially frozen at all) 19 | # if set to null, wav2vec will never be unfrozen 20 | num_frozen_steps: 10000 21 | 22 | # structure of fc head (excluding the last layer, which is always NUM_SPEAKERS soft max 23 | # classification) 24 | hidden_fc_layers_out: 25 | - 1024 26 | - 512 27 | # [] # empty list means we have only 1 fc layer with NUM_SPEAKER (softmax) embeddings 28 | 29 | # Which hidden layer to use as speaker embedding for EER evaluation 30 | # should be a valid index from the list `hidden_fc_layers_out`, 31 | # or (len(hidden_fc_layers_out) + 1) to use the softmax output as speaker embedding, 32 | # or -1 when you want to use the stat-pooled wav2vec embeddings 33 | embedding_layer_idx: 1 34 | 35 | # which type of statistical pooling to use ('mean' or 'mean+std') 36 | stat_pooling_type: mean+std -------------------------------------------------------------------------------- /config/network/wav2vec_xvector.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.wav2vec_xvector.Wav2vecXVectorModuleConfig 3 | 4 | tdnn_blocks: 5 5 | tdnn_channels: [512, 512, 512, 512, 1500] 6 | tdnn_kernel_sizes: [5, 3, 3, 1, 1] 7 | tdnn_dilations: [1, 2, 3, 1, 1] 8 | lin_neurons: 512 9 | in_channels: 512 # wav2vec has 512 features 10 | 11 | # pretrained weights of wav2vec model 12 | wav2vec_model_path: ${data_folder}/pretrained_models/wav2vec/wav2vec_large.pt 13 | 14 | # whether to use the aggregation layers in wav2vec model 15 | use_aggregation_layers: true 16 | 17 | # initially freeze wav2vec model 18 | wav2vec_initially_frozen: true 19 | 20 | # number of steps before the wav2vec model is unfrozen 21 | # (if initially frozen at all) 22 | num_frozen_steps: 10000 -------------------------------------------------------------------------------- /config/network/xvector.yaml: -------------------------------------------------------------------------------- 1 | # instantiate the x-vector network lightning module config object 2 | _target_: src.lightning_modules.speaker.xvector.XVectorModuleConfig 3 | 4 | tdnn_blocks: 5 5 | tdnn_channels: [512, 512, 512, 512, 1500] 6 | tdnn_kernel_sizes: [5, 3, 3, 1, 1] 7 | tdnn_dilations: [1, 2, 3, 1, 1] 8 | lin_neurons: 512 9 | in_channels: 40 # depends on values in data.pipeline 10 | 11 | # optional explicit overwrite of embedding size and/or num speakers 12 | # (e.g if you need to load finetuned weights but want to experiment with another 13 | # pooling type in the evaluation or test on a dataset with different num speakers) 14 | explicit_stat_pool_embedding_size: null 15 | explicit_num_speakers: null 
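The x-vector style configs above parameterize the TDNN trunk as parallel lists (`tdnn_channels`, `tdnn_kernel_sizes`, `tdnn_dilations`). As a rough illustration of how such lists can describe a stack of dilated 1D convolutions, here is a generic sketch; the repository's actual x-vector module (its activations, normalization, padding, and the statistics pooling on top) is not reproduced here.

# Rough sketch: turning the tdnn_* lists above into a stack of dilated Conv1d layers.
import torch
from torch import nn


def build_tdnn(in_channels, channels, kernel_sizes, dilations):
    layers = []
    for out_channels, kernel, dilation in zip(channels, kernel_sizes, dilations):
        layers += [
            nn.Conv1d(in_channels, out_channels, kernel, dilation=dilation),
            nn.ReLU(),
            nn.BatchNorm1d(out_channels),
        ]
        in_channels = out_channels
    return nn.Sequential(*layers)


# values taken from config/network/xvector.yaml; input is [batch, n_mels, frames]
tdnn = build_tdnn(40, [512, 512, 512, 512, 1500], [5, 3, 3, 1, 1], [1, 2, 3, 1, 1])
print(tdnn(torch.randn(4, 40, 300)).shape)  # e.g. torch.Size([4, 1500, 286])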
-------------------------------------------------------------------------------- /config/optim/algo/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | 3 | # learning rate 4 | lr: 1e-4 5 | 6 | # weight decay (l2 regression) 7 | weight_decay: 0 8 | 9 | # beta constants for running mean of gradient and square of gradient 10 | betas: [0.9, 0.999] 11 | 12 | # epsilon term for numerical stability 13 | eps: 1e-8 14 | 15 | # use AMSGRAD version of ADAM 16 | amsgrad: false 17 | -------------------------------------------------------------------------------- /config/optim/algo/sgd.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.SGD 2 | 3 | # learning rate 4 | lr: 3e-3 5 | 6 | # momentum: 7 | momentum: 0.9 8 | 9 | # weight decay (l2 regression) 10 | weight_decay: 0 11 | 12 | # momentum dampening 13 | dampening: 0 14 | 15 | # nesterov 16 | nesterov: True -------------------------------------------------------------------------------- /config/optim/loss/aam_softmax.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.aam_softmax.AngularAdditiveMarginSoftMaxLoss 2 | 3 | input_features: 1536 # for mean+std embeddings 4 | output_features: 5994 # only on voxceleb2 dev 5 | 6 | margin: 0.2 7 | scale: 30 8 | -------------------------------------------------------------------------------- /config/optim/loss/binary_cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.binary_cross_entropy.BinaryCrossEntropyLoss 2 | -------------------------------------------------------------------------------- /config/optim/loss/cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.cross_entropy.CrossEntropyLoss 2 | -------------------------------------------------------------------------------- /config/optim/loss/ctc.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.ctc_loss.CtcLoss -------------------------------------------------------------------------------- /config/optim/loss/triplet.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.triplet_loss.TripletLoss 2 | 3 | margin: 1 4 | -------------------------------------------------------------------------------- /config/optim/loss/triplet_ce.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.optim.loss.TripletCrossEntropyLoss 2 | 3 | # weighting which will be multiplied with cross-entropy loss 4 | c_ce: 1 5 | 6 | # weighting which will be multiplied with triplet loss 7 | c_triplet: 1 8 | -------------------------------------------------------------------------------- /config/optim/schedule/constant.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.StepLR 4 | 5 | # number of epochs between consecutive steps 6 | step_size: 1 7 | 8 | # factor by which to multiply the learning rate every `step_size` epochs 9 | gamma: 1 10 | 11 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 12 | last_epoch: -1 13 | 14 | # print to STDOUT when making a step 15 | verbose: false 16 | 17 | # optional value to track which is fed into the step() call 18 | # only relevant for learning rate schedulers such 19 | # as `reduce on plateau` 20 | monitor: null 21 | 22 | # whether to step every epoch or every step 23 | interval: epoch 24 | 25 | # amount of epochs/steps between consecutive step() calls 26 | frequency: null 27 | 28 | # name to log the learning rate as 29 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/cyclic.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.CyclicLR 4 | 5 | # the lowest lr in the cycle 6 | base_lr: 1e-4 7 | 8 | # the peak lr in the cycle 9 | max_lr: 0.02 10 | 11 | # number of steps to go from base_lr to max_lr 12 | step_size_up: 2500 13 | 14 | # number of steps to go from max+lr to base_lr 15 | step_size_down: 2500 16 | 17 | # Adam doesn't have `momentum` parameter, can only be true with SGD 18 | cycle_momentum: False 19 | 20 | # shape of line (triangular=linearly increasing/decreasing) 21 | mode: triangular 22 | 23 | # optional value to track which is fed into the step() call 24 | # only relevant for learning rate schedulers such 25 | # as `reduce on plateau` 26 | monitor: null 27 | 28 | # whether to step every epoch or every step 29 | interval: step 30 | 31 | # amount of epochs/steps between consecutive step() calls 32 | frequency: null 33 | 34 | # name to log the learning rate as 35 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/exp_decay.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.LambdaLR 4 | 5 | # A function which computes a multiplicative factor given an integer parameter 6 | lr_lambda: 7 | _target_: src.optim.schedule.tri_stage.TriStageLearningRateLambdaLRFunction 8 | max_steps: ${trainer.max_steps} 9 | warmup_stage_ratio: 0 10 | constant_stage_ratio: 0 11 | decay_stage_ratio: 1 12 | initial_lr: ${optim.algo.lr} 13 | base_lr: ${optim.algo.lr} 14 | final_lr: 5e-6 15 | 16 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 17 | last_epoch: -1 18 | 19 | # print to STDOUT when making a step 20 | verbose: false 21 | 22 | # optional value to track which is fed into the step() call 23 | # only relevant for learning rate schedulers such 24 | # as `reduce on plateau` 25 | monitor: null 26 | 27 | # whether to step every epoch or every step 28 | interval: step 29 | 30 | # amount of epochs/steps between consecutive step() calls 31 | frequency: null 32 | 33 | # name to log the learning rate as 34 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/one_cycle.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.OneCycleLR 4 | 5 | # maximum learning rate to reach in the cycle 6 | max_lr: ${optim.algo.lr} 7 | 8 | # the amount of steps in the training 9 | total_steps: ${trainer.max_steps} 10 | 11 | # the initial learning rate is max_lr / div_factor 12 | div_factor: 25 13 | 14 | # optional value to track which is fed into the step() call 15 | # only relevant for learning rate schedulers such 16 | # as `reduce on plateau` 17 | monitor: null 18 | 19 | # whether to step every epoch or every step 20 | interval: step 21 | 22 | # amount of epochs/steps between consecutive step() calls 23 | frequency: null 24 | 25 | # name to log the learning rate as 26 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/reduce_on_plateau.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 4 | 5 | # whether the monitored value is minimized or maximized 6 | mode: min 7 | 8 | # factor by which to reduce the lr when it has plateaued 9 | factor: 0.1 10 | 11 | # number of epochs with no improvement after which learning rate will be reduced. 12 | # Be careful with setting this value when also using early stopping 13 | patience: 3 14 | 15 | # Threshold for measuring the new optimum, to only focus on significant changes 16 | threshold: 1e-2 17 | 18 | # Number of epochs to wait before resuming normal operation after lr has been reduced 19 | cooldown: 0 20 | 21 | # A lower bound on the learning rate 22 | min_lr: 0 23 | 24 | # optional value to track which is fed into the step() call 25 | # only relevant for learning rate schedulers such 26 | # as `reduce on plateau` 27 | monitor: val_eer 28 | 29 | # whether to step every epoch or every step 30 | interval: epoch 31 | 32 | # amount of epochs/steps between consecutive step() calls 33 | frequency: null 34 | 35 | # name to log the learning rate as 36 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/schedule_wav2spk.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.MultiStepLR 4 | 5 | # list of steps at which to decrease LR (assuming batch size 32) 6 | milestones: 7 | - 300_000 8 | - 450_000 9 | - 600_000 10 | - 750_000 11 | 12 | # factor by which to multiply the learning rate every `step_size` epochs 13 | gamma: 0.1 14 | 15 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 16 | last_epoch: -1 17 | 18 | # print to STDOUT when making a step 19 | verbose: false 20 | 21 | # optional value to track which is fed into the step() call 22 | # only relevant for learning rate schedulers such 23 | # as `reduce on plateau` 24 | monitor: null 25 | 26 | # whether to step every epoch or every step 27 | interval: step 28 | 29 | # amount of epochs/steps between consecutive step() calls 30 | frequency: null 31 | 32 | # name to log the learning rate as 33 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/schedule_wav2vec_fan_etal.yaml: -------------------------------------------------------------------------------- 1 | # schedule to train wav2vec + fc layer as described in 2 | # EXPLORING WAV2VEC 2.0 ON SPEAKER VERIFICATION AND LANGUAGE IDENTIFICATION 3 | # https://arxiv.org/abs/2012.06185 4 | 5 | # the scheduler object to use 6 | scheduler: 7 | _target_: torch.optim.lr_scheduler.CyclicLR 8 | 9 | # the lowest lr in the cycle 10 | base_lr: 1e-5 11 | 12 | # the peak lr in the cycle 13 | max_lr: 0.005 14 | 15 | # number of steps to go from base_lr to max_lr 16 | step_size_up: 6000 17 | 18 | # number of steps to go from max+lr to base_lr 19 | step_size_down: 7000 20 | 21 | # Adam doesn't have `momentum` parameter, can only be true with SGD 22 | cycle_momentum: False 23 | 24 | # shape of line (triangular=linearly increasing/decreasing) 25 | mode: triangular 26 | 27 | # optional value to track which is fed into the step() call 28 | # only relevant for learning rate schedulers such 29 | # as `reduce on plateau` 30 | monitor: null 31 | 32 | # whether to step every epoch or every step 33 | interval: step 34 | 35 | # amount of epochs/steps between consecutive step() calls 36 | frequency: null 37 | 38 | # name to log the learning rate as 39 | name: null -------------------------------------------------------------------------------- /config/optim/schedule/tri_stage.yaml: -------------------------------------------------------------------------------- 1 | # the scheduler object to use 2 | scheduler: 3 | _target_: torch.optim.lr_scheduler.LambdaLR 4 | 5 | # A function which computes a multiplicative factor given an integer parameter 6 | lr_lambda: 7 | _target_: src.optim.schedule.tri_stage.TriStageLearningRateLambdaLRFunction 8 | max_steps: ${trainer.max_steps} 9 | warmup_stage_ratio: 0.1 10 | constant_stage_ratio: 0.4 11 | decay_stage_ratio: 0.5 12 | initial_lr: 5e-6 13 | base_lr: ${optim.algo.lr} 14 | final_lr: 5e-6 15 | 16 | # epoch number after which to not do any steps any more. 
'-1' implies never stop 17 | last_epoch: -1 18 | 19 | # print to STDOUT when making a step 20 | verbose: false 21 | 22 | # optional value to track which is fed into the step() call 23 | # only relevant for learning rate schedulers such 24 | # as `reduce on plateau` 25 | monitor: null 26 | 27 | # whether to step every epoch or every step 28 | interval: step 29 | 30 | # amount of epochs/steps between consecutive step() calls 31 | frequency: null 32 | 33 | # name to log the learning rate as 34 | name: null -------------------------------------------------------------------------------- /config/predict.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - data/module: voxceleb1 3 | - data/pipeline: xvector_pipeline 4 | - data/shards: shards_voxceleb 5 | - data/dataloader: speaker 6 | - evaluator: cosine_distance 7 | - network: xvector 8 | - optim/loss: cross_entropy 9 | - tokenizer: default 10 | - trainer: trainer 11 | 12 | # root directory with subfolders containing the canonical dataset(s) 13 | data_folder: ${oc.env:DATA_FOLDER} 14 | 15 | # directory for temporary storage 16 | temp_folder: ${oc.env:TEMP_FOLDER} 17 | 18 | # directory for slurm and hydra logs 19 | log_folder: ${oc.env:LOG_FOLDER} 20 | 21 | # random seed used by the experiment 22 | seed: 42133724 23 | 24 | # verify model (run e.g summary and BatchGradientVerification) 25 | verify_model: False 26 | 27 | # whether to load the weight of the networks from a checkpoint 28 | load_network_from_checkpoint: null 29 | 30 | # number of gpus to use 31 | gpus: ${oc.decode:${oc.env:NUM_GPUS}} 32 | 33 | # experiment name 34 | experiment_name: predict_pairs 35 | 36 | # path to folder which contains all files which need to be predicted 37 | predict_folder_path: ${data_folder}/voxsrc2021_val/wav 38 | 39 | # path to text file containing pairs which need to be evaluated 40 | pair_prediction_path: ${data_folder}/voxsrc2021_val/voxsrc2021_val.txt 41 | 42 | # config variables for hydra 43 | hydra: 44 | run: 45 | # set root output directory 46 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 47 | sweep: 48 | # set root output directory 49 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 50 | subdir: ${experiment_name} -------------------------------------------------------------------------------- /config/profiler/advanced.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /trainer: null # override trainer to null so it's not loaded from main config defaults... 
5 | 6 | profiler: 7 | _target_: pytorch_lightning.profiler.AdvancedProfiler 8 | output_filename: advanced_profile.txt 9 | 10 | trainer: 11 | _target_: pytorch_lightning.Trainer 12 | 13 | # set `1` to train on GPU, `0` to train on CPU only 14 | gpus: ${gpus} 15 | 16 | # minimum number of epochs to train for 17 | min_epochs: 1 18 | 19 | # maximum number of epochs to train for 20 | max_epochs: 1 21 | 22 | # do not output a progress bar if rate = 0 23 | progress_bar_refresh_rate: 1 24 | 25 | # potentially limit the number of train batches - set to low value for debugging 26 | limit_train_batches: 200 27 | 28 | # amount of sanity validation steps to take before training starts 29 | num_sanity_val_steps: 0 30 | 31 | callbacks: 32 | to_add: 33 | - gpu_monitor -------------------------------------------------------------------------------- /config/profiler/simple.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | profiler: 4 | _target_: pytorch_lightning.profiler.SimpleProfiler 5 | output_filename: simple_profile.txt 6 | 7 | trainer: 8 | _target_: pytorch_lightning.Trainer 9 | 10 | # set `1` to train on GPU, `0` to train on CPU only 11 | gpus: ${gpus} 12 | 13 | # minimum number of epochs to train for 14 | min_epochs: 1 15 | 16 | # maximum number of epochs to train for 17 | max_epochs: 1 18 | 19 | # do not output a progress bar if rate = 0 20 | progress_bar_refresh_rate: 1 21 | 22 | # amount of sanity validation steps to take before training starts 23 | num_sanity_val_steps: 0 24 | 25 | callbacks: 26 | to_add: 27 | - gpu_monitor -------------------------------------------------------------------------------- /config/search/lr_and_aam_loss.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | - override /optim/loss: aam_softmax 8 | 9 | hydra: 10 | sweeper: 11 | sampler: 12 | seed: 123 13 | n_startup_trials: 48 14 | multivariate: true 15 | 16 | direction: minimize 17 | study_name: lr_and_schedule_search 18 | storage: null 19 | n_trials: 128 20 | n_jobs: 8 21 | 22 | search_space: 23 | optim.loss.margin: 24 | type: float 25 | low: 0 26 | high: 10 27 | optim.loss.scale: 28 | type: int 29 | low: 1 30 | high: 50 31 | optim.algo.lr: 32 | type: float 33 | low: 1e-8 34 | high: 1 35 | log: true 36 | optim.algo.weight_decay: 37 | type: categorical 38 | choices: 39 | - 0 40 | - 1e-12 41 | - 1e-11 42 | - 1e-10 43 | - 1e-9 44 | - 1e-8 45 | - 1e-7 46 | - 1e-6 47 | - 1e-5 48 | - 1e-4 49 | - 1e-3 50 | - 1e-2 51 | - 1e-1 52 | -------------------------------------------------------------------------------- /config/search/lr_and_pooling.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | 8 | hydra: 9 | sweeper: 10 | sampler: 11 | seed: 123 12 | n_startup_trials: 48 13 | multivariate: true 14 | 15 | direction: minimize 16 | study_name: lr_and_schedule_search 17 | storage: null 18 | n_trials: 128 19 | n_jobs: ${hydra.launcher.array_parallelism} 20 | 21 | search_space: 22 | network.stat_pooling_type: 23 | type: categorical 24 | choices: 25 | - mean 26 | - mean+std 27 | - attentive 28 | - max 29 | - quantile 30 | - first 31 | optim.algo.lr: 
32 | type: float 33 | low: 1e-8 34 | high: 1 35 | log: true 36 | optim.algo.weight_decay: 37 | type: categorical 38 | choices: 39 | - 0 40 | - 1e-12 41 | - 1e-11 42 | - 1e-10 43 | - 1e-9 44 | - 1e-8 45 | - 1e-7 46 | - 1e-6 47 | - 1e-5 48 | - 1e-4 49 | - 1e-3 50 | - 1e-2 51 | - 1e-1 52 | -------------------------------------------------------------------------------- /config/search/lr_and_schedule_search.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /hydra/sweeper: optuna 5 | - override /hydra/sweeper/sampler: tpe 6 | - override /callbacks: speaker_early_stopping 7 | 8 | hydra: 9 | sweeper: 10 | sampler: 11 | seed: 123 12 | n_startup_trials: 48 13 | multivariate: true 14 | 15 | direction: minimize 16 | study_name: lr_and_schedule_search 17 | storage: null 18 | n_trials: 128 19 | n_jobs: ${hydra.launcher.array_parallelism} 20 | 21 | search_space: 22 | optim/schedule: 23 | type: categorical 24 | choices: 25 | - tri_stage 26 | - one_cycle 27 | optim/algo: 28 | type: categorical 29 | choices: 30 | - sgd 31 | - adam 32 | optim.algo.lr: 33 | type: float 34 | low: 1e-8 35 | high: 1 36 | log: true 37 | optim.algo.weight_decay: 38 | type: categorical 39 | choices: 40 | - 0 41 | - 1e-12 42 | - 1e-11 43 | - 1e-10 44 | - 1e-9 45 | - 1e-8 46 | - 1e-7 47 | - 1e-6 48 | - 1e-5 49 | - 1e-4 50 | - 1e-3 51 | - 1e-2 52 | - 1e-1 53 | -------------------------------------------------------------------------------- /config/tokenizer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.tokenizer.tokenizer_wav2vec2.Wav2vec2TokenizerConfig 2 | 3 | tokenizer_huggingface_id: "facebook/wav2vec2-base-960h" 4 | -------------------------------------------------------------------------------- /config/train_eval.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - callbacks: speaker_default 4 | - data/module: voxceleb1 5 | - data/pipeline: xvector_pipeline 6 | - data/shards: shards_voxceleb 7 | - data/dataloader: speaker 8 | - evaluator: cosine_distance 9 | - network: xvector 10 | - tokenizer: default 11 | - optim/algo: adam 12 | - optim/schedule: constant 13 | - optim/loss: cross_entropy 14 | - trainer: trainer 15 | 16 | # setting a profiler changes the trainer to 1 epoch 17 | # in order to debug performance 18 | - profiler: null 19 | 20 | # root directory with subfolders containing the canonical dataset(s) 21 | data_folder: ${oc.env:DATA_FOLDER} 22 | 23 | # directory for temporary storage 24 | temp_folder: ${oc.env:TEMP_FOLDER} 25 | 26 | # directory for slurm and hydra logs 27 | log_folder: ${oc.env:LOG_FOLDER} 28 | 29 | # random seed used by the experiment 30 | seed: 42133724 31 | 32 | # whether to tune model 33 | tune_model: False 34 | tune_iterations: 1000 35 | 36 | # verify model (run e.g summary and BatchGradientVerification) 37 | verify_model: false 38 | 39 | # whether to fit model 40 | fit_model: True 41 | 42 | # whether to evaluate model 43 | eval_model: True 44 | 45 | # whether to load the weight of the networks from a checkpoint 46 | load_network_from_checkpoint: null 47 | 48 | # whether to log to comet-ml 49 | use_cometml: ${oc.decode:${oc.env:USE_COMET_ML}} 50 | 51 | # number of gpus to use 52 | gpus: ${oc.decode:${oc.env:NUM_GPUS}} 53 | 54 | # project name (useful for giving a name to log directories) 55 | project_name: general 56 | 57 | # experiment name 58 | # (:) indicates it needs to 
be resolved 59 | experiment_name: ${random_uuid:} 60 | 61 | # tag to add to the experiment dashboard for easy filtering 62 | # of certain experiment 63 | tag: ${now:%Y-%m-%d} 64 | 65 | # config variables for hydra 66 | hydra: 67 | run: 68 | # set root output directory 69 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 70 | sweep: 71 | # set root output directory 72 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 73 | subdir: ${experiment_name} -------------------------------------------------------------------------------- /config/trainer/debug_trainer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | trainer: 4 | _target_: pytorch_lightning.Trainer 5 | 6 | # set `1` to train on (1) GPU, `0` to train on CPU only 7 | gpus: ${gpus} 8 | 9 | # useful for debugging 10 | limit_train_batches: 10 11 | limit_val_batches: 0 12 | limit_test_batches: 0 13 | fast_dev_run: False 14 | 15 | max_epochs: 1000 16 | 17 | callbacks: 18 | to_add: 19 | - gpu_monitor 20 | 21 | data: 22 | module: 23 | limit_samples: 320 24 | shards: 25 | initial_fill_buffer_percentage: 0 26 | shard_shuffle_queue_size: 0 27 | pre_batch_shuffle_queue_size: 0 28 | dataloader: 29 | num_workers: 0 30 | train_batch_size: 32 -------------------------------------------------------------------------------- /config/trainer/trainer.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | # set `1` to train on GPU, `0` to train on CPU only 4 | gpus: ${gpus} 5 | 6 | # accelerator: 7 | # - null is 1 gpu training 8 | # - `ddp` is multi-gpu training 9 | accelerator: null 10 | 11 | # how many machines to use for multi-gpu training 12 | num_nodes: 1 13 | 14 | # minimum number of epochs to train for 15 | min_epochs: null 16 | 17 | # maximum number of epochs to train for 18 | max_epochs: null 19 | 20 | # minimum number of steps to train for 21 | min_steps: null 22 | 23 | # maximum number of steps to train for 24 | max_steps: 20000 25 | 26 | # due to training dataset having no length we need 27 | # to manually set the validation epoch interval 28 | val_check_interval: 1000 29 | 30 | # accumulating batches artificially increases 31 | # the batch size by doing multiple 32 | # forward steps before a single backward step 33 | accumulate_grad_batches: 1 # 1300 // 32 34 | 35 | # do not output a progress bar if rate = 0 36 | progress_bar_refresh_rate: 500 37 | 38 | # deterministic CUDA operations - true lead to ~20x decrease in speed :( 39 | deterministic: False 40 | 41 | # potentially limit the number of train batches - set to low value for debugging 42 | limit_train_batches: 1.0 43 | 44 | # potentially limit the number of val batches - set to low value for debugging 45 | limit_val_batches: 1.0 46 | 47 | # potentially limit the number of test batches - set to low value for debugging 48 | limit_test_batches: 1.0 49 | 50 | # fast dev run 51 | # set all three `limit_*_batches to `n` so only `n` batches are used. n=1 if 'true' 52 | fast_dev_run: false 53 | 54 | # either train with 16 (half), 32 (single) or 64 (double) bit precision 55 | precision: 32 56 | 57 | # amount of sanity validation steps to take before training starts 58 | num_sanity_val_steps: 2 59 | 60 | # whether to try auto learning rate finding (this does not actually train the 61 | # model, set tune_model:true, fit_model:false, eval_model:false in `main.yaml`. 
62 | # set this value to `auto_lr_find` to try it out 63 | auto_lr_find: False 64 | 65 | # apply clipping to the global gradient norm to avoid exploding 66 | # gradients. Default value of '0' means no clipping is applied 67 | gradient_clip_val: 0 -------------------------------------------------------------------------------- /convert_voxceleb2.sh: -------------------------------------------------------------------------------- 1 | source .env 2 | 3 | PDIR=$PWD # folder where this README is located 4 | D=$DATA_FOLDER # location of data - should be set in .env file 5 | WORKERS=$(nproc --all) # number of CPUs available 6 | 7 | # extract voxceleb 2 data 8 | cd "$D" || exit 9 | mkdir -p convert_tmp/train convert_tmp/test 10 | 11 | unzip voxceleb_archives/vox2_dev_aac.zip -d convert_tmp/train 12 | unzip voxceleb_archives/vox2_test_aac.zip -d convert_tmp/test 13 | 14 | # run the conversion script 15 | cd "$PDIR" || exit 16 | poetry run python preparation_scripts/voxceleb2_convert_to_wav.py "$D"/convert_tmp --num_workers "$WORKERS" 17 | 18 | # rezip the converted data 19 | cd "$D"/convert_tmp/train || exit 20 | zip "$D"/voxceleb_archives/vox2_dev_wav.zip wav -r 21 | 22 | cd "$D"/convert_tmp/test || exit 23 | zip "$D"/voxceleb_archives/vox2_test_wav.zip wav -r 24 | 25 | # delete the unzipped .m4a files 26 | cd "$D" || exit 27 | rm -r convert_tmp -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_ecapa_tdnn 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/lightning_logs/version_0/events.out.tfevents.1631794798.katara.82853.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/lightning_logs/version_0/events.out.tfevents.1631794798.katara.82853.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/ecapa/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/ecapa/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 
14:19:57,772][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 14:19:58,541][src.main][INFO] - Using callback <> 3 | [2021-09-16 14:19:58,542][src.main][INFO] - Using callback <> 4 | [2021-09-16 14:19:58,543][src.main][INFO] - Using callback <> 5 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 6 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 7 | [2021-09-16 14:19:58,548][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 8 | [2021-09-16 14:19:58,651][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 9 | [2021-09-16 14:20:00,832][pytorch_lightning.core.lightning][INFO] - 10 | | Name | Type | Params 11 | ------------------------------------------------------- 12 | 0 | loss_fn | CrossEntropyLoss | 0 13 | 1 | metric_train_acc | Accuracy | 0 14 | 2 | metric_train_loss | AverageMeter | 0 15 | 3 | metric_valid_acc | Accuracy | 0 16 | 4 | feature_extractor | ECAPA_TDNN | 20.8 M 17 | 5 | classifier | Classifier | 232 K 18 | ------------------------------------------------------- 19 | 21.0 M Trainable params 20 | 0 Non-trainable params 21 | 21.0 M Total params 22 | 84.000 Total estimated model params size (MB) 23 | [2021-09-16 14:20:05,425][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 24 | [2021-09-16 14:35:24,725][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-19-55/e77c80b93db94c5d92741f5b1cd3c351/lr_find_temp_model.ckpt 25 | [2021-09-16 14:35:24,810][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-19-55/e77c80b93db94c5d92741f5b1cd3c351/lr_find_temp_model.ckpt 26 | [2021-09-16 14:35:24,816][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.8887917198848208 27 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/grid_search_results.csv: -------------------------------------------------------------------------------- 1 | learning rate,network,eer 2 | 0.00001,wav2vec2-sv-aam,0.02605135925 3 | 0.00005,wav2vec2-sv-aam,0.02063610218 4 | 0.00009,wav2vec2-sv-aam,0.0221702382 5 | 0.0001,wav2vec2-sv-aam,0.02154026181 6 | 0.0002,wav2vec2-sv-aam,0.03840016946 7 | 0.0005,wav2vec2-sv-aam,0.5026879907 8 | 0.001,wav2vec2-sv-aam,0.5 9 | 0.00001,wav2vec2-sv-ce,0.03126163036 10 | 0.00005,wav2vec2-sv-ce,0.02185124159 11 | 0.00009,wav2vec2-sv-ce,0.02090203203 12 | 0.0001,wav2vec2-sv-ce,0.02180619165 13 | 0.0002,wav2vec2-sv-ce,0.04387830943 14 | 0.0005,wav2vec2-sv-ce,0.5 15 | 0.001,wav2vec2-sv-ce,0.5 16 | 0.00003,wav2vec2-sv-bce,0.07767558098 17 | 0.00002,wav2vec2-sv-bce,0.08068290353 18 | 0.00004,wav2vec2-sv-bce,0.08117688447 19 | 0.00001,wav2vec2-sv-bce,0.0848563239 20 | 0.000009,wav2vec2-sv-bce,0.08623531461 21 | 0.000007,wav2vec2-sv-bce,0.08908626437 22 | 0.000005,wav2vec2-sv-bce,0.09471081942 23 | 0.0001,wav2vec2-sv-bce,0.5001855493 24 | 0.00001,wav2vec2-sv-ctc,0.2 25 | 0.00005,wav2vec2-sv-ctc,0.5 26 | 0.0001,wav2vec2-sv-ctc,0.5 27 | 0.0002,wav2vec2-sv-ctc,0.5 28 | 0.0003,wav2vec2-sv-ctc,0.5 29 | 0.0004,wav2vec2-sv-ctc,0.5 30 | 0.0005,wav2vec2-sv-ctc,0.5 31 | 0.001,ecapa,0.1084459126 32 | 0.005,ecapa,0.1220160574 33 | 
0.0009,ecapa,0.113722153 34 | 0.0007,ecapa,0.1097346991 35 | 0.000005,ecapa,0.1525865346 36 | 0.0001,ecapa,0.1180286035 37 | 0.00001,ecapa,0.1362621039 38 | 0.0005,ecapa,0.1127120033 39 | 0.0004,xvector,0.09578768164 40 | 0.0003,xvector,0.09685140103 41 | 0.0008,xvector,0.09695777297 42 | 0.001,xvector,0.09717051685 43 | 0.0002,xvector,0.09988299012 44 | 0.0001,xvector,0.1024509519 45 | 0.00006,xvector,0.1059599146 46 | 0.00001,xvector,0.1311030686 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_auto_lr.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import path 3 | import pathlib 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | # import data 9 | data_path_ce = pathlib.Path( 10 | "wav2vec2-sv-ce/23fb5940c4c94ab39ff4ab74c3852857/lr_find_20210907-215014.json" 11 | ) 12 | data_path_aam = pathlib.Path( 13 | "wav2vec2-sv-aam/06c91df465da4d55bed874caf6fa1da5/lr_find_20210907-221822.json" 14 | ) 15 | data_path_ctc = pathlib.Path() 16 | data_path_bce = pathlib.Path("wav2vec2-sv-bce/65f16f5c0860494187135a30e48097c7/lr_find_20210908-171251.json") 17 | 18 | data_path = data_path_bce 19 | with data_path.open("r") as f: 20 | data = json.load(f) 21 | 22 | x_loss = data["data"]["lr"] 23 | y_loss = data["data"]["loss"] 24 | 25 | # draw graph 26 | fig = plt.figure() 27 | ax1 = fig.add_subplot(1, 1, 1) 28 | 29 | # line plot of learning rate versus loss 30 | (loss_line,) = ax1.plot(x_loss, y_loss, "C1") 31 | 32 | ax1.set_xscale("log") 33 | ax1.set_xlabel("learning rate") 34 | ax1.set_ylabel("loss") 35 | ax1.set_ylim(0.4, 0.9) 36 | 37 | plt.show() 38 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_eer_and_lr_find.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | network_name = "xvector" 8 | zoomed = True 9 | zoom_min = 0 10 | zoom_max = 0.15 11 | 12 | # import data 13 | df = pd.read_csv("grid_search_results.csv", sep=",") 14 | df = df.loc[df['network'] == network_name] 15 | x_eer = df["learning rate"].tolist() 16 | y_eer = df["eer"].tolist() 17 | 18 | data_path = pathlib.Path( 19 | f"{network_name}/data.json" 20 | ) 21 | 22 | with data_path.open("r") as f: 23 | data = json.load(f) 24 | 25 | x_loss = data["data"]["lr"] 26 | y_loss = data["data"]["loss"] 27 | 28 | # draw graph 29 | fig = plt.figure() 30 | ax1 = fig.add_subplot(1, 1, 1) 31 | 32 | # line plot of learning rate versus loss 33 | loss_line, = ax1.plot( 34 | x_loss, 35 | y_loss, 36 | "C1" 37 | ) 38 | 39 | ax1.set_xscale("log") 40 | ax1.set_xlabel("learning rate") 41 | ax1.set_ylabel("loss") 42 | 43 | # scatter plot of EER result at certain LR values 44 | ax2 = plt.twinx() 45 | 46 | eer_scatter = ax2.scatter(x=x_eer, y=y_eer, marker="x") 47 | 48 | ax2.set_ylabel("EER") 49 | if zoomed: 50 | ax2.set_ylim(zoom_min, zoom_max) 51 | else: 52 | ax2.set_ylim(0, 0.6) 53 | 54 | plt.legend([loss_line, eer_scatter], ["loss", "EER"], loc=2) 55 | plt.suptitle(network_name) 56 | 57 | plt.savefig(f'{network_name}/plot_lr_eer{"_zoomed" if zoomed else ""}.png') 58 | plt.show() 59 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/plot_eer_and_lr_find_broken.py: -------------------------------------------------------------------------------- 1 | import 
json 2 | import pathlib 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | # import data 8 | df = pd.read_csv("grid_search_results.csv", sep=";") 9 | x_eer = df["learning rate"].tolist() 10 | y_eer = df["eer"].tolist() 11 | 12 | data_path = pathlib.Path( 13 | "wav2vec2-sv-ce/23fb5940c4c94ab39ff4ab74c3852857/lr_find_20210907-215014.json" 14 | ) 15 | 16 | with data_path.open("r") as f: 17 | data = json.load(f) 18 | 19 | x_loss = data["data"]["lr"] 20 | y_loss = data["data"]["loss"] 21 | 22 | # draw graph 23 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True) 24 | 25 | # scatter plot of EER result at certain LR values 26 | eer_scatter_down = ax1.scatter(x=x_eer, y=y_eer, marker="x") 27 | eer_scatter_up = ax2.scatter(x=x_eer, y=y_eer, marker="x") 28 | 29 | ax1.set_ylabel("EER") 30 | ax2.set_ylabel("EER") 31 | 32 | ax1.set_ylim(0.45, 0.55) 33 | ax2.set_ylim(0, 0.07) 34 | 35 | ax1.set_xscale("log") 36 | ax2.set_xscale("log") 37 | 38 | # line plot of learning rate versus loss 39 | (loss_line,) = ax3.plot(x_loss, y_loss, "C1") 40 | 41 | ax3.set_xscale("log") 42 | ax3.set_xlabel("learning rate") 43 | ax3.set_ylabel("loss") 44 | ax3.set_xlim(1e-6, 5e-2) 45 | 46 | # hide the spines between ax1 and ax2, and ax2 and ax3 47 | ax1.spines.bottom.set_visible(False) 48 | ax1.xaxis.tick_top() 49 | ax1.tick_params(labeltop=False, labelbottom=False) 50 | 51 | ax2.spines.top.set_visible(False) 52 | ax2.spines.bottom.set_visible(False) 53 | ax2.tick_params(bottom=False, top=False, labeltop=False, labelbottom=False) 54 | ax2.xaxis.set_visible(False) 55 | 56 | ax3.spines.top.set_visible(False) 57 | ax3.xaxis.tick_bottom() 58 | ax3.tick_params(labeltop=False) 59 | 60 | # add discontinuity 61 | d = 0.5 # proportion of vertical to horizontal extent of the slanted line 62 | kwargs = dict( 63 | marker=[(-1, -d), (1, d)], 64 | markersize=12, 65 | linestyle="none", 66 | color="k", 67 | mec="k", 68 | mew=1, 69 | clip_on=False, 70 | ) 71 | ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) 72 | ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) 73 | 74 | # create legend 75 | plt.legend([eer_scatter_down, loss_line], ["EER", "loss"], loc=2) 76 | 77 | # show plot 78 | plt.show() 79 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ce 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | - optim/loss=aam_softmax 7 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/lightning_logs/version_0/events.out.tfevents.1631044502.katara.6664.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/lightning_logs/version_0/events.out.tfevents.1631044502.katara.6664.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot.png 
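The grid_search_results.csv above records, for every manually tried learning rate, the network variant and the EER it reached, and the plot_* scripts overlay these points on the loss curve produced by the automatic LR range test. A minimal sketch (not part of the repository) for summarising that grid search, assuming only the comma-separated header shown above (learning rate,network,eer), is to pick the lowest-EER row per network with pandas:

    import pandas as pd

    # load the grid search results and keep the best learning rate per network
    df = pd.read_csv("grid_search_results.csv", sep=",")
    best = df.loc[df.groupby("network")["eer"].idxmin()]
    print(best.sort_values("eer").to_string(index=False))

For the values listed above this would, for example, report 0.00005 as the best grid point for wav2vec2-sv-aam and 0.00009 for wav2vec2-sv-ce.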
-------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-aam/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-aam/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-07 21:55:00,574][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-07 21:55:02,628][src.main][INFO] - Using callback <> 3 | [2021-09-07 21:55:02,628][src.main][INFO] - Using callback <> 4 | [2021-09-07 21:55:02,631][src.main][INFO] - Using callback <> 5 | [2021-09-07 21:55:02,636][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-07 21:55:02,637][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-07 21:55:03,125][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-07 21:55:05,880][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ----------------------------------------------------------------------- 13 | 0 | loss_fn | AngularAdditiveMarginSoftMaxLoss | 9.2 M 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | MeanStdStatPool1D | 0 20 | 7 | fc_list | ModuleList | 0 21 | ----------------------------------------------------------------------- 22 | 103 M Trainable params 23 | 0 Non-trainable params 24 | 103 M Total params 25 | 414.314 Total estimated model params size (MB) 26 | [2021-09-07 21:55:10,051][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 27 | [2021-09-07 22:18:22,208][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-54-58/06c91df465da4d55bed874caf6fa1da5/lr_find_temp_model.ckpt 28 | [2021-09-07 22:18:22,620][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-54-58/06c91df465da4d55bed874caf6fa1da5/lr_find_temp_model.ckpt 29 | [2021-09-07 22:18:22,666][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 8.379150674384097e-05 30 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/.hydra/overrides.yaml: 
-------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_pairs 2 | - tune_model=True 3 | - data/module=voxceleb1_pairs 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | - data.dataloader.train_batch_size=8 7 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/lightning_logs/version_0/events.out.tfevents.1631113238.katara.16035.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/lightning_logs/version_0/events.out.tfevents.1631113238.katara.16035.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-bce/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-bce/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-08 17:00:36,262][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-08 17:00:38,382][src.main][INFO] - Using callback <> 3 | [2021-09-08 17:00:38,382][src.main][INFO] - Using callback <> 4 | [2021-09-08 17:00:38,383][src.main][INFO] - Using callback <> 5 | [2021-09-08 17:00:38,388][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 
6 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-08 17:00:38,389][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-08 17:00:38,830][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-08 17:00:41,475][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------- 13 | 0 | loss_fn | BinaryCrossEntropyLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | linear | Linear | 769 19 | ------------------------------------------------------------- 20 | 94.4 M Trainable params 21 | 0 Non-trainable params 22 | 94.4 M Total params 23 | 377.490 Total estimated model params size (MB) 24 | [2021-09-08 17:01:02,988][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 25 | [2021-09-08 17:12:50,906][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 4155 steps due to diverging loss. 26 | [2021-09-08 17:12:50,930][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-08_17-00-34/65f16f5c0860494187135a30e48097c7/lr_find_temp_model.ckpt 27 | [2021-09-08 17:12:51,232][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-08_17-00-34/65f16f5c0860494187135a30e48097c7/lr_find_temp_model.ckpt 28 | [2021-09-08 17:12:51,293][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.04429961991003636 29 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ce 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/lightning_logs/version_0/events.out.tfevents.1631043151.katara.6259.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/lightning_logs/version_0/events.out.tfevents.1631043151.katara.6259.0 -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ce/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ce/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-07 21:32:29,938][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-07 21:32:31,811][src.main][INFO] - Using callback <> 3 | [2021-09-07 21:32:31,812][src.main][INFO] - Using callback <> 4 | [2021-09-07 21:32:31,813][src.main][INFO] - Using callback <> 5 | [2021-09-07 21:32:31,818][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-07 21:32:31,819][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-07 21:32:32,212][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-07 21:32:34,753][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------ 13 | 0 | loss_fn | CrossEntropyLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | MeanStdStatPool1D | 0 20 | 7 | fc_list | ModuleList | 1.9 M 21 | ------------------------------------------------------------ 22 | 96.2 M Trainable params 23 | 0 Non-trainable params 24 | 96.2 M Total params 25 | 384.932 Total estimated model params size (MB) 26 | [2021-09-07 21:32:38,530][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 27 | [2021-09-07 21:50:13,768][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 3950 steps due to diverging loss. 
28 | [2021-09-07 21:50:13,797][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-32-28/23fb5940c4c94ab39ff4ab74c3852857/lr_find_temp_model.ckpt 29 | [2021-09-07 21:50:14,118][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-07_21-32-28/23fb5940c4c94ab39ff4ab74c3852857/lr_find_temp_model.ckpt 30 | [2021-09-07 21:50:14,155][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.00016811249744769598 31 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 4 | sweep: 5 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 6 | subdir: ${experiment_name} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | help: 13 | app_name: ${hydra.job.name} 14 | header: '${hydra.help.app_name} is powered by Hydra. 15 | 16 | ' 17 | footer: 'Powered by Hydra (https://hydra.cc) 18 | 19 | Use --hydra-help to view Hydra specific help 20 | 21 | ' 22 | template: '${hydra.help.header} 23 | 24 | == Configuration groups == 25 | 26 | Compose your configuration from those groups (group=option) 27 | 28 | 29 | $APP_CONFIG_GROUPS 30 | 31 | 32 | == Config == 33 | 34 | Override anything in the config (foo.bar=value) 35 | 36 | 37 | $CONFIG 38 | 39 | 40 | ${hydra.help.footer} 41 | 42 | ' 43 | hydra_help: 44 | template: 'Hydra (${hydra.runtime.version}) 45 | 46 | See https://hydra.cc for more info. 47 | 48 | 49 | == Flags == 50 | 51 | $FLAGS_HELP 52 | 53 | 54 | == Configuration groups == 55 | 56 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 57 | to command line) 58 | 59 | 60 | $HYDRA_CONFIG_GROUPS 61 | 62 | 63 | Use ''--cfg hydra'' to Show the Hydra config. 64 | 65 | ' 66 | hydra_help: ??? 
67 | hydra_logging: 68 | version: 1 69 | formatters: 70 | simple: 71 | format: '[%(asctime)s][HYDRA] %(message)s' 72 | handlers: 73 | console: 74 | class: logging.StreamHandler 75 | formatter: simple 76 | stream: ext://sys.stdout 77 | root: 78 | level: INFO 79 | handlers: 80 | - console 81 | loggers: 82 | logging_example: 83 | level: DEBUG 84 | disable_existing_loggers: false 85 | job_logging: 86 | version: 1 87 | formatters: 88 | simple: 89 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 90 | handlers: 91 | console: 92 | class: logging.StreamHandler 93 | formatter: simple 94 | stream: ext://sys.stdout 95 | file: 96 | class: logging.FileHandler 97 | formatter: simple 98 | filename: ${hydra.job.name}.log 99 | root: 100 | level: INFO 101 | handlers: 102 | - console 103 | - file 104 | disable_existing_loggers: false 105 | env: {} 106 | searchpath: [] 107 | callbacks: {} 108 | output_subdir: .hydra 109 | overrides: 110 | hydra: [] 111 | task: 112 | - +experiment=speaker_wav2vec2_ctc 113 | - tune_model=True 114 | - data/module=voxceleb1 115 | - trainer.auto_lr_find=auto_lr_find 116 | - tune_iterations=5000 117 | job: 118 | name: run 119 | override_dirname: +experiment=speaker_wav2vec2_ctc,data/module=voxceleb1,trainer.auto_lr_find=auto_lr_find,tune_iterations=5000,tune_model=True 120 | id: ??? 121 | num: ??? 122 | config_name: train_eval 123 | env_set: {} 124 | env_copy: [] 125 | config: 126 | override_dirname: 127 | kv_sep: '=' 128 | item_sep: ',' 129 | exclude_keys: [] 130 | runtime: 131 | version: 1.1.1 132 | cwd: /home/nik/workspace/phd/repos/wav2vec_speaker_identification 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/nik/workspace/phd/repos/wav2vec_speaker_identification/config 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | choices: 144 | experiment: speaker_wav2vec2_ctc 145 | profiler: null 146 | trainer: trainer 147 | optim/loss: ctc 148 | optim/schedule: one_cycle 149 | optim/algo: adam 150 | tokenizer: default 151 | network: wav2vec2_fc 152 | evaluator: cosine_distance 153 | data/dataloader: speaker 154 | data/shards: shards_voxceleb 155 | data/pipeline: wav2vec_base_pipeline 156 | data/module: voxceleb1 157 | callbacks: speaker_default 158 | hydra/env: default 159 | hydra/callbacks: null 160 | hydra/job_logging: default 161 | hydra/hydra_logging: default 162 | hydra/hydra_help: default 163 | hydra/help: default 164 | hydra/sweeper: basic 165 | hydra/launcher: basic 166 | hydra/output: default 167 | verbose: false 168 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_wav2vec2_ctc 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/lightning_logs/version_0/events.out.tfevents.1631793388.katara.71473.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/lightning_logs/version_0/events.out.tfevents.1631793388.katara.71473.0 
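Each run directory shown here keeps the Hydra overrides that produced it in .hydra/overrides.yaml, while .hydra/hydra.yaml records the resolved launcher, logging and config-group choices. A small sketch (not part of the repository; it assumes PyYAML is installed and that it is started from the auto_lr_find results directory) to reconstruct the original tuning commands from those files:

    import pathlib

    import yaml

    # each subdirectory holds one tuning run with its recorded Hydra overrides
    for override_file in sorted(pathlib.Path(".").glob("*/.hydra/overrides.yaml")):
        run_name = override_file.parent.parent.name
        overrides = yaml.safe_load(override_file.read_text())
        print(run_name, "->", "python run.py " + " ".join(overrides))

For the wav2vec2-sv-ctc entry above this prints the command python run.py +experiment=speaker_wav2vec2_ctc tune_model=True data/module=voxceleb1 trainer.auto_lr_find=auto_lr_find tune_iterations=5000.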
-------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/wav2vec2-sv-ctc/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/wav2vec2-sv-ctc/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 13:56:26,445][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 13:56:28,219][src.main][INFO] - Using callback <> 3 | [2021-09-16 13:56:28,219][src.main][INFO] - Using callback <> 4 | [2021-09-16 13:56:28,220][src.main][INFO] - Using callback <> 5 | [2021-09-16 13:56:28,225][pytorch_lightning.trainer.connectors.accelerator_connector][INFO] - Using native 16bit precision. 6 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 7 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 8 | [2021-09-16 13:56:28,226][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 9 | [2021-09-16 13:56:28,615][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 10 | [2021-09-16 13:56:30,795][pytorch_lightning.core.lightning][INFO] - 11 | | Name | Type | Params 12 | ------------------------------------------------------------ 13 | 0 | loss_fn | CtcLoss | 0 14 | 1 | metric_train_acc | Accuracy | 0 15 | 2 | metric_train_loss | AverageMeter | 0 16 | 3 | metric_valid_acc | Accuracy | 0 17 | 4 | wav2vec | Wav2Vec2WrapperModule | 94.4 M 18 | 5 | embedding_masker | EmbeddingMasker | 0 19 | 6 | stat_pooling | NoPooling | 0 20 | 7 | test_stat_pooling | MeanStdStatPool1D | 0 21 | 8 | fc_list | ModuleList | 932 K 22 | ------------------------------------------------------------ 23 | 95.3 M Trainable params 24 | 0 Non-trainable params 25 | 95.3 M Total params 26 | 381.215 Total estimated model params size (MB) 27 | [2021-09-16 13:56:34,302][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 28 | [2021-09-16 14:12:17,607][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 3379 steps due to diverging loss. 
29 | [2021-09-16 14:12:17,627][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_13-56-24/05cf7a78d19f46128b4c2d4fadf3eaec/lr_find_temp_model.ckpt 30 | [2021-09-16 14:12:17,947][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_13-56-24/05cf7a78d19f46128b4c2d4fadf3eaec/lr_find_temp_model.ckpt 31 | [2021-09-16 14:12:17,980][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.00029977816715823815 32 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/.hydra/hydra.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ${log_folder}/wav2vec_speaker_identification/run/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} 4 | sweep: 5 | dir: ${log_folder}/wav2vec_speaker_identification/sweep/${now:%Y-%m-%d_%H-%M-%S} 6 | subdir: ${experiment_name} 7 | launcher: 8 | _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher 9 | sweeper: 10 | _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper 11 | max_batch_size: null 12 | help: 13 | app_name: ${hydra.job.name} 14 | header: '${hydra.help.app_name} is powered by Hydra. 15 | 16 | ' 17 | footer: 'Powered by Hydra (https://hydra.cc) 18 | 19 | Use --hydra-help to view Hydra specific help 20 | 21 | ' 22 | template: '${hydra.help.header} 23 | 24 | == Configuration groups == 25 | 26 | Compose your configuration from those groups (group=option) 27 | 28 | 29 | $APP_CONFIG_GROUPS 30 | 31 | 32 | == Config == 33 | 34 | Override anything in the config (foo.bar=value) 35 | 36 | 37 | $CONFIG 38 | 39 | 40 | ${hydra.help.footer} 41 | 42 | ' 43 | hydra_help: 44 | template: 'Hydra (${hydra.runtime.version}) 45 | 46 | See https://hydra.cc for more info. 47 | 48 | 49 | == Flags == 50 | 51 | $FLAGS_HELP 52 | 53 | 54 | == Configuration groups == 55 | 56 | Compose your configuration from those groups (For example, append hydra/job_logging=disabled 57 | to command line) 58 | 59 | 60 | $HYDRA_CONFIG_GROUPS 61 | 62 | 63 | Use ''--cfg hydra'' to Show the Hydra config. 64 | 65 | ' 66 | hydra_help: ??? 
67 | hydra_logging: 68 | version: 1 69 | formatters: 70 | simple: 71 | format: '[%(asctime)s][HYDRA] %(message)s' 72 | handlers: 73 | console: 74 | class: logging.StreamHandler 75 | formatter: simple 76 | stream: ext://sys.stdout 77 | root: 78 | level: INFO 79 | handlers: 80 | - console 81 | loggers: 82 | logging_example: 83 | level: DEBUG 84 | disable_existing_loggers: false 85 | job_logging: 86 | version: 1 87 | formatters: 88 | simple: 89 | format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' 90 | handlers: 91 | console: 92 | class: logging.StreamHandler 93 | formatter: simple 94 | stream: ext://sys.stdout 95 | file: 96 | class: logging.FileHandler 97 | formatter: simple 98 | filename: ${hydra.job.name}.log 99 | root: 100 | level: INFO 101 | handlers: 102 | - console 103 | - file 104 | disable_existing_loggers: false 105 | env: {} 106 | searchpath: [] 107 | callbacks: {} 108 | output_subdir: .hydra 109 | overrides: 110 | hydra: [] 111 | task: 112 | - +experiment=speaker_xvector 113 | - tune_model=True 114 | - data/module=voxceleb1 115 | - trainer.auto_lr_find=auto_lr_find 116 | - tune_iterations=5000 117 | job: 118 | name: run 119 | override_dirname: +experiment=speaker_xvector,data/module=voxceleb1,trainer.auto_lr_find=auto_lr_find,tune_iterations=5000,tune_model=True 120 | id: ??? 121 | num: ??? 122 | config_name: train_eval 123 | env_set: {} 124 | env_copy: [] 125 | config: 126 | override_dirname: 127 | kv_sep: '=' 128 | item_sep: ',' 129 | exclude_keys: [] 130 | runtime: 131 | version: 1.1.1 132 | cwd: /home/nik/workspace/phd/repos/wav2vec_speaker_identification 133 | config_sources: 134 | - path: hydra.conf 135 | schema: pkg 136 | provider: hydra 137 | - path: /home/nik/workspace/phd/repos/wav2vec_speaker_identification/config 138 | schema: file 139 | provider: main 140 | - path: '' 141 | schema: structured 142 | provider: schema 143 | choices: 144 | experiment: speaker_xvector 145 | profiler: null 146 | trainer: trainer 147 | optim/loss: cross_entropy 148 | optim/schedule: one_cycle 149 | optim/algo: adam 150 | tokenizer: default 151 | network: xvector 152 | evaluator: cosine_distance_with_train_data 153 | data/dataloader: speaker 154 | data/shards: shards_voxceleb 155 | data/pipeline: xvector_pipeline 156 | data/module: voxceleb1 157 | callbacks: speaker_default 158 | hydra/env: default 159 | hydra/callbacks: null 160 | hydra/job_logging: default 161 | hydra/hydra_logging: default 162 | hydra/hydra_help: default 163 | hydra/help: default 164 | hydra/sweeper: basic 165 | hydra/launcher: basic 166 | hydra/output: default 167 | verbose: false 168 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - +experiment=speaker_xvector 2 | - tune_model=True 3 | - data/module=voxceleb1 4 | - trainer.auto_lr_find=auto_lr_find 5 | - tune_iterations=5000 6 | -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/lightning_logs/version_0/events.out.tfevents.1631794594.katara.80664.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/lightning_logs/version_0/events.out.tfevents.1631794594.katara.80664.0 -------------------------------------------------------------------------------- 
/paper_results/auto_lr_find/xvector/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/plot_lr_eer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot_lr_eer.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/plot_lr_eer_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/paper_results/auto_lr_find/xvector/plot_lr_eer_zoomed.png -------------------------------------------------------------------------------- /paper_results/auto_lr_find/xvector/run.log: -------------------------------------------------------------------------------- 1 | [2021-09-16 14:16:33,601][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 2 | [2021-09-16 14:16:34,332][src.main][INFO] - Using callback <> 3 | [2021-09-16 14:16:34,332][src.main][INFO] - Using callback <> 4 | [2021-09-16 14:16:34,333][src.main][INFO] - Using callback <> 5 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - GPU available: True, used: True 6 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - TPU available: False, using: 0 TPU cores 7 | [2021-09-16 14:16:34,338][pytorch_lightning.utilities.distributed][INFO] - IPU available: False, using: 0 IPUs 8 | [2021-09-16 14:16:34,366][pytorch_lightning.accelerators.gpu][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] 9 | [2021-09-16 14:16:36,535][pytorch_lightning.core.lightning][INFO] - 10 | | Name | Type | Params 11 | ------------------------------------------------------- 12 | 0 | loss_fn | CrossEntropyLoss | 0 13 | 1 | metric_train_acc | Accuracy | 0 14 | 2 | metric_train_loss | AverageMeter | 0 15 | 3 | metric_valid_acc | Accuracy | 0 16 | 4 | feature_extractor | Xvector | 4.3 M 17 | 5 | classifier | Classifier | 885 K 18 | ------------------------------------------------------- 19 | 5.1 M Trainable params 20 | 0 Non-trainable params 21 | 5.1 M Total params 22 | 20.554 Total estimated model params size (MB) 23 | [2021-09-16 14:16:40,831][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42133724 24 | [2021-09-16 14:18:51,476][pytorch_lightning.tuner.lr_finder][INFO] - LR finder stopped early after 4418 steps due to diverging loss. 
25 | [2021-09-16 14:18:51,479][pytorch_lightning.utilities.distributed][INFO] - Restoring states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-16-31/f47df09ca90443d3b2dd5ffffcc8d60c/lr_find_temp_model.ckpt 26 | [2021-09-16 14:18:51,499][pytorch_lightning.utilities.distributed][INFO] - Restored all states from the checkpoint file at /home/nik/workspace/phd/data/logs/wav2vec_speaker_identification/run/2021-09-16_14-16-31/f47df09ca90443d3b2dd5ffffcc8d60c/lr_find_temp_model.ckpt 27 | [2021-09-16 14:18:51,502][pytorch_lightning.tuner.lr_finder][INFO] - Learning rate set to 0.0021281390459827135 28 | -------------------------------------------------------------------------------- /paper_results/run_tests_pool.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from collections import defaultdict 4 | 5 | root_folder = "/home/nvaessen/data/transfer/paper_n4/aam/ablation/" 6 | experiment_name = "speaker_wav2vec2_aam" 7 | tag_prefix = "ablation" 8 | test_run = False 9 | test_sets_to_use = [ 10 | # "voxceleb2", 11 | "voxceleb2_test_everyone", 12 | # "voxceleb2_test_hard" 13 | ] 14 | 15 | postfix_map = { 16 | "voxceleb2": "o", 17 | "voxceleb2_test_hard": "h", 18 | "voxceleb2_test_everyone": "e", 19 | } 20 | 21 | 22 | path_dict = defaultdict(set) 23 | for ckpt in sorted(pathlib.Path(root_folder).glob("*.ckpt")): 24 | first_underscore = ckpt.stem.find("_") 25 | first_dot = ckpt.stem.find(".") 26 | 27 | ablation_name = ckpt.stem[first_underscore+1:first_dot] 28 | 29 | path_dict[ablation_name].add(ckpt.absolute()) 30 | 31 | assert len(path_dict) == 10 32 | for key, v in path_dict.items(): 33 | assert len(v) == 3 34 | 35 | for test_set in test_sets_to_use: 36 | for ablation_name, ckpt_set in path_dict.items(): 37 | for ckpt in ckpt_set: 38 | command_template = ( 39 | f"python run.py -m +experiment={experiment_name} " 40 | f"data/module={test_set} " 41 | f"fit_model=False " 42 | f"network.stat_pooling_type=first+cls " 43 | f"tag={ablation_name}_eval_{postfix_map[test_set]} " 44 | f"load_network_from_checkpoint={ckpt} " 45 | f"network.explicit_num_speakers=5994 " 46 | f"hydra/launcher=slurm " 47 | ) 48 | 49 | print(f"{command_template} & ") 50 | 51 | if test_run: 52 | exit() 53 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This run script encapsulates making predictions with a particular network 4 | # on data without labels. 
5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import hydra 10 | 11 | from dotenv import load_dotenv 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | from src.hydra_resolvers import ( 15 | division_resolver, 16 | integer_division_resolver, 17 | random_uuid, 18 | ) 19 | 20 | ################################################################################ 21 | # set custom resolvers 22 | 23 | OmegaConf.register_new_resolver("divide", division_resolver) 24 | OmegaConf.register_new_resolver("idivide", integer_division_resolver) 25 | OmegaConf.register_new_resolver("random_uuid", random_uuid) 26 | 27 | ################################################################################ 28 | # wrap around main hydra script 29 | 30 | 31 | @hydra.main(config_path="config", config_name="predict") 32 | def run(cfg: DictConfig): 33 | # we import here such that tab-completion in bash 34 | # does not need to import everything (which slows it down 35 | # significantly) 36 | from src.main import run_predictions 37 | 38 | return run_predictions(cfg) 39 | 40 | 41 | ################################################################################ 42 | # execute hydra application 43 | 44 | if __name__ == "__main__": 45 | load_dotenv() 46 | run() 47 | 48 | -------------------------------------------------------------------------------- /preparation_scripts/download_and_prepare_rirs.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER" 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.openslr.org/resources/28/rirs_noises.zip --output "$DIR"/rirs_noises.zip 12 | 13 | # extract file and remove zip 14 | cd "$DIR" 15 | unzip rirs_noises.zip -d "$DIR" 16 | rm rirs_noises.zip 17 | 18 | # create tar for webdataset compatability 19 | mkdir -p "$DIR"/rirs_shards/ 20 | tar --sort=name -cf rirs_shards/pointsource_noises.tar RIRS_NOISES/pointsource_noises 21 | tar --sort=name -cf rirs_shards/real_rirs_isotropic_noises.tar RIRS_NOISES/real_rirs_isotropic_noises 22 | tar --sort=name -cf rirs_shards/simulated_rirs.tar RIRS_NOISES/simulated_rirs 23 | 24 | # remove extracted dir 25 | rm -r "$DIR"/RIRS_NOISES -------------------------------------------------------------------------------- /preparation_scripts/download_librispeech.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER"/librispeech 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.openslr.org/resources/12/dev-clean.tar.gz --output "$DIR"/dev-clean.tar.gz 12 | curl -C - https://www.openslr.org/resources/12/dev-other.tar.gz --output "$DIR"/dev-other.tar.gz 13 | curl -C - https://www.openslr.org/resources/12/test-clean.tar.gz --output "$DIR"/test-clean.tar.gz 14 | curl -C - https://www.openslr.org/resources/12/test-other.tar.gz --output "$DIR"/test-other.tar.gz 15 | curl -C - https://www.openslr.org/resources/12/train-clean-100.tar.gz --output "$DIR"/train-clean-100.tar.gz 16 | curl -C - https://www.openslr.org/resources/12/train-clean-360.tar.gz --output "$DIR"/train-clean-360.tar.gz 17 | curl -C - 
https://www.openslr.org/resources/12/train-other-500.tar.gz --output "$DIR"/train-other-500.tar.gz 18 | -------------------------------------------------------------------------------- /preparation_scripts/download_pretrained_models.sh: -------------------------------------------------------------------------------- 1 | ### set environment variables 2 | source ../.env 2> /dev/null || source .env 3 | 4 | ### create folder to store models in 5 | PRETRAINED="$DATA_FOLDER"/pretrained_models/wav2vec 6 | mkdir -p "$PRETRAINED" 7 | 8 | ### download pretrained models 9 | 10 | # wav2vec1 large 11 | echo "wav2vec1 - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt" 12 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt --output "$PRETRAINED"/wav2vec_large.pt 13 | 14 | # wav2vec2 small - no ft 15 | echo "# wav2vec2 small - no ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt" 16 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt --output "$PRETRAINED"/wav2vec2_small_noft.pt 17 | 18 | # wav2vec2 small - 10 minutes 19 | echo "# wav2vec2 small - 10m ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_10m.pt" 20 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_10m.pt --output "$PRETRAINED"/wav2vec2_small_ft10m.pt 21 | 22 | # wav2vec2 small - 100 hours 23 | echo "# wav2vec2 small - 100h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_100h.pt" 24 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_100h.pt --output "$PRETRAINED"/wav2vec2_small_ft100h.pt 25 | 26 | # wav2vec2 small - 960h ft 27 | echo "wav2vec2 small - 960h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt" 28 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_960h.pt --output "$PRETRAINED"/wav2vec2_small_ft960h.pt 29 | 30 | # wav2vec2 large - no ft 31 | echo "wav2vec2 large - no ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt" 32 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt --output "$PRETRAINED"/wav2vec2_large_noft.pt 33 | 34 | # wav2vec2 large - 10 minutes 35 | echo "# wav2vec2 base - 10m ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt" 36 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt --output "$PRETRAINED"/wav2vec2_large_10m.pt 37 | 38 | # wav2vec2 large - 100 hours 39 | echo "# wav2vec2 large - 100h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt" 40 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt --output "$PRETRAINED"/wav2vec2_large_100h.pt 41 | 42 | # wav2vec2 large - 960 ft 43 | echo "wav2vec2 large - 960h ft - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt" 44 | curl -C - https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt --output "$PRETRAINED"/wav2vec2_large_960h.pt -------------------------------------------------------------------------------- /preparation_scripts/download_voxceleb_meta.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | ### set environment variables 4 | source ../.env 2> /dev/null || source .env 5 | 6 | # default directory to save files in 7 | DIR="$DATA_FOLDER"/voxceleb_meta 8 | mkdir -p "$DIR" 9 | 10 | ## download files 11 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt --output "$DIR"/iden_split.txt 12 | curl -C - 
https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt --output "$DIR"/veri_test.txt 13 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt --output "$DIR"/veri_test2.txt 14 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard.txt --output "$DIR"/list_test_hard.txt 15 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt --output "$DIR"/list_test_hard2.txt 16 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all.txt --output "$DIR"/list_test_all.txt 17 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt --output "$DIR"/list_test_all2.txt 18 | curl -C - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox1_meta.csv --output "$DIR"/vox1_meta.csv 19 | -------------------------------------------------------------------------------- /preparation_scripts/hydra_bash_complete.sh: -------------------------------------------------------------------------------- 1 | # you need to source this file instead of executing it 2 | eval "$(python run.py -sc install=bash)" 3 | -------------------------------------------------------------------------------- /preparation_scripts/set_cuda_dependencies.sh: -------------------------------------------------------------------------------- 1 | pip install -r requirements/requirements_py1.9_cuda111.txt -------------------------------------------------------------------------------- /preparation_scripts/validate_scores.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This file creates a CLI for validating a score text file given a 4 | # text file with pairs (without gt labels). If validation is successfull a 5 | # zipfile will be created which can be submitted to voxceleb challenge on 6 | # codalab. 7 | # 8 | # pair text file format: 9 | # 'FILEa FILEb\n' 10 | # ... 11 | # 'FILEc FILEd\n' 12 | # 13 | # score text file format: 14 | # 'SCORE_FLOAT FILEa FILEb\n' 15 | # ... 16 | # 'SCORE_FLOAT FILEc FILEd\n' 17 | # 18 | # where SCORE_FLOAT is a string representing a float between 0 and 1. 
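#
# As an illustration (hypothetical utterance ids, not taken from an actual
# VoxCeleb trial list), a pair file could look like
#
#   id00012/abc/00001.wav id00031/xyz/00004.wav
#   id00012/abc/00001.wav id00012/def/00002.wav
#
# and a matching score file that passes validation would then be
#
#   0.13 id00012/abc/00001.wav id00031/xyz/00004.wav
#   0.92 id00012/abc/00001.wav id00012/def/00002.wav
#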
19 | # 20 | # Author(s): Nik Vaessen 21 | ################################################################################ 22 | 23 | import pathlib 24 | import argparse 25 | import tqdm 26 | import zipfile 27 | 28 | from typing import List, Tuple 29 | 30 | ################################################################################ 31 | # validation function 32 | 33 | def _load_pair_file(file: pathlib.Path) -> List[Tuple[str, str]]: 34 | with file.open('r') as f: 35 | lines = f.readlines() 36 | 37 | loaded_list = [] 38 | 39 | for l in lines: 40 | l = l.strip() 41 | 42 | assert l.count(" ") == 1 43 | 44 | split_line = l.split(" ") 45 | assert len(split_line) == 2 46 | 47 | key1, key2 = split_line 48 | loaded_list.append((key1, key2)) 49 | 50 | return loaded_list 51 | 52 | def _load_score_file(file: pathlib.Path) -> List[Tuple[float, str, str]]: 53 | with file.open('r') as f: 54 | lines = f.readlines() 55 | 56 | loaded_list = [] 57 | 58 | for l in lines: 59 | l = l.strip() 60 | 61 | assert l.count(" ") == 2 62 | 63 | split_line = l.split(" ") 64 | assert len(split_line) == 3 65 | 66 | score, key1, key2 = split_line 67 | 68 | try: 69 | score = float(score) 70 | except: 71 | raise ValueError(f"could not convert {score} to float") 72 | 73 | assert isinstance(score, float) 74 | loaded_list.append((score, key1, key2)) 75 | 76 | return loaded_list 77 | 78 | def validate(pair_file: pathlib.Path, score_file: pathlib.Path): 79 | # load data in file 80 | pairs = _load_pair_file(pair_file) 81 | scores = _load_score_file(score_file) 82 | 83 | # ensure each float is between 0 and 1 84 | print("validate each score is valid") 85 | for score_tuple in tqdm.tqdm(scores): 86 | score = score_tuple[0] 87 | 88 | assert score <= 1 89 | assert score >= 0 90 | 91 | # ensure each pair is present 92 | print("validate each pair is present") 93 | for score_tuple in tqdm.tqdm(scores): 94 | pair_tuple = (score_tuple[1], score_tuple[2]) 95 | 96 | assert pair_tuple in pairs 97 | 98 | 99 | 100 | ################################################################################ 101 | # creation of submission file. 
102 | 103 | SCORE_FILE_NAME = 'scores.txt' 104 | ZIPFILE_NAME = 'submission.zip' 105 | 106 | def create_submission(score_file: pathlib.Path): 107 | zipfile_path = score_file.parent / ZIPFILE_NAME 108 | 109 | with zipfile.ZipFile(str(zipfile_path), mode='w') as f: 110 | f.write(str(score_file), SCORE_FILE_NAME) 111 | 112 | ################################################################################ 113 | # entrypoint of CLI 114 | 115 | def main(): 116 | # set CLI arguments 117 | parser = argparse.ArgumentParser() 118 | 119 | parser.add_argument("--score_file", required=True) 120 | parser.add_argument("--pair_file", required=True) 121 | 122 | # load arguments 123 | args = parser.parse_args() 124 | 125 | score_file = pathlib.Path(args.score_file) 126 | pair_file = pathlib.Path(args.pair_file) 127 | 128 | # validate score file 129 | # validate(pair_file, score_file) 130 | 131 | # create submission zipfile 132 | create_submission(score_file) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "wav2vec-speaker-identification" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Nik Vaessen "] 6 | packages = [ 7 | { include = "src" }, 8 | { include = "tests" } 9 | ] 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.8" 13 | tqdm = "^4.27" 14 | scipy = "^1.6.0" 15 | scikit-learn = "^0.24.0" 16 | click = "^7.1.2" 17 | pandas = "^1.2.1" 18 | pytorch-lightning = "1.4.5" 19 | pytorch-model-summary = "^0.1.2" 20 | jupyter = "^1.0.0" 21 | librosa = "^0.8.0" 22 | matplotlib = "^3.3.4" 23 | seaborn = "^0.11.1" 24 | hydra-core = "^1.1.0" 25 | psutil = "^5.8.0" 26 | webdataset = "0.1.58" 27 | yaspin = "2.0.0" 28 | "hurry.filesize" = "^0.9" 29 | python-dotenv = "^0.17.0" 30 | torchaudio = "0.9.0" 31 | speechbrain = "^0.5.5" 32 | comet-ml = "^3.9.0" 33 | lightning-bolts = "^0.3.3" 34 | hydra-submitit-launcher = "^1.1.1" 35 | wavaugment = "^0.2" 36 | jupyterlab = "^3.0.14" 37 | fairseq = "0.10.2" 38 | jiwer = "^2.2.0" 39 | datasets = "^1.8.0" 40 | transformers = "^4.8.2" 41 | hydra-optuna-sweeper = "^1.1.0" 42 | 43 | [tool.poetry.dev-dependencies] 44 | black = "^21.6b0" 45 | pytest = "^6.2.2" 46 | 47 | [build-system] 48 | requires = ["poetry-core>=1.0.0"] 49 | build-backend = "poetry.core.masonry.api" 50 | -------------------------------------------------------------------------------- /requirements/requirements_cuda101.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.8.1+cu101 3 | torchvision==0.9.1+cu101 4 | torchaudio==0.8.1 -------------------------------------------------------------------------------- /requirements/requirements_cuda111.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.8.1+cu111 3 | torchvision==0.9.1+cu111 4 | torchaudio==0.8.1 -------------------------------------------------------------------------------- /requirements/requirements_py1.9_cuda111.txt: -------------------------------------------------------------------------------- 1 | -f https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.9.0+cu111 3 | torchvision==0.10.0+cu111 4 | torchaudio==0.9.0 -------------------------------------------------------------------------------- /run.py: 
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This run script encapsulates the training and evaluation of a speaker 4 | # recognition model defined by the hydra configuration. 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import hydra 10 | 11 | from dotenv import load_dotenv 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | from src.hydra_resolvers import ( 15 | division_resolver, 16 | integer_division_resolver, 17 | random_uuid, 18 | ) 19 | 20 | ################################################################################ 21 | # set custom resolvers 22 | 23 | OmegaConf.register_new_resolver("divide", division_resolver) 24 | OmegaConf.register_new_resolver("idivide", integer_division_resolver) 25 | OmegaConf.register_new_resolver("random_uuid", random_uuid) 26 | 27 | ################################################################################ 28 | # wrap around main hydra script 29 | 30 | 31 | @hydra.main(config_path="config", config_name="train_eval") 32 | def run(cfg: DictConfig): 33 | # we import here such that tab-completion in bash 34 | # does not need to import everything (which slows it down 35 | # significantly) 36 | from src.main import run_train_eval_script 37 | 38 | return run_train_eval_script(cfg) 39 | 40 | 41 | ################################################################################ 42 | # execute hydra application 43 | 44 | if __name__ == "__main__": 45 | load_dotenv() 46 | import os 47 | 48 | run() 49 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/__init__.py -------------------------------------------------------------------------------- /src/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/callbacks/__init__.py -------------------------------------------------------------------------------- /src/callbacks/memory_monitor.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # This callback will monitor the RAM usage of each worker. 
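#
# A minimal usage sketch (the frequency value below is made up for
# illustration and is not taken from any config in this repository):
#
#   import pytorch_lightning as pl
#   trainer = pl.Trainer(callbacks=[RamMemoryMonitor(frequency=100)])
#
# which logs a "mem_total" metric (in megabytes, summed over the main process
# and its worker children) every `frequency` training batches.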
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | from typing import Any 8 | 9 | import psutil 10 | import os 11 | 12 | import pytorch_lightning as pl 13 | 14 | from pytorch_lightning.utilities import rank_zero_only 15 | from pytorch_lightning.utilities.types import STEP_OUTPUT 16 | 17 | 18 | ################################################################################ 19 | # callback implementation 20 | 21 | 22 | class RamMemoryMonitor(pl.Callback): 23 | def __init__(self, frequency: int): 24 | self.frequency = frequency 25 | 26 | self.batches = 0 27 | 28 | def on_train_batch_end( 29 | self, 30 | trainer: "pl.Trainer", 31 | pl_module: "pl.LightningModule", 32 | outputs: STEP_OUTPUT, 33 | batch: Any, 34 | batch_idx: int, 35 | dataloader_idx: int, 36 | ) -> None: 37 | self.batches += 1 38 | 39 | if self.batches >= self.frequency: 40 | self.batches = 0 41 | 42 | try: 43 | self._monitor(trainer) 44 | except psutil.NoSuchProcess as e: 45 | pass 46 | 47 | @staticmethod 48 | def _monitor(trainer: pl.Trainer): 49 | current_process = psutil.Process(os.getpid()) 50 | children = current_process.children(recursive=True) 51 | 52 | # track main process 53 | current_process_usage = _get_mem_usage_in_mb(current_process) 54 | 55 | # track child processes 56 | children_usage = [_get_mem_usage_in_mb(c) for c in children] 57 | 58 | # total usage 59 | total_usage = current_process_usage + sum(children_usage) 60 | 61 | # track usage 62 | if trainer is not None: 63 | trainer.logger.log_metrics( 64 | { 65 | "mem_total": total_usage, 66 | } 67 | ) 68 | 69 | 70 | def _get_mem_usage_in_mb(p: psutil.Process): 71 | full_info = p.memory_full_info() 72 | 73 | # usage of process in bytes 74 | usage = full_info.uss 75 | 76 | # convert to megabytes 77 | usage = round(usage / float(1 << 20)) 78 | 79 | return usage 80 | -------------------------------------------------------------------------------- /src/config_util.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Provide a dataclass which automatically tries to cast any type-hinted field 4 | # to the type hint. It also as provides an abstract method 5 | # for constructing the object it configures. 
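#
# For illustration (hypothetical config class, not defined in this repository),
# string values coming from a config file are cast to the annotated types when
# the dataclass is constructed:
#
#   @dataclasses.dataclass()
#   class DummyConfig(CastingConfig):
#       num_layers: int
#       dropout: float
#
#   cfg = DummyConfig(num_layers="3", dropout="0.1")
#   assert cfg.num_layers == 3 and cfg.dropout == 0.1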
6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import dataclasses 11 | 12 | from abc import abstractmethod 13 | from enum import Enum 14 | from typing import TypeVar, Generic 15 | 16 | ################################################################################ 17 | # base configuration which supports casting to type hint and provides abstract 18 | # interface for creating an object based on the configuration 19 | 20 | C = TypeVar("C") 21 | 22 | 23 | @dataclasses.dataclass() 24 | class CastingConfig(Generic[C]): 25 | def __post_init__(self): 26 | post_init_type_cast(self) 27 | 28 | 29 | def post_init_type_cast(dataclass): 30 | if not dataclasses.is_dataclass(dataclass): 31 | raise Exception("Can only type-cast dataclass classes.") 32 | 33 | for field in dataclasses.fields(dataclass): 34 | value = getattr(dataclass, field.name) 35 | typehint_cls = field.type 36 | 37 | if value is None: 38 | # no value specified to type-convert 39 | continue 40 | 41 | elif isinstance(value, typehint_cls): 42 | # no need for type-conversion 43 | continue 44 | 45 | elif isinstance(value, dict): 46 | """ 47 | if execution gets here, we know 48 | value is not an instance of typehinted-type but 49 | is a dictionary. It contains the contents 50 | of a nested dataclass 51 | """ 52 | obj = typehint_cls(**value) 53 | 54 | # recursively perform type casting 55 | post_init_type_cast(obj) 56 | 57 | elif issubclass(typehint_cls, Enum): 58 | # enum's have a different init procedure 59 | obj = typehint_cls[value] 60 | 61 | else: 62 | # simply type-cast the object 63 | obj = typehint_cls(value) 64 | 65 | setattr(dataclass, field.name, obj) 66 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/common.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # A collection of common data classes 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | from abc import abstractmethod 11 | from dataclasses import dataclass 12 | from typing import List, Tuple, Optional, Dict, Any 13 | 14 | import torch as t 15 | 16 | ################################################################################ 17 | # 18 | 19 | 20 | @dataclass 21 | class WebDataSetShardConfig: 22 | samples_per_shard: int 23 | use_gzip_compression: bool 24 | shuffle_shards: bool 25 | queue_size: int 26 | 27 | 28 | @dataclass 29 | class SpeakerDataLoaderConfig: 30 | num_workers: int 31 | train_batch_size: int 32 | val_batch_size: int 33 | test_batch_size: int 34 | pin_memory: bool 35 | 36 | 37 | @dataclass 38 | class SpeechDataLoaderConfig: 39 | num_workers: int 40 | train_max_num_samples: int 41 | val_batch_size: int 42 | test_batch_size: int 43 | pin_memory: bool 44 | 45 | 46 | ################################################################################ 47 | # 48 | 49 | 50 | @dataclass 51 | class DebugWriter: 52 | @abstractmethod 53 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 54 | pass 55 | 56 
| 57 | @dataclass 58 | class BatchDebugInfo: 59 | # the original tensor which should be easily converted to 60 | # e.g an image/audio file 61 | original_tensor: t.Tensor 62 | 63 | # a list containing the progression steps from the original_tensor 64 | # to the network_input tensor accompanied with a class which can be 65 | # used to write debug output to a particular folder 66 | pipeline_progress: List[ 67 | Tuple[ 68 | t.Tensor, 69 | DebugWriter, 70 | ] 71 | ] 72 | 73 | # optional (untyped) dataset specific information 74 | # about the data sample 75 | meta: Optional[Dict[Any, Any]] 76 | -------------------------------------------------------------------------------- /src/data/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/speaker_data_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/speaker_data_module.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/training_batch_speaker.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/training_batch_speaker.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/__pycache__/voxceleb.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speaker/__pycache__/voxceleb.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speaker/speaker_data_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Abstract LightningDataModule for speaker recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List 10 | 11 | import pytorch_lightning 12 | 13 | from src.evaluation.speaker.speaker_recognition_evaluator import EvaluationPair 14 | 15 | 16 | 
################################################################################ 17 | # abstract class of a lightning data module for speaker recognition 18 | 19 | 20 | class SpeakerLightningDataModule(pytorch_lightning.LightningDataModule): 21 | @property 22 | @abstractmethod 23 | def num_speakers(self) -> int: 24 | pass 25 | 26 | @property 27 | @abstractmethod 28 | def val_pairs(self) -> List[EvaluationPair]: 29 | pass 30 | 31 | @property 32 | @abstractmethod 33 | def test_pairs(self) -> List[EvaluationPair]: 34 | pass 35 | 36 | @property 37 | @abstractmethod 38 | def summary(self): 39 | pass 40 | -------------------------------------------------------------------------------- /src/data/modules/speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__init__.py -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/librispeech.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/librispeech.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/speech_data_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/speech_data_module.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/__pycache__/training_batch_speech.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/modules/speech/__pycache__/training_batch_speech.cpython-38.pyc -------------------------------------------------------------------------------- /src/data/modules/speech/speech_data_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Abstract LightningDataModule for speech recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List 10 | 11 | import pytorch_lightning 12 | 13 | from src.tokenizer.base import BaseTokenizer 14 | 15 | ################################################################################ 16 | # abstract class of a lightning data module for speech recognition 17 | 18 | 19 | class SpeechLightningDataModule(pytorch_lightning.LightningDataModule): 20 | @property 21 | @abstractmethod 22 | def vocabulary(self) -> List[str]: 23 | pass 24 | 25 | @property 26 | @abstractmethod 27 | def
summary(self): 28 | pass 29 | 30 | @property 31 | @abstractmethod 32 | def tokenizer(self) -> BaseTokenizer: 33 | pass 34 | -------------------------------------------------------------------------------- /src/data/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/data/preprocess/__init__.py -------------------------------------------------------------------------------- /src/data/preprocess/audio_features.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Base API for preprocessors 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | from typing import Union, List 11 | 12 | import librosa 13 | import torch as t 14 | import torchaudio 15 | import seaborn 16 | 17 | from matplotlib import pyplot as plt 18 | from speechbrain.lobes.features import Fbank 19 | 20 | from src.data.common import DebugWriter 21 | from src.data.preprocess.base import Preprocessor 22 | from src.data.modules.speaker.training_batch_speaker import ( 23 | SpeakerClassificationDataSample, 24 | ) 25 | from src.util import debug_tensor_content 26 | 27 | ################################################################################ 28 | # base preprocessor 29 | 30 | 31 | class FilterBankDebugWriter(DebugWriter): 32 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 33 | debug_tensor_content(tensor, f"{idx:03d}_filterbank_features", save_dir) 34 | 35 | # make a plot of the filterbank values 36 | heatmap = seaborn.heatmap(tensor.cpu().numpy()) 37 | fig = heatmap.get_figure() 38 | fig.savefig(str(save_dir / f"{idx:03d}_filterbank_features.png")) 39 | plt.clf() 40 | 41 | # convert back to audio 42 | a1 = tensor.numpy().transpose() 43 | a1 = librosa.db_to_amplitude(a1) 44 | a1 = librosa.feature.inverse.mel_to_audio( 45 | a1, 46 | n_fft=400, 47 | fmin=0, 48 | fmax=8000, 49 | hop_length=160, 50 | win_length=16 * 25, 51 | center=False, 52 | power=1, 53 | n_iter=10, 54 | ) 55 | 56 | torchaudio.save( 57 | save_dir / f"{idx:03d}_filterbank_features.wav", 58 | t.Tensor(a1)[None, :], 59 | 16000, 60 | ) 61 | 62 | 63 | class FilterBank(Preprocessor): 64 | def __init__(self, n_mels: int = 40): 65 | self.fb = Fbank(n_mels=n_mels) 66 | 67 | def process( 68 | self, sample: SpeakerClassificationDataSample 69 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 70 | # expects an audio file of shape [1, NUM_AUDIO_SAMPLES] and converts 71 | # to [1, NUM_FRAMES, N_MELS] which is squeezed to [NUM_FRAMES, N_MELS] 72 | sample.network_input = self.fb(sample.network_input).squeeze() 73 | 74 | if sample.side_info is not None: 75 | sample.side_info.pipeline_progress.append( 76 | (sample.network_input, self.init_debug_writer()) 77 | ) 78 | 79 | return sample 80 | 81 | def init_debug_writer( 82 | self, 83 | ): 84 | return FilterBankDebugWriter() 85 | -------------------------------------------------------------------------------- /src/data/preprocess/base.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Base API for preprocessors 4 | # 5 | # Author(s): Nik Vaessen 6 | 
################################################################################ 7 | 8 | from abc import abstractmethod 9 | from typing import List, Union 10 | 11 | from src.data.common import DebugWriter 12 | from src.data.modules.speaker.training_batch_speaker import ( 13 | SpeakerClassificationDataSample, 14 | ) 15 | 16 | 17 | ################################################################################ 18 | # base preprocessor 19 | 20 | 21 | class Preprocessor: 22 | @abstractmethod 23 | def process( 24 | self, sample: SpeakerClassificationDataSample 25 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 26 | # process a sample in a particular way and generate one or more 27 | # new samples 28 | pass 29 | 30 | @abstractmethod 31 | def init_debug_writer( 32 | self, 33 | ) -> DebugWriter: 34 | pass 35 | -------------------------------------------------------------------------------- /src/data/preprocess/input_normalisation.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Normalization of the network input (e.g. 2D spectrograms) 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | from typing import Union, List 10 | 11 | import torch as t 12 | import seaborn 13 | 14 | from matplotlib import pyplot as plt 15 | 16 | from src.data.common import DebugWriter 17 | from src.data.preprocess.base import Preprocessor 18 | from src.data.modules.speaker.training_batch_speaker import ( 19 | SpeakerClassificationDataSample, 20 | ) 21 | from src.util import debug_tensor_content 22 | 23 | ################################################################################ 24 | # implementation of the input normalizer 25 | 26 | 27 | class InputNormalizerDebugWriter(DebugWriter): 28 | def write(self, tensor: t.Tensor, save_dir: pathlib.Path, idx: int): 29 | debug_tensor_content(tensor, f"{idx:03d}_normalized_features", save_dir) 30 | 31 | # make a plot of the normalized values 32 | heatmap = seaborn.heatmap(tensor.cpu().numpy()) 33 | fig = heatmap.get_figure() 34 | fig.savefig(str(save_dir / f"{idx:03d}_normalized_features.png")) 35 | plt.clf() 36 | 37 | 38 | class InputNormalizer2D(Preprocessor): 39 | def __init__( 40 | self, 41 | normalize_over_channels: bool = True, 42 | ): 43 | """ 44 | Normalize 2D spectograms.
45 | 46 | :param normalize_over_channels: whether to normalize over channels 47 | (when True) or over the whole spectogram (when False) 48 | """ 49 | super().__init__() 50 | 51 | self.channel_wise = normalize_over_channels 52 | 53 | @staticmethod 54 | def normalize(spectogram: t.Tensor, channel_wise: bool): 55 | if len(spectogram.shape) != 2: 56 | raise ValueError("expect to normalize over 2D input") 57 | 58 | if channel_wise: 59 | # calculate over last dimension 60 | # (assuming shape [NUM_FRAMES, NUM_FEATURES]) 61 | std, mean = t.std_mean(spectogram, dim=0) 62 | else: 63 | std, mean = t.std_mean(spectogram) 64 | 65 | normalized_spectogram = (spectogram - mean) / (std + 1e-5) 66 | 67 | return normalized_spectogram, mean, std 68 | 69 | def process( 70 | self, sample: SpeakerClassificationDataSample 71 | ) -> Union[SpeakerClassificationDataSample, List[SpeakerClassificationDataSample]]: 72 | x_norm, mean, std = self.normalize(sample.network_input, self.channel_wise) 73 | 74 | sample.network_input = x_norm 75 | 76 | if sample.side_info is not None: 77 | sample.side_info.pipeline_progress.append( 78 | (x_norm, self.init_debug_writer()) 79 | ) 80 | 81 | return sample 82 | 83 | def init_debug_writer(self): 84 | return InputNormalizerDebugWriter() 85 | -------------------------------------------------------------------------------- /src/data/util.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Utility functions related to data i/o 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import pathlib 9 | 10 | import torchaudio 11 | 12 | import numpy as np 13 | import torch as t 14 | 15 | ################################################################################ 16 | # read audio from wav file into a tensor 17 | 18 | 19 | def load_raw_audio(path: pathlib.Path) -> t.Tensor: 20 | """ 21 | Load the raw audio file at the specified path and return it as a tensor 22 | with shape [1, num_samples] with floating values between -1 and 1 23 | 24 | :param path: the path to the audio value 25 | :return: a tensor of shape [1, num_samples] of the raw audio 26 | """ 27 | tensor, sample_rate = torchaudio.load(str(path)) 28 | 29 | if sample_rate != 16000: 30 | raise ValueError( 31 | f"audio file {path} is expected to have a sampling" 32 | f" rate of 16000 while actually being {sample_rate}" 33 | ) 34 | 35 | return tensor 36 | 37 | 38 | ################################################################################ 39 | # read/save tensors 40 | 41 | 42 | def load_tensor(path: pathlib.Path, device=t.device("cpu")) -> t.Tensor: 43 | return t.load(path, map_location=device) 44 | 45 | 46 | def save_tensor(embedding: t.Tensor, save_path: pathlib.Path): 47 | save_path.parent.mkdir(exist_ok=True, parents=True) 48 | t.save(embedding, str(save_path)) 49 | 50 | 51 | ################################################################################ 52 | # hacky way to create a None tensor 53 | 54 | 55 | def create_nan_tensor(): 56 | return t.Tensor([np.nan]) 57 | 58 | 59 | def is_nan_tensor(tensor: t.Tensor): 60 | return t.all(t.isnan(tensor)).item() 61 | 62 | 63 | ################################################################################ 64 | # check if a value can cause nan/inf 65 | 66 | 67 | def tensor_has_inf(tensor: t.Tensor): 68 | return t.any(t.isinf(tensor)).item() 69 | 70 | 71 | def 
tensor_has_nan(tensor: t.Tensor): 72 | return t.any(t.isnan(tensor)).item() 73 | 74 | 75 | def is_invalid_tensor(tensor: t.Tensor): 76 | return tensor_has_inf(tensor) or tensor_has_nan(tensor) 77 | -------------------------------------------------------------------------------- /src/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/speaker/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speaker/lda.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement the LDA evaluation metric and evaluator. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import List, Tuple 9 | 10 | import torch as t 11 | from sklearn.decomposition import PCA 12 | 13 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 14 | 15 | from src.evaluation.speaker.cosine_distance import ( 16 | compute_cosine_scores, 17 | ) 18 | from src.evaluation.speaker.speaker_recognition_evaluator import ( 19 | EmbeddingSample, 20 | compute_mean_std_batch, 21 | center_batch, 22 | SpeakerRecognitionEvaluator, 23 | length_norm_batch, 24 | ) 25 | 26 | 27 | ################################################################################ 28 | # Implement an evaluator based on LDA scoring 29 | 30 | 31 | class LDAEvaluator(SpeakerRecognitionEvaluator): 32 | def __init__( 33 | self, 34 | center_before_scoring: bool, 35 | length_norm_before_scoring: bool, 36 | max_training_batches_to_fit: int, 37 | num_pca_components: int, 38 | center_before_fit_training_batches: bool, 39 | ): 40 | super().__init__( 41 | max_training_batches_to_fit=max_training_batches_to_fit, 42 | ) 43 | 44 | self.center_before_scoring = center_before_scoring 45 | self.length_norm_before_scoring = length_norm_before_scoring 46 | self.num_pca_components = num_pca_components 47 | self.center_before_fit_training_batches = center_before_fit_training_batches 48 | 49 | # set in self#fit_parameters 50 | self._lda_model: LinearDiscriminantAnalysis = None 51 | self._mean: t.Tensor = None 52 | self._std: t.Tensor = None 53 | 54 | def fit_parameters( 55 | self, embedding_tensors: List[t.Tensor], label_tensors: List[t.Tensor] 56 | ): 57 | # create a tensor of shape [BATCH_SIZE*len(embedding_tensors), EMBEDDING_SIZE] 58 | all_tensors = t.cat(embedding_tensors) 59 | 60 | # create a tensor of SHAPE [BATCH_SIZE*len(label_tensors),] 61 | all_labels = t.cat(label_tensors) 62 | 63 | if self.center_before_fit_training_batches: 64 | mean, std = compute_mean_std_batch(all_tensors) 65 | all_tensors = center_batch(all_tensors, mean, std) 66 | 67 | # convert to numpy 68 | all_tensors = all_tensors.detach().cpu().numpy() 69 | all_labels = all_labels.detach().cpu().numpy().tolist() 70 | 71 | # fit the projection model (a whitening PCA with the configured number of components) 72 | self._lda_model = PCA(n_components=self.num_pca_components, whiten=True) 73 | all_tensors_transformed =
self._lda_model.fit_transform(all_tensors, all_labels) 74 | 75 | # compute mean/std in latent space in order to do centering before 76 | # taking length norm 77 | self._mean, self._std = compute_mean_std_batch( 78 | t.Tensor(all_tensors_transformed) 79 | ) 80 | 81 | def reset_parameters(self): 82 | super().reset_parameters() 83 | self._lda_model = None 84 | 85 | def _compute_prediction_scores( 86 | self, pairs: List[Tuple[EmbeddingSample, EmbeddingSample]] 87 | ) -> List[float]: 88 | # get 2 tensors of size [NUM_SAMPLES, EMBEDDING_SIZE], 89 | # where the same row idx corresponds to a pair to score 90 | b1, b2 = self._transform_pairs_to_tensor(pairs) 91 | 92 | # convert to latent dimension 93 | b1 = self._lda_model.transform(b1.detach().cpu().numpy()) 94 | b2 = self._lda_model.transform(b2.detach().cpu().numpy()) 95 | 96 | # convert back to tensors 97 | b1 = t.Tensor(b1) 98 | b2 = t.Tensor(b2) 99 | 100 | if self.center_before_scoring: 101 | b1 = center_batch(b1, self._mean, self._std) 102 | b2 = center_batch(b2, self._mean, self._std) 103 | 104 | if self.length_norm_before_scoring: 105 | b1 = length_norm_batch(b1) 106 | b2 = length_norm_batch(b2) 107 | 108 | # compute scores based on centering, length norming and then 109 | # taking cosine distance 110 | return compute_cosine_scores(t.Tensor(b1), t.Tensor(b2)) 111 | -------------------------------------------------------------------------------- /src/evaluation/speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/evaluation/speech/__init__.py -------------------------------------------------------------------------------- /src/evaluation/speech/wer.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Calculating word-error-rate 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import List 9 | 10 | from jiwer import wer 11 | 12 | ################################################################################ 13 | # wrapper around jiwer 14 | 15 | 16 | def calculate_wer(transcriptions: List[str], ground_truths: List[str]): 17 | return wer(ground_truths, transcriptions) 18 | -------------------------------------------------------------------------------- /src/hydra_resolvers.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Custom resolvers for hydra configuration 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import uuid 9 | 10 | ################################################################################ 11 | # implement division of 2 digits 12 | 13 | 14 | def _parse_digit(d: str): 15 | try: 16 | d = int(d) 17 | except ValueError: 18 | try: 19 | d = float(d) 20 | except ValueError: 21 | raise ValueError(f"input {d} cannot be parsed as a digit") 22 | 23 | return d 24 | 25 | 26 | def division_resolver(numerator: str, denominator: str): 27 | return _parse_digit(numerator) / _parse_digit(denominator) 28 | 29 | 30 | def integer_division_resolver(numerator: str, denominator: str): 31 | return int(_parse_digit(numerator) // _parse_digit(denominator)) 32 | 33 | 34 | 
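# ----------------------------------------------------------------------------
# Editor's note (not part of the original file): the resolvers above are only
# useful once registered with OmegaConf, so they can be called from Hydra/yaml
# interpolations. A minimal sketch follows, assuming the resolver names
# "divide" and "idivide" (the names actually registered by the project's entry
# point may differ).
if __name__ == "__main__":
    from omegaconf import OmegaConf

    OmegaConf.register_new_resolver("divide", division_resolver)
    OmegaConf.register_new_resolver("idivide", integer_division_resolver)

    cfg = OmegaConf.create(
        {"total": 128, "gpus": 4, "per_gpu": "${idivide:${total},${gpus}}"}
    )
    print(cfg.per_gpu)  # -> 32
# ----------------------------------------------------------------------------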
################################################################################ 35 | # create a random UUID 36 | 37 | 38 | def random_uuid(): 39 | return uuid.uuid4().hex 40 | -------------------------------------------------------------------------------- /src/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/layers/__init__.py -------------------------------------------------------------------------------- /src/layers/embedding_masking.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Apply dropout on time and channel dimensions of wav2vec2 embedding 4 | # as described in https://arxiv.org/abs/2006.11477 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | from typing import List 10 | 11 | import torch as t 12 | import torch.nn as nn 13 | 14 | ################################################################################ 15 | # implementation as nn module 16 | 17 | 18 | class EmbeddingMasker(nn.Module): 19 | def __init__( 20 | self, 21 | timestep_mask_prob: float, 22 | timestep_mask_width: int, 23 | channel_mask_prob: float, 24 | channel_mask_width: int, 25 | time_dim: int = 1, 26 | embedding_dim: int = 2, 27 | ): 28 | if not (0 <= channel_mask_prob <= 1): 29 | raise ValueError( 30 | f"probability channel_mask_prob {channel_mask_prob} expected to " 31 | f"be in range [0,1]" 32 | ) 33 | if not (0 <= timestep_mask_prob <= 1): 34 | raise ValueError( 35 | f"probability timestep_mask_prob {timestep_mask_prob} expected to " 36 | f"be in range [0,1]" 37 | ) 38 | 39 | if time_dim == 0 or embedding_dim == 0: 40 | raise ValueError("dimensions to mask cannot be dim 0 (batch dimension)") 41 | 42 | super().__init__() 43 | 44 | self.timestep_mask_prob = timestep_mask_prob 45 | self.timestep_mask_width = timestep_mask_width 46 | self.channel_mask_prob = channel_mask_prob 47 | self.channel_mask_width = channel_mask_width 48 | 49 | self.time_dim = time_dim 50 | self.embedding_dim = embedding_dim 51 | 52 | def forward(self, embedding_tensor: t.Tensor): 53 | if not self.training or (self.timestep_mask_prob + self.channel_mask_prob == 0): 54 | return embedding_tensor 55 | 56 | assert len(embedding_tensor.shape) == 3 57 | 58 | num_time_steps = embedding_tensor.shape[self.time_dim] 59 | num_channels = embedding_tensor.shape[self.embedding_dim] 60 | 61 | # create mask with the same shape as embedding tensor 62 | m = t.ones(embedding_tensor.shape, device=embedding_tensor.device) 63 | 64 | # determine which time steps to mask 65 | if self.timestep_mask_prob > 0: 66 | time_masked = t.rand((num_time_steps,)) 67 | time_masked = ( 68 | t.where( 69 | time_masked <= self.timestep_mask_prob, t.Tensor([0]), t.Tensor([1]) 70 | ) 71 | .numpy() 72 | .tolist() 73 | ) 74 | 75 | time_masked = self.expand_mask(time_masked, self.timestep_mask_width) 76 | self.insert_into_mask(m, time_masked, 0, self.time_dim) 77 | 78 | # determine which channels to mask 79 | if self.channel_mask_prob > 0: 80 | channel_mask = t.rand((num_channels,)) 81 | channel_mask = ( 82 | t.where( 83 | channel_mask <= self.channel_mask_prob, t.Tensor([0]), t.Tensor([1]) 84 | ) 85 | .numpy() 86 | .tolist() 87 | ) 88 | 89 | channel_mask = self.expand_mask(channel_mask, self.channel_mask_width) 90 |
self.insert_into_mask(m, channel_mask, 0, self.embedding_dim) 91 | 92 | # mask and return the embedding 93 | return m * embedding_tensor 94 | 95 | @staticmethod 96 | def insert_into_mask( 97 | mask_tensor: t.Tensor, mask_list: List[int], mask_value: int, dim: int 98 | ): 99 | mask_idx = [idx for idx, value in enumerate(mask_list) if value == mask_value] 100 | 101 | if dim == 1: 102 | mask_tensor[:, mask_idx, :] = mask_value 103 | else: 104 | mask_tensor[:, :, mask_idx] = mask_value 105 | 106 | return mask_tensor 107 | 108 | @staticmethod 109 | def expand_mask( 110 | mask_list: List[int], mask_width: int, mask_value_to_expand: int = 0 111 | ): 112 | # repeat mask widths 113 | mask_idx = [] 114 | 115 | for idx, mask_value in enumerate(mask_list): 116 | if mask_value == mask_value_to_expand: 117 | mask_idx.append(idx) 118 | 119 | expanded_mask_list = t.Tensor(mask_list) 120 | for idx in mask_idx: 121 | expanded_mask_list[idx : (idx + mask_width)] = mask_value_to_expand 122 | 123 | return expanded_mask_list.numpy().tolist() 124 | -------------------------------------------------------------------------------- /src/layers/temporal_gating.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement temporal gating (squeeze-and-excitation layer) as described in: 4 | # 5 | # Wav2Spk: A Simple DNN Architecture for Learning Speaker Embeddings 6 | # from Waveforms 7 | # 8 | # https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1287.pdf 9 | # 10 | # Author(s): Nik Vaessen 11 | ################################################################################ 12 | 13 | import torch as t 14 | import torch.nn as nn 15 | 16 | ################################################################################ 17 | # pytorch module acting as temporal gate 18 | 19 | 20 | class TemporalGate(nn.Module): 21 | def __init__(self, num_features: int): 22 | super(TemporalGate, self).__init__() 23 | 24 | self.W: nn.Parameter = nn.Parameter( 25 | nn.init.xavier_normal_(t.ones((num_features, num_features))) 26 | ) 27 | self.b: nn.Parameter = nn.Parameter( 28 | nn.init.xavier_normal_(t.ones((num_features, 1))) 29 | ) 30 | 31 | def forward(self, x): 32 | # we expect the input x to have dimensionality 33 | # [BS, NUM_FEATURES, NUM_FRAMES] 34 | # so that W matmul x results in the same shape 35 | mask = t.sigmoid(self.W.matmul(x) + self.b) 36 | 37 | return t.mul(mask, x) 38 | -------------------------------------------------------------------------------- /src/lightning_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/__init__.py -------------------------------------------------------------------------------- /src/lightning_modules/base_lightning_module.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Define a base lightning module for speech and/or speaker recognition network. 
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import logging 9 | 10 | from abc import abstractmethod 11 | from typing import Callable, Optional 12 | 13 | import torch as t 14 | import torch.nn 15 | import pytorch_lightning as pl 16 | 17 | from omegaconf import DictConfig, OmegaConf 18 | 19 | 20 | ################################################################################ 21 | # Definition of speaker recognition API 22 | 23 | # A logger for this file 24 | 25 | log = logging.getLogger(__name__) 26 | 27 | 28 | class BaseLightningModule(pl.LightningModule): 29 | def __init__( 30 | self, 31 | hyperparameter_config: DictConfig, 32 | loss_fn_constructor: Callable[[], Callable[[t.Tensor, t.Tensor], t.Tensor]], 33 | auto_lr_find: Optional[ 34 | float 35 | ] = None, # will be automatically passed by pytorch-lightning to children 36 | ): 37 | super().__init__() 38 | 39 | # input arguments 40 | self.loss_fn = loss_fn_constructor() 41 | 42 | # created by set_methods 43 | self.optimizer = None 44 | self.schedule = None 45 | self.warmup_optimizer = None 46 | self.warmup_schedule = None 47 | 48 | # flag determining which optimizer/schedule `configure_optimizers` uses 49 | self.warmup_enabled = False 50 | 51 | # auto_lr_find is set when you don't want to train the model 52 | # but want plot a learning rate against loss 53 | self.auto_lr_find = auto_lr_find 54 | 55 | # log hyperparameters 56 | self.save_hyperparameters(OmegaConf.to_container(hyperparameter_config)) 57 | 58 | def set_optimizer(self, optimizer: t.optim.Optimizer): 59 | self.optimizer = optimizer 60 | 61 | def set_lr_schedule(self, schedule: t.optim.lr_scheduler._LRScheduler): 62 | self.schedule = schedule 63 | 64 | @abstractmethod 65 | def generate_example_input( 66 | self, include_batch_dimension: bool, batch_size: Optional[int] 67 | ): 68 | pass 69 | 70 | def configure_optimizers(self): 71 | if self.auto_lr_find: 72 | log.info("USING the `auto_lr_find` learning rate and optimizer!") 73 | return torch.optim.Adam(self.parameters(), lr=self.auto_lr_find) 74 | 75 | return [self.optimizer], [self.schedule] 76 | -------------------------------------------------------------------------------- /src/lightning_modules/multitask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/multitask/__init__.py -------------------------------------------------------------------------------- /src/lightning_modules/speaker/__init__.py: -------------------------------------------------------------------------------- 1 | from .dummy import DummyModule, DummyModuleConfig 2 | from .wav2spk import Wav2SpkModule, Wav2SpkModuleConfig 3 | from .wav2vec2_fc import Wav2vec2FCModule, Wav2vec2FCModuleConfig 4 | from .wav2vec2_paired_input import ( 5 | Wav2vec2PairedSpeakerModule, 6 | Wav2vec2PairedSpeakerModuleConfig, 7 | ) 8 | from .wav2vec_fc import Wav2vecFCModule, Wav2vecFCModuleConfig 9 | from .wav2vec_xvector import Wav2vecXVectorModule, Wav2vecXVectorModuleConfig 10 | from .xvector import XVectorModule, XVectorModuleConfig 11 | from .ecapa_tdnn import EcapaTdnnModule, EcapaTDNNModuleConfig 12 | 13 | from .paired_speaker_recognition_module import PairedSpeakerRecognitionLightningModule 14 | from .speaker_recognition_module import SpeakerRecognitionLightningModule 15 | 
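The BaseLightningModule above expects its optimizer and learning-rate schedule to be injected through `set_optimizer` and `set_lr_schedule` before `Trainer.fit` is called; `configure_optimizers` then simply returns them (or a plain Adam when `auto_lr_find` is set). The following is a minimal sketch of that wiring; the toy subclass and every name in it are hypothetical and not part of the project.

import torch as t

from omegaconf import OmegaConf

from src.lightning_modules.base_lightning_module import BaseLightningModule


class ToyModule(BaseLightningModule):
    # hypothetical subclass, used only to illustrate the optimizer wiring
    def __init__(self):
        super().__init__(OmegaConf.create({}), loss_fn_constructor=t.nn.MSELoss)
        self.layer = t.nn.Linear(4, 1)

    def generate_example_input(self, include_batch_dimension, batch_size=None):
        return t.rand(batch_size or 1, 4) if include_batch_dimension else t.rand(4)


module = ToyModule()
module.set_optimizer(t.optim.Adam(module.parameters(), lr=3e-4))
module.set_lr_schedule(t.optim.lr_scheduler.ExponentialLR(module.optimizer, gamma=0.95))

# configure_optimizers now returns ([optimizer], [schedule]) for the Trainer
optimizers, schedules = module.configure_optimizers()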
-------------------------------------------------------------------------------- /src/lightning_modules/speaker/dummy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implements a dummy module which uses very few parameters to generate 4 | # predictions/embeddings. 5 | # This is useful for debugging training schedules as it removes the 6 | # heavy computation for each step so a full training run can be executed 7 | # fairly quickly. 8 | # 9 | # Author(s): Nik Vaessen 10 | ################################################################################ 11 | 12 | from dataclasses import dataclass 13 | from typing import List, Optional, Callable 14 | 15 | import torch as t 16 | 17 | from omegaconf import DictConfig 18 | 19 | from src.evaluation.speaker.speaker_recognition_evaluator import ( 20 | EvaluationPair, 21 | SpeakerRecognitionEvaluator, 22 | ) 23 | from src.lightning_modules.speaker.speaker_recognition_module import ( 24 | SpeakerRecognitionLightningModule, 25 | ) 26 | 27 | ################################################################################ 28 | # Implementation of a very light-weight neural network 29 | 30 | 31 | @dataclass 32 | class DummyModuleConfig: 33 | pass 34 | 35 | 36 | class DummyModule(SpeakerRecognitionLightningModule): 37 | def __init__( 38 | self, 39 | hyperparameters_to_save: DictConfig, 40 | cfg: DummyModuleConfig, 41 | num_speakers: int, 42 | loss_fn_constructor: Callable[[], Callable[[t.Tensor, t.Tensor], t.Tensor]], 43 | validation_pairs: List[EvaluationPair], 44 | test_pairs: List[EvaluationPair], 45 | evaluator: SpeakerRecognitionEvaluator, 46 | ): 47 | super().__init__( 48 | hyperparameter_config=hyperparameters_to_save, 49 | num_speakers=num_speakers, 50 | embedding_size=2, 51 | loss_fn_constructor=loss_fn_constructor, 52 | validation_pairs=validation_pairs, 53 | test_pairs=test_pairs, 54 | evaluator=evaluator, 55 | embeddings_are_pooled=True 56 | ) 57 | 58 | self.cfg = cfg 59 | 60 | # just create a parameter so optimizer doesn't complain 61 | self.fc1 = t.nn.Linear(in_features=2, out_features=num_speakers) 62 | 63 | def generate_example_input( 64 | self, include_batch_dimension: bool, batch_size: Optional[int] = None 65 | ): 66 | # any input works really 67 | if include_batch_dimension: 68 | # [BATCH_SIZE, NUMBER_OF_WINDOWS, NUMBER_OF_MODEL_COEFFICIENTS] 69 | # the `100` varies depending on length of audio file 70 | # the `40` can be replaced by any other number of mel coefficients 71 | shape = [batch_size, 100, 40] 72 | else: 73 | # [NUMBER_OF_WINDOWS, NUMBER_OF_MODEL_COEFFICIENTS] 74 | # the `100` varies depending on length of audio file 75 | # the `40` can be replaced by any other number of mel coefficients 76 | shape = [100, 40] 77 | 78 | return t.rand(size=shape) 79 | 80 | def compute_speaker_embedding(self, input_tensor: t.Tensor) -> t.Tensor: 81 | std, mean = t.std_mean(input_tensor, dim=(1, 2)) 82 | embedding = t.stack([mean, std]).t() 83 | 84 | return embedding 85 | 86 | def compute_speaker_prediction(self, embedding_tensor: t.Tensor) -> t.Tensor: 87 | prediction = self.fc1(embedding_tensor) 88 | 89 | return prediction 90 | -------------------------------------------------------------------------------- /src/lightning_modules/speech/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/lightning_modules/speech/__init__.py -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/wav2vec.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Provide embeddings from raw audio with the wav2vec model from fairseq. 4 | # 5 | # See `download/download_pretrained_models.sh` for links to pretrained weights. 6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import pathlib 11 | 12 | import fairseq 13 | import pytorch_lightning 14 | import torch as t 15 | 16 | from fairseq.models.wav2vec import Wav2VecModel 17 | 18 | from src.util import reset_model 19 | 20 | ################################################################################ 21 | # loading wav2vec with fairseq 22 | 23 | 24 | def load_wav2vec_model( 25 | model_path: pathlib.Path, device: t.cuda.Device = t.device("cpu") 26 | ) -> Wav2VecModel: 27 | """ 28 | Load the wav2vec model. 29 | 30 | :param model_path: path to the ".pt" file of the model 31 | :param device: the device on which the model should be loaded 32 | :return: the wav2vec2 model on the specified device 33 | """ 34 | checkpoint = t.load(model_path) 35 | 36 | model = fairseq.models.wav2vec.Wav2VecModel.build_model(checkpoint["args"], None) 37 | model.load_state_dict(checkpoint["model"]) 38 | 39 | return model.to(device) 40 | 41 | 42 | ################################################################################ 43 | # computation of embedding 44 | 45 | 46 | def wav2vec_embed_raw_audio( 47 | input_tensor: t.Tensor, model: Wav2VecModel, aggregate: bool = False 48 | ) -> t.Tensor: 49 | """ 50 | Calculate a [1, 512, num_frames] embedding of a given [1, num_samples] audio file 51 | by using the Wav2Vec model. 52 | 53 | :param input_tensor: a raw audio input (between -1 and 1) with a sampling rate of 16000 Hz 54 | :param model: the wav2vec model 55 | :param aggregate whether to apply an aggregation to the initial features 56 | :return: The embedding with shape [1, 512, num_frames], where num_frames < num_samples. 
57 | """ 58 | z = model.feature_extractor(input_tensor) 59 | 60 | if not aggregate: 61 | return z 62 | else: 63 | return model.feature_aggregator(z) 64 | 65 | 66 | ################################################################################ 67 | # wrap the wav2vec model 68 | 69 | 70 | class Wav2VecWrapperModule(pytorch_lightning.LightningModule): 71 | def __init__( 72 | self, 73 | wav2vec_model_path: pathlib.Path, 74 | wav2vec_aggregation: bool = False, 75 | reset_weights: bool = False, 76 | ): 77 | super().__init__() 78 | 79 | self.model = load_wav2vec_model(wav2vec_model_path) 80 | self.use_aggregator = wav2vec_aggregation 81 | self.num_features = 512 82 | 83 | if reset_weights: 84 | reset_model(self.model) 85 | 86 | @property 87 | def num_embedding_features(self): 88 | return self.num_features 89 | 90 | def forward(self, wav_input: t.Tensor): 91 | # wav_input has shape [BATCH_SIZE, NUM_SAMPLES] 92 | embedding = wav2vec_embed_raw_audio(wav_input, self.model, self.use_aggregator) 93 | 94 | # return an embedding with shape [BATCH_SIZE, NUM_FEATURES, NUM_FRAMES] 95 | return embedding 96 | -------------------------------------------------------------------------------- /src/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/optim/__init__.py -------------------------------------------------------------------------------- /src/optim/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .aam_softmax import AngularAdditiveMarginSoftMaxLoss 2 | from .cross_entropy import CrossEntropyLoss 3 | from .triplet_loss import TripletLoss 4 | from .triplet_ce_loss import TripletCrossEntropyLoss 5 | from .ctc_loss import CtcLoss -------------------------------------------------------------------------------- /src/optim/loss/aam_softmax.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implementation of angular additive margin softmax loss. 
4 | # 5 | # Adapted from: https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py 6 | # 7 | # Author(s): Nik Vaessen 8 | ################################################################################ 9 | 10 | import torch 11 | 12 | import torch as t 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | import math 17 | 18 | ################################################################################ 19 | # wrap around aam-loss implementation 20 | 21 | 22 | class AngularAdditiveMarginSoftMaxLoss(t.nn.Module): 23 | def __init__( 24 | self, 25 | input_features, 26 | output_features, 27 | margin=0.3, 28 | scale=15, 29 | easy_margin=False, 30 | ): 31 | super(AngularAdditiveMarginSoftMaxLoss, self).__init__() 32 | 33 | self.margin = margin 34 | self.scale = scale 35 | self.input_features = input_features 36 | self.fc_weights = torch.nn.Parameter( 37 | torch.FloatTensor(output_features, input_features), requires_grad=True 38 | ) 39 | self.ce = nn.CrossEntropyLoss() 40 | nn.init.xavier_normal_(self.fc_weights, gain=1) 41 | 42 | self.easy_margin = easy_margin 43 | self.cos_m = math.cos(self.margin) 44 | self.sin_m = math.sin(self.margin) 45 | 46 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 47 | self.th = math.cos(math.pi - self.margin) 48 | self.mm = math.sin(math.pi - self.margin) * self.margin 49 | 50 | def forward(self, x, label=None): 51 | assert x.size()[0] == label.size()[0] 52 | assert x.size()[1] == self.input_features 53 | 54 | # cos(theta) 55 | cosine = F.linear(F.normalize(x), F.normalize(self.fc_weights)) 56 | # cos(theta + m) 57 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 58 | phi = cosine * self.cos_m - sine * self.sin_m 59 | 60 | if self.easy_margin: 61 | phi = torch.where(cosine > 0, phi, cosine) 62 | else: 63 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 64 | 65 | # one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 66 | one_hot = torch.zeros_like(cosine) 67 | one_hot.scatter_(1, label.view(-1, 1), 1) 68 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 69 | output = output * self.scale 70 | 71 | loss = self.ce(output, label) 72 | prediction = F.softmax(output, dim=1) 73 | 74 | return loss, prediction 75 | -------------------------------------------------------------------------------- /src/optim/loss/binary_cross_entropy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Wrap around implementation of binary cross-entropy loss. 
4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | ################################################################################ 13 | # wrap around cross-entropy loss of PyTorch 14 | 15 | 16 | class BinaryCrossEntropyLoss(t.nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | 20 | self.softmax = nn.LogSoftmax(dim=1) 21 | 22 | def forward(self, logits: t.Tensor, label_indexes: t.Tensor): 23 | return self._bce_loss(logits, label_indexes) 24 | 25 | def _bce_loss(self, logits: t.Tensor, label_indexes: t.Tensor): 26 | # logits (unnormalized quantities on which sigmoid is applied) 27 | # with shape [BATCH_SIZE, 1] and 28 | # label indexes (integers in {0, 1}) with shape [BATCH SIZE] 29 | logits = logits.squeeze().to(t.float32) 30 | label_indexes = label_indexes.squeeze().to(t.float32) 31 | 32 | loss = F.binary_cross_entropy_with_logits(logits, label_indexes) 33 | 34 | with t.no_grad(): 35 | # put predictions into [0, 1] range for later calculation of accuracy 36 | prediction = t.sigmoid(logits).detach() 37 | 38 | return loss, prediction 39 | -------------------------------------------------------------------------------- /src/optim/loss/cross_entropy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implementation of Cross-entropy loss. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | import torch.nn.functional as F 10 | 11 | ################################################################################ 12 | # wrap around PyTorch cross-entropy loss implementation 13 | 14 | 15 | class CrossEntropyLoss(t.nn.Module): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def forward(self, logits: t.Tensor, label_indexes: t.Tensor): 20 | return self._ce_loss(logits, label_indexes) 21 | 22 | def _ce_loss(self, logits: t.Tensor, label_indexes: t.Tensor): 23 | # logits (unnormalized quantities on which softmax is applied) 24 | # with shape [BATCH_SIZE, NUM_SPEAKERS] and 25 | # label indexes (integers in range [0, NUM_SPEAKERS-1]) 26 | # with shape [BATCH SIZE] 27 | loss = F.cross_entropy(logits, label_indexes) 28 | 29 | with t.no_grad(): 30 | # put predictions into [0, 1] range for later calculation of accuracy 31 | prediction = F.softmax(logits, dim=1).detach() 32 | 33 | return loss, prediction 34 | -------------------------------------------------------------------------------- /src/optim/loss/ctc_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # CTC loss for speech recognition 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import torch as t 9 | 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | ################################################################################ 14 | # wrapper around ctc loss of pytorch 15 | 16 | 17 | class CtcLoss(nn.Module): 18 | def __init__(self, blank_idx: int = 0): 19 | super().__init__() 20 | 21 | self.blank_idx = blank_idx 22 | 23 | def forward( 24 | self, 25 | predictions: t.Tensor, 26 | prediction_lengths: 
t.Tensor, 27 | ground_truths: t.Tensor, 28 | ground_truth_lengths: t.Tensor, 29 | ): 30 | original_device = predictions.device 31 | assert original_device == predictions.device == ground_truths.device 32 | 33 | # predictions will be shape [BATCH_SIZE, MAX_INPUT_SEQUENCE_LENGTH, CLASSES] 34 | # expected to be [MAX_INPUT_SEQUENCE_LENGTH, BATCH_SIZE, CLASSES] for 35 | # loss function 36 | predictions = t.transpose(predictions, 0, 1) 37 | 38 | # they also need to be log probabilities 39 | predictions = F.log_softmax(predictions, dim=2) 40 | 41 | # prediction lengths will be shape [BATCH_SIZE] 42 | pass # already OK 43 | 44 | # ground truths will be shape [BATCH_SIZE, MAX_TARGET_SEQUENCE_LENGTH] 45 | pass # already OK 46 | 47 | # ground_truth_lengths will be shape [BATCH_SIZE] 48 | pass # already OK 49 | 50 | # ctc loss expects every tensor to be on CPU 51 | return F.ctc_loss( 52 | log_probs=predictions.to("cpu"), 53 | targets=ground_truths.to("cpu"), 54 | input_lengths=prediction_lengths.to("cpu"), 55 | target_lengths=ground_truth_lengths.to("cpu"), 56 | blank=self.blank_idx, 57 | zero_infinity=True, # prevents any weird crashes 58 | ).to(original_device) 59 | -------------------------------------------------------------------------------- /src/optim/loss/triplet_ce_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a wrapper around triplet loss and cross-entropy loss 4 | # for speaker recognition embeddings 5 | # 6 | # Author(s): Nik Vaessen 7 | ################################################################################ 8 | 9 | import torch as t 10 | 11 | from src.optim.loss.cross_entropy import CrossEntropyLoss 12 | from src.optim.loss.triplet_loss import TripletLoss 13 | 14 | ################################################################################ 15 | # wrapper combining cross-entropy and triplet loss 16 | 17 | 18 | class TripletCrossEntropyLoss(TripletLoss, CrossEntropyLoss): 19 | def __init__(self, c_ce: float = 1, c_triplet: float = 1): 20 | super().__init__() 21 | 22 | if c_ce < 1 or c_triplet < 1: 23 | raise ValueError( 24 | f"constants need to be natural numbers, while" f"{c_ce=}, {c_triplet=}" 25 | ) 26 | 27 | self.c_ce = c_ce 28 | self.c_triplet = c_triplet 29 | 30 | def forward(self, embeddings: t.Tensor, logits: t.Tensor, label_indexes: t.Tensor): 31 | ce_loss, prediction = self._ce_loss(logits, label_indexes) 32 | triplet_loss = self._triplet_loss(embeddings, label_indexes) 33 | 34 | loss = self.c_ce * ce_loss + self.c_triplet * triplet_loss 35 | 36 | return loss, prediction 37 | -------------------------------------------------------------------------------- /src/optim/loss/triplet_loss.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a wrapper around triplet loss for speaker recognition embeddings 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | import random 9 | 10 | from typing import List 11 | 12 | import torch 13 | import torch as t 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | ################################################################################ 18 | # wrapper of triplet loss 19 | 20 | 21 | class TripletLoss(nn.Module): 22 | def __init__(self, margin: float = 
1): 23 | super().__init__() 24 | 25 | self.margin = margin 26 | 27 | def forward(self, embeddings: t.Tensor, label_indexes: t.Tensor): 28 | return self._triplet_loss(embeddings, label_indexes) 29 | 30 | def _triplet_loss(self, embeddings: t.Tensor, label_indexes: t.Tensor): 31 | # embeddings with shape [BATCH_SIZE, EMBEDDING_SIZE] and 32 | # label indexes (integers in range [0, NUM_SPEAKERS-1]) 33 | # with shape [BATCH SIZE] 34 | 35 | # make sure we can generate triplets for each label 36 | with torch.no_grad(): 37 | label_list: List[int] = label_indexes.detach().cpu().numpy().tolist() 38 | 39 | self.verify_labels(label_list) 40 | 41 | # generate a triplet for each batch dimension 42 | anchors = [] 43 | positives = [] 44 | negatives = [] 45 | 46 | for batch_dim in range(embeddings.shape[0]): 47 | # get anchor 48 | label = label_indexes[batch_dim] 49 | anchor = embeddings[batch_dim].squeeze() 50 | 51 | # find positive 52 | positive = self._find_positive( 53 | embeddings=embeddings, 54 | label_list=label_list, 55 | label=label, 56 | exclude_idx=batch_dim, 57 | ) 58 | 59 | # find negative 60 | negative = self._find_negative( 61 | embeddings=embeddings, label_list=label_list, label=label 62 | ) 63 | 64 | # save anchor, positive, negative tuple 65 | anchors.append(anchor) 66 | positives.append(positive) 67 | negatives.append(negative) 68 | 69 | return F.triplet_margin_loss( 70 | anchor=t.stack(anchors), 71 | positive=t.stack(positives), 72 | negative=t.stack(negatives), 73 | margin=self.margin, 74 | ) 75 | 76 | @staticmethod 77 | def cosine_distance(a: t.Tensor, b: t.Tensor): 78 | return 1 - t.div(F.cosine_similarity(a, b) + 1, 2) 79 | 80 | @staticmethod 81 | def _find_positive( 82 | embeddings: t.Tensor, label_list: List[int], label: int, exclude_idx: int 83 | ) -> t.Tensor: 84 | candidate_indexes = [ 85 | idx for idx, l in enumerate(label_list) if label == l and exclude_idx != idx 86 | ] 87 | 88 | idx = random.choice(candidate_indexes) 89 | 90 | return embeddings[idx].squeeze() 91 | 92 | @staticmethod 93 | def _find_negative( 94 | embeddings: t.Tensor, label_list: List[int], label: int 95 | ) -> t.Tensor: 96 | candidate_indexes = [idx for idx, l in enumerate(label_list) if label != l] 97 | 98 | idx = random.choice(candidate_indexes) 99 | 100 | return embeddings[idx].squeeze() 101 | 102 | @staticmethod 103 | def verify_labels(label_list: List[int]): 104 | unique_labels = set(label_list) 105 | 106 | for label in unique_labels: 107 | assert label_list.count(label) >= 2 108 | -------------------------------------------------------------------------------- /src/optim/schedule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/optim/schedule/__init__.py -------------------------------------------------------------------------------- /src/optim/schedule/tri_stage.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Implement a class which can be given as function to `LambdaLR` to act 4 | # as a tri-stage learning rate with: 5 | # 1. a linear warmup phase from `initial_lr` to `base_lr` 6 | # 2. a constant phase of `base_lr` 7 | # 3. 
an exponential decay phase from `base_lr` to `final_lr` 8 | # 9 | # The learning rate of the optimizer is expected to be set to `base_lr` and 10 | # you should use `steps` and not `epochs` as update interval. 11 | # 12 | # Author(s): Nik Vaessen 13 | ################################################################################ 14 | 15 | import math 16 | import torch as t 17 | 18 | ################################################################################ 19 | # implementation of tri-stage LambdaLR function 20 | 21 | 22 | class TriStageLearningRateLambdaLRFunction: 23 | @staticmethod 24 | def is_valid_ratio(ratio: float): 25 | return 0 <= ratio <= 1 26 | 27 | def __init__( 28 | self, 29 | max_steps: int, 30 | warmup_stage_ratio: float, 31 | constant_stage_ratio: float, 32 | decay_stage_ratio: float, 33 | initial_lr: float, 34 | base_lr: float, 35 | final_lr: float, 36 | ): 37 | if not ( 38 | self.is_valid_ratio(warmup_stage_ratio) 39 | and self.is_valid_ratio(constant_stage_ratio) 40 | and self.is_valid_ratio(decay_stage_ratio) 41 | ): 42 | raise ValueError() 43 | 44 | if ( 45 | abs((warmup_stage_ratio + constant_stage_ratio + decay_stage_ratio) - 1) 46 | >= 1e-9 47 | ): 48 | raise ValueError("stage ratio's need to add up to 1") 49 | 50 | # stage computation 51 | self.max_steps = max_steps 52 | 53 | if self.max_steps is None: 54 | raise ValueError( 55 | "TriStage learning rate schedule requires setting `max_steps` " 56 | "in the trainer" 57 | ) 58 | 59 | self.warmup_stage_steps = math.floor(self.max_steps * warmup_stage_ratio) 60 | self.constant_stage_steps = math.floor(self.max_steps * constant_stage_ratio) 61 | self.decay_stage_steps = math.floor(self.max_steps * decay_stage_ratio) 62 | 63 | self.initial_lr = initial_lr 64 | self.base_lr = base_lr 65 | self.final_lr = final_lr 66 | 67 | # warmup_stage lin_space 68 | self.warmup_stage_space = ( 69 | t.linspace(self.initial_lr, self.base_lr, steps=self.warmup_stage_steps) 70 | .cpu() 71 | .numpy() 72 | .tolist() 73 | ) 74 | self.decay_stage_space = ( 75 | t.logspace( 76 | math.log(self.base_lr), 77 | math.log(self.final_lr), 78 | steps=self.decay_stage_steps + 2, 79 | base=math.e, 80 | ) 81 | .cpu() 82 | .numpy() 83 | .tolist() 84 | ) 85 | 86 | def __call__(self, step_count: int): 87 | if step_count < self.warmup_stage_steps: 88 | desired_lr = self.warmup_stage_space[step_count] 89 | elif step_count <= self.warmup_stage_steps + self.constant_stage_steps: 90 | desired_lr = self.base_lr 91 | elif step_count <= self.max_steps: 92 | desired_lr = self.decay_stage_space[ 93 | step_count - (self.warmup_stage_steps + self.constant_stage_steps) 94 | ] 95 | else: 96 | desired_lr = self.final_lr 97 | 98 | factor = desired_lr / self.base_lr 99 | return factor 100 | -------------------------------------------------------------------------------- /src/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikvaessen/w2v2-speaker/1505fa32aa832984983710b52dad22b0092b3efb/src/tokenizer/__init__.py -------------------------------------------------------------------------------- /src/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # base API for a tokenizer 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from abc import abstractmethod 9 | from 
typing import Dict 10 | 11 | import torch as t 12 | 13 | ################################################################################ 14 | # base API 15 | 16 | 17 | class BaseTokenizer: 18 | @abstractmethod 19 | def encode_string(self, string: str) -> t.Tensor: 20 | pass 21 | 22 | @abstractmethod 23 | def decode_tensor(self, token_tensor: t.Tensor): 24 | pass 25 | 26 | @abstractmethod 27 | def vocabulary_dictionary(self) -> Dict[str, int]: 28 | pass 29 | 30 | @abstractmethod 31 | def vocabulary_size(self) -> int: 32 | pass 33 | 34 | @abstractmethod 35 | def special_tokens_dictionary(self) -> Dict[str, int]: 36 | pass 37 | 38 | @abstractmethod 39 | def blank_token_id(self) -> int: 40 | pass 41 | -------------------------------------------------------------------------------- /src/tokenizer/tokenizer_wav2vec2.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Tokenizer for the wav2vec2 network. 4 | # 5 | # Author(s): Nik Vaessen 6 | ################################################################################ 7 | 8 | from typing import Dict, List 9 | 10 | from attr import dataclass 11 | from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer 12 | 13 | import torch as t 14 | 15 | from src.tokenizer.base import BaseTokenizer 16 | 17 | ################################################################################ 18 | # wrapper around huggingfacae tokenizer 19 | 20 | 21 | @dataclass 22 | class Wav2vec2TokenizerConfig: 23 | tokenizer_huggingface_id: str 24 | 25 | 26 | class Wav2vec2Tokenizer(BaseTokenizer): 27 | def __init__(self, cfg: Wav2vec2TokenizerConfig): 28 | self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained( 29 | cfg.tokenizer_huggingface_id 30 | ) 31 | 32 | def encode_string(self, string: str) -> t.Tensor: 33 | return t.IntTensor(self.tokenizer(string).input_ids) 34 | 35 | def decode_tensor(self, token_tensor: t.Tensor) -> str: 36 | assert len(token_tensor.shape) == 1 37 | 38 | decoded_str = self.tokenizer.decode(token_tensor) 39 | 40 | return decoded_str 41 | 42 | def vocabulary_dictionary(self) -> Dict[str, int]: 43 | return self.tokenizer.get_vocab() 44 | 45 | def vocabulary_size(self) -> int: 46 | return self.tokenizer.vocab_size 47 | 48 | def special_tokens_dictionary(self) -> Dict[str, int]: 49 | return self.tokenizer.special_tokens_map 50 | 51 | def blank_token_id(self) -> int: 52 | return 0 53 | --------------------------------------------------------------------------------
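To close, a minimal usage sketch for the tokenizer wrapper above. The Hugging Face checkpoint id is an assumption made for illustration; the id used by the project's tokenizer configuration may differ.

from src.tokenizer.tokenizer_wav2vec2 import (
    Wav2vec2Tokenizer,
    Wav2vec2TokenizerConfig,
)

# "facebook/wav2vec2-base-960h" is an assumption for this sketch
tokenizer = Wav2vec2Tokenizer(
    Wav2vec2TokenizerConfig(tokenizer_huggingface_id="facebook/wav2vec2-base-960h")
)

ids = tokenizer.encode_string("HELLO WORLD")  # IntTensor of character token ids
text = tokenizer.decode_tensor(ids)           # back to "HELLO WORLD"
blank = tokenizer.blank_token_id()            # id used as the blank symbol by CtcLoss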