├── Gemini_df_resnet.py
├── inference_demo.py
└── README.md

/Gemini_df_resnet.py:
--------------------------------------------------------------------------------
# Model file for Gemini DF-ResNet in 'Golden Gemini is All You Need:
# Finding the Sweet Spots for Speaker Verification'
# https://arxiv.org/abs/2312.03620

# Author: Tianchi Liu
# Special thanks to the author of DF-ResNet: Dr. Bei Liu

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Inverted_Bottleneck(nn.Module):
    def __init__(self, dim):
        super(Inverted_Bottleneck, self).__init__()
        # 1x1 expansion -> 3x3 depthwise -> 1x1 projection, with a residual connection
        self.conv1 = nn.Conv2d(dim, 4 * dim, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(4 * dim)
        self.conv2 = nn.Conv2d(4 * dim, 4 * dim, kernel_size=3, padding=1, groups=4 * dim, bias=False)
        self.bn2 = nn.BatchNorm2d(4 * dim)
        self.conv3 = nn.Conv2d(4 * dim, dim, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(dim)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += x
        out = F.relu(out)
        return out


class Gemini_DF_ResNet(nn.Module):
    # DF-ResNet backbone with the Golden Gemini T14c stride strategy
    def __init__(self, depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=40, emb_dim=128, feat_type='fbank', sr=16000):
        super(Gemini_DF_ResNet, self).__init__()
        self.feat_dim = feat_dim
        self.emb_dim = emb_dim
        # feat_type and sr are accepted for config compatibility; feature extraction
        # is expected to happen outside the model (see inference_demo.py)
        self.feat_type = feat_type

        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(1, dims[0], kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(dims[0]),
            nn.ReLU()
        )
        self.downsample_layers.append(stem)

        # Golden Gemini T14c stride strategy: halve frequency at every stage,
        # but downsample time only once
        stride_f = [2, 2, 2, 2]
        stride_t = [1, 2, 1, 1]

        for i in range(4):
            downsample_layer = nn.Sequential(
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=3, stride=(stride_f[i], stride_t[i]), padding=1, bias=False),
                nn.BatchNorm2d(dims[i + 1])
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList()
        for i in range(4):
            stage = nn.Sequential(
                *[Inverted_Bottleneck(dim=dims[i + 1]) for j in range(depths[i])]
            )
            self.stages.append(stage)

        # After 4 frequency downsamplings, ceil(feat_dim / 16) frequency bins remain;
        # statistics pooling concatenates mean and std, doubling the dimension.
        # (Equals the original ceil(feat_dim / 8) * dims[-1] for feat_dim=80, but is
        # also correct for feature dimensions that are not multiples of 16.)
        self.embedding = nn.Linear(2 * math.ceil(feat_dim / 16) * dims[-1], emb_dim)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B, T, F) => (B, F, T)

        x = x.unsqueeze(1)
        x = self.downsample_layers[0](x)
        x = self.downsample_layers[1](x)
        x = self.stages[0](x)
        x = self.downsample_layers[2](x)
        x = self.stages[1](x)
        x = self.downsample_layers[3](x)
        x = self.stages[2](x)
        x = self.downsample_layers[4](x)
        x = self.stages[3](x)

        # statistics pooling over the time axis
        pooling_mean = torch.mean(x, dim=-1)
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-10)
        out = torch.cat((torch.flatten(pooling_mean, start_dim=1),
                         torch.flatten(pooling_std, start_dim=1)), 1)

        embedding = self.embedding(out)
        return embedding


# the following models do NOT include the separate downsampling layers in the layer count
def Gemini_df_resnet56(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet60 in the paper
    return Gemini_DF_ResNet(depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)
def Gemini_df_resnet110(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet114 in the paper
    return Gemini_DF_ResNet(depths=[3, 3, 27, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet179(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet183 in the paper
    return Gemini_DF_ResNet(depths=[3, 8, 45, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


# the following models DO include the separate downsampling layers in the layer count
def Gemini_df_resnet60(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet114(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 3, 27, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet183(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 8, 45, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


if __name__ == '__main__':
    net = Gemini_df_resnet56(80, 256)  # fixed: the builder is lowercase Gemini_df_resnet56
    x = torch.randn(2, 200, 80)  # (batch, frames, feat_dim) fbank features, not a raw waveform
    out = net(x)
    print(out.shape)  # torch.Size([2, 256])
--------------------------------------------------------------------------------
/inference_demo.py:
--------------------------------------------------------------------------------
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import torch.nn.functional as F
from Gemini_df_resnet import Gemini_df_resnet179


def cosine_similarity(embedding1, embedding2):
    """
    Calculate the cosine similarity between two embeddings.

    Args:
        embedding1 (torch.Tensor): First embedding of shape (1, 256).
        embedding2 (torch.Tensor): Second embedding of shape (1, 256).

    Returns:
        float: Cosine similarity between the two embeddings.
    """
17 | """ 18 | assert embedding1.shape == (1, 256), "embedding1 must have shape (1, 256)" 19 | assert embedding2.shape == (1, 256), "embedding2 must have shape (1, 256)" 20 | 21 | # Normalize the embeddings to unit vectors 22 | embedding1_normalized = F.normalize(embedding1, p=2, dim=1) 23 | embedding2_normalized = F.normalize(embedding2, p=2, dim=1) 24 | 25 | # Compute cosine similarity 26 | similarity = torch.sum(embedding1_normalized * embedding2_normalized) 27 | 28 | return similarity.item() 29 | 30 | def compute_fbank(waveform, 31 | sample_rate, 32 | num_mel_bins=80, 33 | frame_length=25, 34 | frame_shift=10, 35 | dither=1.0, 36 | flag_apply_cmvn=True): 37 | """ Extract fbank 38 | """ 39 | waveform = waveform * (1 << 15) 40 | mat = kaldi.fbank(waveform, 41 | num_mel_bins=num_mel_bins, 42 | frame_length=frame_length, 43 | frame_shift=frame_shift, 44 | dither=dither, 45 | sample_frequency=sample_rate, 46 | window_type='hamming', 47 | use_energy=False) 48 | if flag_apply_cmvn: 49 | mat = apply_cmvn(mat=mat) 50 | print("Notification: cmvn is applied") 51 | return mat 52 | 53 | def apply_cmvn(mat, norm_mean=True, norm_var=False): 54 | """ Apply CMVN 55 | """ 56 | if norm_mean: 57 | mat = mat - torch.mean(mat, dim=0) 58 | if norm_var: 59 | mat = mat / torch.sqrt(torch.var(mat, dim=0) + 1e-8) 60 | return mat 61 | 62 | # load model 63 | sv_net = Gemini_df_resnet179(80, 256) 64 | sv_net.load_state_dict(torch.load('./0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165-LM/models/final_model.pt'), strict=False) 65 | sv_net.to('cuda') 66 | 67 | # example wav paths 68 | wav_path_spk1_utt1 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10050/6OUWWa4tdJw/00001.wav' 69 | wav_path_spk1_utt2 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10050/Yo0U6EbyVJg/00001.wav' 70 | wav_path_spk2_utt1 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10051/hEnGRr7qNUY/00001.wav' 71 | wav_path_spk2_utt2 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10051/v8znF6-r-D8/00001.wav' 72 | 73 | # load wav files 74 | wav_spk1_utt1, sample_rate_spk1_utt1 = torchaudio.load(wav_path_spk1_utt1) 75 | wav_spk1_utt2, sample_rate_spk1_utt2 = torchaudio.load(wav_path_spk1_utt2) 76 | wav_spk2_utt1, sample_rate_spk2_utt1 = torchaudio.load(wav_path_spk2_utt1) 77 | wav_spk2_utt2, sample_rate_spk2_utt2 = torchaudio.load(wav_path_spk2_utt2) 78 | assert sample_rate_spk1_utt1 == sample_rate_spk1_utt2 == sample_rate_spk2_utt1 == sample_rate_spk2_utt2 == 16000 79 | 80 | # extract Fbank and apply cmvn 81 | fea_spk1_utt1 = compute_fbank(wav_spk1_utt1, sample_rate_spk1_utt1, flag_apply_cmvn=True) 82 | fea_spk1_utt2 = compute_fbank(wav_spk1_utt2, sample_rate_spk1_utt2, flag_apply_cmvn=True) 83 | fea_spk2_utt1 = compute_fbank(wav_spk2_utt1, sample_rate_spk2_utt1, flag_apply_cmvn=True) 84 | fea_spk2_utt2 = compute_fbank(wav_spk2_utt2, sample_rate_spk2_utt2, flag_apply_cmvn=True) 85 | 86 | # extract embeddings 87 | embd_spk1_utt1 = sv_net(fea_spk1_utt1.to('cuda').unsqueeze(0)) 88 | embd_spk1_utt2 = sv_net(fea_spk1_utt2.to('cuda').unsqueeze(0)) 89 | embd_spk2_utt1 = sv_net(fea_spk2_utt1.to('cuda').unsqueeze(0)) 90 | embd_spk2_utt2 = sv_net(fea_spk2_utt2.to('cuda').unsqueeze(0)) 91 | 92 | # output cosine similarity 93 | print('\n--- inference for same speaker, the cosine similarity should be closer to 1.0 ---') 94 | print("cosine similarity of spk1_utt1 and spk1_utt2:", cosine_similarity(embd_spk1_utt1, embd_spk1_utt2)) 95 | print("cosine similarity of spk2_utt1 and spk2_utt2:", 
print('\n--- inference for different speaker, the cosine similarity should be closer to 0.0 ---')
print("cosine similarity of spk1_utt1 and spk2_utt1:", cosine_similarity(embd_spk1_utt1, embd_spk2_utt1))
print("cosine similarity of spk1_utt1 and spk2_utt2:", cosine_similarity(embd_spk1_utt1, embd_spk2_utt2))
print("cosine similarity of spk1_utt2 and spk2_utt1:", cosine_similarity(embd_spk1_utt2, embd_spk2_utt1))
print("cosine similarity of spk1_utt2 and spk2_utt2:", cosine_similarity(embd_spk1_utt2, embd_spk2_utt2))

# outputs of the demo (exact values vary slightly from run to run because dither defaults to 1.0)
'''
Notification: cmvn is applied
Notification: cmvn is applied
Notification: cmvn is applied
Notification: cmvn is applied

--- inference for same speaker, the cosine similarity should be closer to 1.0 ---
cosine similarity of spk1_utt1 and spk1_utt2: 0.6580173373222351
cosine similarity of spk2_utt1 and spk2_utt2: 0.6858957409858704

--- inference for different speaker, the cosine similarity should be closer to 0.0 ---
cosine similarity of spk1_utt1 and spk2_utt1: 0.14298376441001892
cosine similarity of spk1_utt1 and spk2_utt2: 0.1417258083820343
cosine similarity of spk1_utt2 and spk2_utt1: 0.06481592357158661
cosine similarity of spk1_utt2 and spk2_utt2: 0.09413496404886246
'''
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Golden-Gemini-for-Speaker-Verification
🔥🔥🔥 Official release of pretrained models and scripts for ♊ 'Golden Gemini Is All You Need: Finding the Sweet Spots for Speaker Verification', accepted by IEEE/ACM Transactions on Audio, Speech, and Language Processing (TASLP), 2024.

(Free access) IEEE link: https://ieeexplore.ieee.org/document/10497864

arXiv link: https://arxiv.org/abs/2312.03620

# Update:
Jan. 2025, **Added an inference demo script.** Considering the need for simple inference with pre-trained models, we provide an inference demo script, inference_demo.py.

# Note:

1. ***[Important]*** This repository is dedicated to sharing the pretrained models from our paper for convenient usage. For training and inference, we recommend using the Gemini DF-ResNet now available in WeSpeaker: https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2. We extend our gratitude to the WeSpeaker community for their support, with special thanks to Dr. Wang Shuai.

2. Special thanks to Dr. Liu Bei for sharing the implementation details related to DF-ResNet (https://ieeexplore.ieee.org/document/10119228).

3. [New] We also release a large-margin finetuned pretrained model.
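# Quick Start

If you just want to score a pair of utterances with a downloaded checkpoint, the sketch below condenses inference_demo.py into a few lines. The checkpoint and wav paths are placeholders (point them at your own files, e.g. a model from the table below), and dither is set to 0.0 here for reproducible scores:

```python
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from Gemini_df_resnet import Gemini_df_resnet179

def embed(net, wav_path):
    wav, sr = torchaudio.load(wav_path)  # expects 16 kHz mono
    feat = kaldi.fbank(wav * (1 << 15), num_mel_bins=80, sample_frequency=sr,
                       window_type='hamming', dither=0.0, use_energy=False)
    feat = feat - feat.mean(dim=0)  # CMVN, as in inference_demo.py
    with torch.no_grad():
        return net(feat.unsqueeze(0))  # (1, 256) speaker embedding

net = Gemini_df_resnet179(80, 256)  # 80-dim fbank in, 256-dim embedding out
state = torch.load('path/to/models/final_model.pt', map_location='cpu')  # placeholder path
net.load_state_dict(state, strict=False)
net.eval()

emb1 = embed(net, 'utt1.wav')  # placeholder wav paths
emb2 = embed(net, 'utt2.wav')
print(torch.nn.functional.cosine_similarity(emb1, emb2).item())
```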
# Pretrained Models

| Model | Param (M) | Large Margin Fine-Tuning | Vox1-O EER | Vox1-O MinDCF | Vox1-E EER | Vox1-E MinDCF | Vox1-H EER | Vox1-H MinDCF | Pretrained Model Folder |
|----------------------|-------|--------------------------|------------|---------------|------------|---------------|------------|---------------|-------------------------------------------------------------------------------------------------------|
| Gemini DF-ResNet60 [[Google Drive]](https://drive.google.com/file/d/1zfck1eEOFCxGonRRxUzsLKeruwv4f-kU/view?usp=sharing) | 4.05 | X | 0.941 | 0.089 | 1.051 | 0.116 | 1.799 | 0.166 | 0611-Gemini_df_resnet56-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| Gemini DF-ResNet114 [[Google Drive]](https://drive.google.com/file/d/1hruxkctjIzzUkooXikExb3if8wurR6pv/view?usp=sharing) | 6.53 | X | 0.686 | 0.067 | 0.863 | 0.097 | 1.490 | 0.144 | 0615-Gemini_df_resnet110-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| Gemini DF-ResNet183 [[Google Drive]](https://drive.google.com/file/d/1Bb1VaD8ZoUREoRoQ73oiCXjIJ21SuKLS/view?usp=drive_link) | 9.20 | X | 0.596 | 0.065 | 0.806 | 0.090 | 1.440 | 0.137 | 0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| [New] Gemini DF-ResNet183 [[Google Drive]](https://drive.google.com/file/d/1rEb5UpeOvirCt9mhIW54BRAd-6EF3n_c/view?usp=drive_link) | 9.20 | ✔ | 0.569 | 0.045 | 0.768 | 0.078 | 1.342 | 0.126 | 0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165-LM |

*: The layer counts in the pretrained-model folder names and in the official model names differ by 4, as mentioned in footnote 4 of the paper; the distinction lies in whether the 4 separate downsampling layers are included in the count. **The models are identical; only the nomenclature differs.** During experimentation we did not count the separate downsampling layers; during paper writing we decided to include them. **Therefore, Gemini DF-ResNet60/114/183 are the official names.**
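As a quick, runnable check of the naming note above, the paired builders in Gemini_df_resnet.py construct the identical network:

```python
# Gemini_df_resnet56 and Gemini_df_resnet60 build the same model;
# only the public builder name differs (56 = downsampling layers not counted, 60 = counted).
from Gemini_df_resnet import Gemini_df_resnet56, Gemini_df_resnet60

a = Gemini_df_resnet56(80, 256)
b = Gemini_df_resnet60(80, 256)
assert str(a) == str(b)  # identical module structure
sd_a, sd_b = a.state_dict(), b.state_dict()
assert list(sd_a) == list(sd_b) and all(sd_a[k].shape == sd_b[k].shape for k in sd_a)
```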
# Folder Structure:

Take 0611-Gemini_df_resnet56-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 as an example:

```
├── models/
│   ├── model_165.pt                                      # the model checkpoint at epoch 165; this is the model for testing
├── scores/
│   ├── vox1_asnorm300_result                             # testing results with AS-Norm
│   ├── vox1_snorm300_result                              # testing results with S-Norm
│   ├── vox1_cos_result                                   # testing results with plain cosine similarity
│   ├── vox2_dev_asnorm300_vox1_O_cleaned.kaldi.det.png   # visualization
│   ├── vox2_dev_asnorm300_vox1_H_cleaned.kaldi.det.png
│   ├── vox2_dev_asnorm300_vox1_E_cleaned.kaldi.det.png
│   ├── vox2_dev_asnorm300_vox1_O_cleaned.kaldi.score     # scores of all the trials
│   ├── vox2_dev_asnorm300_vox1_H_cleaned.kaldi.score
│   └── vox2_dev_asnorm300_vox1_E_cleaned.kaldi.score
├── config.yaml                                           # the config file used to train the model on the WeSpeaker platform (https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2/conf)
└── train.log                                             # the training log automatically generated by the WeSpeaker toolkit
```

# Usage:

**[Important]**
🔥🔥🔥 **The Gemini DF-ResNet is now available in WeSpeaker (https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2). We encourage using the version provided by WeSpeaker for better compatibility.**
**Additionally, you can find the large-margin finetuned pretrained models in both PyTorch and ONNX formats at: https://github.com/wenet-e2e/wespeaker/blob/master/docs/pretrained.md**

Alternatively, you can follow the steps below to reproduce the provided checkpoints:
1. Set up the WeSpeaker toolkit (https://github.com/wenet-e2e/wespeaker).
2. Copy the model file, Gemini_df_resnet.py, from this repository to wespeaker/wespeaker/models/.
3. Modify wespeaker/wespeaker/models/speaker_model.py by adding
```
import wespeaker.models.Gemini_df_resnet as Gemini_df_resnet
```
and
```
elif model_name.startswith("Gemini_df_resnet"):
    return getattr(Gemini_df_resnet, model_name)
```
4. Create a config file following the config.yaml file in the pretrained model folder and place it in wespeaker/examples/voxceleb/v2/conf/.
   [Note]: Warm-up is not explicitly stated in the config file, yet it is employed by default in the WeSpeaker toolkit as follows:
```
warm_from_zero: False
warm_up_epoch: 6
```
5. In wespeaker/examples/voxceleb/v2/run.sh, modify 'config' to point to the new config file.

Then you can start training and reproduce the results. A quick sanity check for step 2, runnable before any training, is shown below.
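A minimal sanity-check sketch, assuming you have downloaded one of the pretrained folders above ('model_165.pt' is a placeholder path into that folder); it confirms that the model file and the checkpoint agree:

```python
import torch
from Gemini_df_resnet import Gemini_df_resnet60

net = Gemini_df_resnet60(80, 256)  # fbank80, emb256, matching the folder name
n_params = sum(p.numel() for p in net.parameters())
print(f'{n_params / 1e6:.2f}M parameters')  # should match the Param (M) column above

state = torch.load('model_165.pt', map_location='cpu')  # placeholder checkpoint path
missing, unexpected = net.load_state_dict(state, strict=False)  # strict=False, as in inference_demo.py
print('missing keys:', missing)
print('unexpected keys:', unexpected)
```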
# Cite
🔥♊ **Golden Gemini (this work):**
```
@ARTICLE{10497864,
  author={Liu, Tianchi and Lee, Kong Aik and Wang, Qiongqiong and Li, Haizhou},
  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  title={Golden Gemini is All You Need: Finding the Sweet Spots for Speaker Verification},
  year={2024},
  volume={32},
  pages={2324-2337}
}
```
(Prior Work) RecXi (Golden Gemini continues the line of research of tResNet):
```
@inproceedings{NEURIPS2023_9d276b0a,
  author={Liu, Tianchi and Lee, Kong Aik and Wang, Qiongqiong and Li, Haizhou},
  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
  pages={50221--50236},
  title={Disentangling Voice and Content with Self-Supervision for Speaker Recognition},
  volume={36},
  year={2023}
}
```
(Related Work) DF-ResNet:
```
@ARTICLE{10119228,
  author={Liu, Bei and Chen, Zhengyang and Qian, Yanmin},
  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  title={Depth-First Neural Architecture With Attentive Feature Fusion for Efficient Speaker Verification},
  year={2023},
  volume={31},
  pages={1825-1838}
}
```
(Related Work) WeSpeaker toolkit:
```
@INPROCEEDINGS{10096626,
  author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
  booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={Wespeaker: A Research and Production Oriented Speaker Embedding Learning Toolkit},
  year={2023},
  pages={1-5}
}
@article{wang4748855advancing,
  title={Advancing Speaker Embedding Learning: Wespeaker Toolkit for Research and Production},
  author={Wang, Shuai and Chen, Zhengyang and Han, Bing and Wang, Hongji and Liang, Chengdong and Zhang, Binbin and Xiang, Xu and Ding, Wen and Rohdin, Johan and Silnova, Anna and others},
  journal={Available at SSRN 4748855}
}
```
--------------------------------------------------------------------------------