├── Gemini_df_resnet.py
├── inference_demo.py
└── README.md

/Gemini_df_resnet.py:
--------------------------------------------------------------------------------
# Model file for Gemini DF-ResNet in 'Golden Gemini is All You Need:
# Finding the Sweet Spots for Speaker Verification'
# https://arxiv.org/abs/2312.03620

# Author: Tianchi Liu
# Special thanks to the author of DF-ResNet: Dr. Bei Liu

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Inverted_Bottleneck(nn.Module):
    def __init__(self, dim):
        super(Inverted_Bottleneck, self).__init__()
        # 1x1 expansion -> 3x3 depthwise -> 1x1 projection, with a residual connection
        self.conv1 = nn.Conv2d(dim, 4 * dim, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(4 * dim)
        self.conv2 = nn.Conv2d(4 * dim, 4 * dim, kernel_size=3, padding=1, groups=4 * dim, bias=False)
        self.bn2 = nn.BatchNorm2d(4 * dim)
        self.conv3 = nn.Conv2d(4 * dim, dim, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(dim)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += x
        out = F.relu(out)
        return out


class Gemini_DF_ResNet(nn.Module):
    # DF-ResNet backbone with the Golden Gemini T14c stride strategy
    def __init__(self, depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=40, emb_dim=128, feat_type='fbank', sr=16000):
        super(Gemini_DF_ResNet, self).__init__()
        self.feat_dim = feat_dim
        self.emb_dim = emb_dim
        # feat_type and sr are accepted for config compatibility; feature extraction
        # is expected to happen outside the model (see inference_demo.py)
        self.feat_type = feat_type

        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(1, dims[0], kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(dims[0]),
            nn.ReLU()
        )
        self.downsample_layers.append(stem)

        # Golden Gemini T14c stride strategy: halve frequency at every stage,
        # but downsample time only once
        stride_f = [2, 2, 2, 2]
        stride_t = [1, 2, 1, 1]

        for i in range(4):
            downsample_layer = nn.Sequential(
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=3, stride=(stride_f[i], stride_t[i]), padding=1, bias=False),
                nn.BatchNorm2d(dims[i + 1])
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList()
        for i in range(4):
            stage = nn.Sequential(
                *[Inverted_Bottleneck(dim=dims[i + 1]) for j in range(depths[i])]
            )
            self.stages.append(stage)

        # After 4 frequency downsamplings, ceil(feat_dim / 16) frequency bins remain;
        # statistics pooling concatenates mean and std, doubling the dimension.
        # (Equals the original ceil(feat_dim / 8) * dims[-1] for feat_dim=80, but is
        # also correct for feature dimensions that are not multiples of 16.)
        self.embedding = nn.Linear(2 * math.ceil(feat_dim / 16) * dims[-1], emb_dim)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B, T, F) => (B, F, T)

        x = x.unsqueeze(1)
        x = self.downsample_layers[0](x)
        x = self.downsample_layers[1](x)
        x = self.stages[0](x)
        x = self.downsample_layers[2](x)
        x = self.stages[1](x)
        x = self.downsample_layers[3](x)
        x = self.stages[2](x)
        x = self.downsample_layers[4](x)
        x = self.stages[3](x)

        # statistics pooling over the time axis
        pooling_mean = torch.mean(x, dim=-1)
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-10)
        out = torch.cat((torch.flatten(pooling_mean, start_dim=1),
                         torch.flatten(pooling_std, start_dim=1)), 1)

        embedding = self.embedding(out)
        return embedding


# the following models do NOT include the separate downsampling layers in the layer count
def Gemini_df_resnet56(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet60 in the paper
    return Gemini_DF_ResNet(depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)
def Gemini_df_resnet110(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet114 in the paper
    return Gemini_DF_ResNet(depths=[3, 3, 27, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet179(feat_dim, embed_dim, feat_type='fbank', sr=16000):  # this is actually Gemini_df_resnet183 in the paper
    return Gemini_DF_ResNet(depths=[3, 8, 45, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


# the following models DO include the separate downsampling layers in the layer count
def Gemini_df_resnet60(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 3, 9, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet114(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 3, 27, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


def Gemini_df_resnet183(feat_dim, embed_dim, feat_type='fbank', sr=16000):
    return Gemini_DF_ResNet(depths=[3, 8, 45, 3], dims=[32, 32, 64, 128, 256], feat_dim=feat_dim, emb_dim=embed_dim, feat_type=feat_type, sr=sr)


if __name__ == '__main__':
    net = Gemini_df_resnet56(80, 256)  # fixed: the builder is lowercase Gemini_df_resnet56
    x = torch.randn(2, 200, 80)  # (batch, frames, feat_dim) fbank features, not a raw waveform
    out = net(x)
    print(out.shape)  # torch.Size([2, 256])
--------------------------------------------------------------------------------
/inference_demo.py:
--------------------------------------------------------------------------------
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import torch.nn.functional as F
from Gemini_df_resnet import Gemini_df_resnet179


def cosine_similarity(embedding1, embedding2):
    """
    Calculate the cosine similarity between two embeddings.

    Args:
        embedding1 (torch.Tensor): First embedding of shape (1, 256).
        embedding2 (torch.Tensor): Second embedding of shape (1, 256).

    Returns:
        float: Cosine similarity between the two embeddings.
    """
17 | """ 18 | assert embedding1.shape == (1, 256), "embedding1 must have shape (1, 256)" 19 | assert embedding2.shape == (1, 256), "embedding2 must have shape (1, 256)" 20 | 21 | # Normalize the embeddings to unit vectors 22 | embedding1_normalized = F.normalize(embedding1, p=2, dim=1) 23 | embedding2_normalized = F.normalize(embedding2, p=2, dim=1) 24 | 25 | # Compute cosine similarity 26 | similarity = torch.sum(embedding1_normalized * embedding2_normalized) 27 | 28 | return similarity.item() 29 | 30 | def compute_fbank(waveform, 31 | sample_rate, 32 | num_mel_bins=80, 33 | frame_length=25, 34 | frame_shift=10, 35 | dither=1.0, 36 | flag_apply_cmvn=True): 37 | """ Extract fbank 38 | """ 39 | waveform = waveform * (1 << 15) 40 | mat = kaldi.fbank(waveform, 41 | num_mel_bins=num_mel_bins, 42 | frame_length=frame_length, 43 | frame_shift=frame_shift, 44 | dither=dither, 45 | sample_frequency=sample_rate, 46 | window_type='hamming', 47 | use_energy=False) 48 | if flag_apply_cmvn: 49 | mat = apply_cmvn(mat=mat) 50 | print("Notification: cmvn is applied") 51 | return mat 52 | 53 | def apply_cmvn(mat, norm_mean=True, norm_var=False): 54 | """ Apply CMVN 55 | """ 56 | if norm_mean: 57 | mat = mat - torch.mean(mat, dim=0) 58 | if norm_var: 59 | mat = mat / torch.sqrt(torch.var(mat, dim=0) + 1e-8) 60 | return mat 61 | 62 | # load model 63 | sv_net = Gemini_df_resnet179(80, 256) 64 | sv_net.load_state_dict(torch.load('./0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165-LM/models/final_model.pt'), strict=False) 65 | sv_net.to('cuda') 66 | 67 | # example wav paths 68 | wav_path_spk1_utt1 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10050/6OUWWa4tdJw/00001.wav' 69 | wav_path_spk1_utt2 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10050/Yo0U6EbyVJg/00001.wav' 70 | wav_path_spk2_utt1 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10051/hEnGRr7qNUY/00001.wav' 71 | wav_path_spk2_utt2 = '/home/tianchi/data/VoxCeleb/voxceleb1/wav/id10051/v8znF6-r-D8/00001.wav' 72 | 73 | # load wav files 74 | wav_spk1_utt1, sample_rate_spk1_utt1 = torchaudio.load(wav_path_spk1_utt1) 75 | wav_spk1_utt2, sample_rate_spk1_utt2 = torchaudio.load(wav_path_spk1_utt2) 76 | wav_spk2_utt1, sample_rate_spk2_utt1 = torchaudio.load(wav_path_spk2_utt1) 77 | wav_spk2_utt2, sample_rate_spk2_utt2 = torchaudio.load(wav_path_spk2_utt2) 78 | assert sample_rate_spk1_utt1 == sample_rate_spk1_utt2 == sample_rate_spk2_utt1 == sample_rate_spk2_utt2 == 16000 79 | 80 | # extract Fbank and apply cmvn 81 | fea_spk1_utt1 = compute_fbank(wav_spk1_utt1, sample_rate_spk1_utt1, flag_apply_cmvn=True) 82 | fea_spk1_utt2 = compute_fbank(wav_spk1_utt2, sample_rate_spk1_utt2, flag_apply_cmvn=True) 83 | fea_spk2_utt1 = compute_fbank(wav_spk2_utt1, sample_rate_spk2_utt1, flag_apply_cmvn=True) 84 | fea_spk2_utt2 = compute_fbank(wav_spk2_utt2, sample_rate_spk2_utt2, flag_apply_cmvn=True) 85 | 86 | # extract embeddings 87 | embd_spk1_utt1 = sv_net(fea_spk1_utt1.to('cuda').unsqueeze(0)) 88 | embd_spk1_utt2 = sv_net(fea_spk1_utt2.to('cuda').unsqueeze(0)) 89 | embd_spk2_utt1 = sv_net(fea_spk2_utt1.to('cuda').unsqueeze(0)) 90 | embd_spk2_utt2 = sv_net(fea_spk2_utt2.to('cuda').unsqueeze(0)) 91 | 92 | # output cosine similarity 93 | print('\n--- inference for same speaker, the cosine similarity should be closer to 1.0 ---') 94 | print("cosine similarity of spk1_utt1 and spk1_utt2:", cosine_similarity(embd_spk1_utt1, embd_spk1_utt2)) 95 | print("cosine similarity of spk2_utt1 and spk2_utt2:", 
print('\n--- inference for different speaker, the cosine similarity should be closer to 0.0 ---')
print("cosine similarity of spk1_utt1 and spk2_utt1:", cosine_similarity(embd_spk1_utt1, embd_spk2_utt1))
print("cosine similarity of spk1_utt1 and spk2_utt2:", cosine_similarity(embd_spk1_utt1, embd_spk2_utt2))
print("cosine similarity of spk1_utt2 and spk2_utt1:", cosine_similarity(embd_spk1_utt2, embd_spk2_utt1))
print("cosine similarity of spk1_utt2 and spk2_utt2:", cosine_similarity(embd_spk1_utt2, embd_spk2_utt2))

# outputs of the demo (exact values vary slightly from run to run because dither defaults to 1.0)
'''
Notification: cmvn is applied
Notification: cmvn is applied
Notification: cmvn is applied
Notification: cmvn is applied

--- inference for same speaker, the cosine similarity should be closer to 1.0 ---
cosine similarity of spk1_utt1 and spk1_utt2: 0.6580173373222351
cosine similarity of spk2_utt1 and spk2_utt2: 0.6858957409858704

--- inference for different speaker, the cosine similarity should be closer to 0.0 ---
cosine similarity of spk1_utt1 and spk2_utt1: 0.14298376441001892
cosine similarity of spk1_utt1 and spk2_utt2: 0.1417258083820343
cosine similarity of spk1_utt2 and spk2_utt1: 0.06481592357158661
cosine similarity of spk1_utt2 and spk2_utt2: 0.09413496404886246
'''
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Golden-Gemini-for-Speaker-Verification
🔥🔥🔥 Official release of pretrained models and scripts for ♊ 'Golden Gemini Is All You Need: Finding the Sweet Spots for Speaker Verification', accepted by IEEE/ACM Transactions on Audio, Speech, and Language Processing (TASLP), 2024.

(Free access) IEEE link: https://ieeexplore.ieee.org/document/10497864

arXiv link: https://arxiv.org/abs/2312.03620

# Update:
Jan. 2025, **Added an inference demo script.** Considering the need for simple inference with pre-trained models, we provide an inference demo script, inference_demo.py.

# Note:

1. ***[Important]*** This repository is dedicated to sharing the pretrained models from our paper for convenient usage. For training and inference, we recommend using the Gemini DF-ResNet now available in WeSpeaker: https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2. We extend our gratitude to the WeSpeaker community for their support, with special thanks to Dr. Wang Shuai.

2. Special thanks to Dr. Liu Bei for sharing the implementation details related to DF-ResNet (https://ieeexplore.ieee.org/document/10119228).

3. [New] We also release a large-margin finetuned pretrained model.
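# Quick Start

If you just want to score a pair of utterances with a downloaded checkpoint, the sketch below condenses inference_demo.py into a few lines. The checkpoint and wav paths are placeholders (point them at your own files, e.g. a model from the table below), and dither is set to 0.0 here for reproducible scores:

```python
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from Gemini_df_resnet import Gemini_df_resnet179

def embed(net, wav_path):
    wav, sr = torchaudio.load(wav_path)  # expects 16 kHz mono
    feat = kaldi.fbank(wav * (1 << 15), num_mel_bins=80, sample_frequency=sr,
                       window_type='hamming', dither=0.0, use_energy=False)
    feat = feat - feat.mean(dim=0)  # CMVN, as in inference_demo.py
    with torch.no_grad():
        return net(feat.unsqueeze(0))  # (1, 256) speaker embedding

net = Gemini_df_resnet179(80, 256)  # 80-dim fbank in, 256-dim embedding out
state = torch.load('path/to/models/final_model.pt', map_location='cpu')  # placeholder path
net.load_state_dict(state, strict=False)
net.eval()

emb1 = embed(net, 'utt1.wav')  # placeholder wav paths
emb2 = embed(net, 'utt2.wav')
print(torch.nn.functional.cosine_similarity(emb1, emb2).item())
```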
# Pretrained Models

| Model | Param (M) | Large Margin Fine-Tuning | Vox1-O EER | Vox1-O MinDCF | Vox1-E EER | Vox1-E MinDCF | Vox1-H EER | Vox1-H MinDCF | Pretrained Model Folder |
|----------------------|-------|--------------------------|------------|---------------|------------|---------------|------------|---------------|-------------------------------------------------------------------------------------------------------|
| Gemini DF-ResNet60 [[Google Drive]](https://drive.google.com/file/d/1zfck1eEOFCxGonRRxUzsLKeruwv4f-kU/view?usp=sharing) | 4.05 | X | 0.941 | 0.089 | 1.051 | 0.116 | 1.799 | 0.166 | 0611-Gemini_df_resnet56-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| Gemini DF-ResNet114 [[Google Drive]](https://drive.google.com/file/d/1hruxkctjIzzUkooXikExb3if8wurR6pv/view?usp=sharing) | 6.53 | X | 0.686 | 0.067 | 0.863 | 0.097 | 1.490 | 0.144 | 0615-Gemini_df_resnet110-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| Gemini DF-ResNet183 [[Google Drive]](https://drive.google.com/file/d/1Bb1VaD8ZoUREoRoQ73oiCXjIJ21SuKLS/view?usp=drive_link) | 9.20 | X | 0.596 | 0.065 | 0.806 | 0.090 | 1.440 | 0.137 | 0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 |
| [New] Gemini DF-ResNet183 [[Google Drive]](https://drive.google.com/file/d/1rEb5UpeOvirCt9mhIW54BRAd-6EF3n_c/view?usp=drive_link) | 9.20 | ✔ | 0.569 | 0.045 | 0.768 | 0.078 | 1.342 | 0.126 | 0621-Gemini_df_resnet179-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165-LM |

*: The layer counts in the pretrained-model folder names and in the official model names differ by 4, as mentioned in footnote 4 of the paper; the distinction lies in whether the 4 separate downsampling layers are included in the count. **The models are identical; only the nomenclature differs.** During experimentation we did not count the separate downsampling layers; during paper writing we decided to include them. **Therefore, Gemini DF-ResNet60/114/183 are the official names.**
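As a quick, runnable check of the naming note above, the paired builders in Gemini_df_resnet.py construct the identical network:

```python
# Gemini_df_resnet56 and Gemini_df_resnet60 build the same model;
# only the public builder name differs (56 = downsampling layers not counted, 60 = counted).
from Gemini_df_resnet import Gemini_df_resnet56, Gemini_df_resnet60

a = Gemini_df_resnet56(80, 256)
b = Gemini_df_resnet60(80, 256)
assert str(a) == str(b)  # identical module structure
sd_a, sd_b = a.state_dict(), b.state_dict()
assert list(sd_a) == list(sd_b) and all(sd_a[k].shape == sd_b[k].shape for k in sd_a)
```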
# Folder Structure:

Take 0611-Gemini_df_resnet56-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-AdamW-epoch165 as an example:

```
├── models/
│   ├── model_165.pt                                      # the model checkpoint at epoch 165; this is the model for testing
├── scores/
│   ├── vox1_asnorm300_result                             # testing results with AS-Norm
│   ├── vox1_snorm300_result                              # testing results with S-Norm
│   ├── vox1_cos_result                                   # testing results with plain cosine similarity
│   ├── vox2_dev_asnorm300_vox1_O_cleaned.kaldi.det.png   # visualization
│   ├── vox2_dev_asnorm300_vox1_H_cleaned.kaldi.det.png
│   ├── vox2_dev_asnorm300_vox1_E_cleaned.kaldi.det.png
│   ├── vox2_dev_asnorm300_vox1_O_cleaned.kaldi.score     # scores of all the trials
│   ├── vox2_dev_asnorm300_vox1_H_cleaned.kaldi.score
│   └── vox2_dev_asnorm300_vox1_E_cleaned.kaldi.score
├── config.yaml                                           # the config file used to train the model on the WeSpeaker platform (https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2/conf)
└── train.log                                             # the training log automatically generated by the WeSpeaker toolkit
```

# Usage:

**[Important]**
🔥🔥🔥 **The Gemini DF-ResNet is now available in WeSpeaker (https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2). We encourage using the version provided by WeSpeaker for better compatibility.**
**Additionally, you can find the large-margin finetuned pretrained models in both PyTorch and ONNX formats at: https://github.com/wenet-e2e/wespeaker/blob/master/docs/pretrained.md**

Alternatively, you can follow the steps below to reproduce the provided checkpoints:
1. Set up the WeSpeaker toolkit (https://github.com/wenet-e2e/wespeaker).
2. Copy the model file, Gemini_df_resnet.py, from this repository to wespeaker/wespeaker/models/.
3. Modify wespeaker/wespeaker/models/speaker_model.py by adding
```
import wespeaker.models.Gemini_df_resnet as Gemini_df_resnet
```
and
```
elif model_name.startswith("Gemini_df_resnet"):
    return getattr(Gemini_df_resnet, model_name)
```
4. Create a config file following the config.yaml file in the pretrained model folder and place it in wespeaker/examples/voxceleb/v2/conf/.
   [Note]: Warm-up is not explicitly stated in the config file, yet it is employed by default in the WeSpeaker toolkit as follows:
```
warm_from_zero: False
warm_up_epoch: 6
```
5. In wespeaker/examples/voxceleb/v2/run.sh, modify 'config' to point to the new config file.

Then you can start training and reproduce the results. A quick sanity check for step 2, runnable before any training, is shown below.
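A minimal sanity-check sketch, assuming you have downloaded one of the pretrained folders above ('model_165.pt' is a placeholder path into that folder); it confirms that the model file and the checkpoint agree:

```python
import torch
from Gemini_df_resnet import Gemini_df_resnet60

net = Gemini_df_resnet60(80, 256)  # fbank80, emb256, matching the folder name
n_params = sum(p.numel() for p in net.parameters())
print(f'{n_params / 1e6:.2f}M parameters')  # should match the Param (M) column above

state = torch.load('model_165.pt', map_location='cpu')  # placeholder checkpoint path
missing, unexpected = net.load_state_dict(state, strict=False)  # strict=False, as in inference_demo.py
print('missing keys:', missing)
print('unexpected keys:', unexpected)
```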
# Cite
🔥♊ **Golden Gemini (this work):**
```
@ARTICLE{10497864,
  author={Liu, Tianchi and Lee, Kong Aik and Wang, Qiongqiong and Li, Haizhou},
  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  title={Golden Gemini is All You Need: Finding the Sweet Spots for Speaker Verification},
  year={2024},
  volume={32},
  pages={2324-2337}
}
```
(Prior Work) RecXi (Golden Gemini continues the line of research of tResNet):
```
@inproceedings{NEURIPS2023_9d276b0a,
  author={Liu, Tianchi and Lee, Kong Aik and Wang, Qiongqiong and Li, Haizhou},
  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
  pages={50221--50236},
  title={Disentangling Voice and Content with Self-Supervision for Speaker Recognition},
  volume={36},
  year={2023}
}
```
(Related Work) DF-ResNet:
```
@ARTICLE{10119228,
  author={Liu, Bei and Chen, Zhengyang and Qian, Yanmin},
  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  title={Depth-First Neural Architecture With Attentive Feature Fusion for Efficient Speaker Verification},
  year={2023},
  volume={31},
  pages={1825-1838}
}
```
(Related Work) WeSpeaker toolkit:
```
@INPROCEEDINGS{10096626,
  author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
  booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={Wespeaker: A Research and Production Oriented Speaker Embedding Learning Toolkit},
  year={2023},
  pages={1-5}
}
@article{wang4748855advancing,
  title={Advancing Speaker Embedding Learning: Wespeaker Toolkit for Research and Production},
  author={Wang, Shuai and Chen, Zhengyang and Han, Bing and Wang, Hongji and Liang, Chengdong and Zhang, Binbin and Xiang, Xu and Ding, Wen and Rohdin, Johan and Silnova, Anna and others},
  journal={Available at SSRN 4748855}
}
```
--------------------------------------------------------------------------------