├── LICENSE ├── README.md ├── img └── overview.png ├── stage1 └── VoxCeleb2 │ ├── DatasetLoader.py │ ├── README.md │ ├── SpeakerNet.py │ ├── loss │ ├── aamsoftmax.py │ ├── aamsoftmaxproto.py │ ├── amsoftmax.py │ ├── angleproto.py │ ├── ge2e.py │ ├── proto.py │ ├── softmax.py │ ├── softmaxproto.py │ └── triplet.py │ ├── models │ ├── ECAPA_TDNN.py │ ├── MFA_Conformer.py │ ├── SKA_TDNN.py │ ├── specaugment.py │ └── wenet │ │ ├── bin │ │ ├── .train.py.swp │ │ ├── alignment.py │ │ ├── average_model.py │ │ ├── export_jit.py │ │ ├── recognize.py │ │ └── train.py │ │ ├── dataset │ │ ├── dataset.py │ │ ├── kaldi_io.py │ │ └── wav_distortion.py │ │ ├── transformer │ │ ├── __pycache__ │ │ │ ├── attention.cpython-36.pyc │ │ │ ├── attention.cpython-37.pyc │ │ │ ├── attention.cpython-39.pyc │ │ │ ├── convolution.cpython-36.pyc │ │ │ ├── convolution.cpython-37.pyc │ │ │ ├── convolution.cpython-39.pyc │ │ │ ├── embedding.cpython-36.pyc │ │ │ ├── embedding.cpython-37.pyc │ │ │ ├── embedding.cpython-39.pyc │ │ │ ├── encoder_cat.cpython-36.pyc │ │ │ ├── encoder_cat.cpython-37.pyc │ │ │ ├── encoder_cat.cpython-39.pyc │ │ │ ├── encoder_layer.cpython-36.pyc │ │ │ ├── encoder_layer.cpython-37.pyc │ │ │ ├── encoder_layer.cpython-39.pyc │ │ │ ├── positionwise_feed_forward.cpython-36.pyc │ │ │ ├── positionwise_feed_forward.cpython-37.pyc │ │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ │ ├── subsampling.cpython-36.pyc │ │ │ ├── subsampling.cpython-37.pyc │ │ │ ├── subsampling.cpython-39.pyc │ │ │ ├── swish.cpython-36.pyc │ │ │ ├── swish.cpython-37.pyc │ │ │ └── swish.cpython-39.pyc │ │ ├── asr_model.py │ │ ├── attention.py │ │ ├── cmvn.py │ │ ├── convolution.py │ │ ├── ctc.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_cat.py │ │ ├── encoder_layer.py │ │ ├── encoder_weight.py │ │ ├── label_smoothing_loss.py │ │ ├── positionwise_feed_forward.py │ │ ├── subsampling.py │ │ └── swish.py │ │ └── utils │ │ ├── __pycache__ │ │ ├── common.cpython-36.pyc │ │ ├── common.cpython-37.pyc │ │ ├── common.cpython-39.pyc │ │ ├── mask.cpython-36.pyc │ │ ├── mask.cpython-37.pyc │ │ └── mask.cpython-39.pyc │ │ ├── checkpoint.py │ │ ├── cmvn.py │ │ ├── common.py │ │ ├── ctc_util.py │ │ ├── executor.py │ │ ├── mask.py │ │ └── scheduler.py │ ├── optimizer │ ├── adam.py │ ├── adamP.py │ ├── adamW.py │ └── sgd.py │ ├── process_musan.py │ ├── requirements.txt │ ├── scheduler │ ├── cosine_annealing_warmup_restarts.py │ ├── cycliclr.py │ ├── exponentiallr.py │ └── steplr.py │ ├── trainSpeakerNet.py │ ├── tuneThreshold.py │ └── utils.py ├── stage2 └── README.md └── stage3 └── ASVspoof2019 ├── DatasetLoader.py ├── README.md ├── SASVNet.py ├── loss ├── aamsoftmax.py ├── angleproto_sasv.py └── sasv_e2e_v1.py ├── metrics.py ├── models ├── ECAPA_TDNN.py ├── MFA_Conformer.py ├── SKA_TDNN.py ├── specaugment.py └── wenet │ ├── bin │ ├── .train.py.swp │ ├── alignment.py │ ├── average_model.py │ ├── export_jit.py │ ├── recognize.py │ └── train.py │ ├── dataset │ ├── dataset.py │ ├── kaldi_io.py │ └── wav_distortion.py │ ├── transformer │ ├── __pycache__ │ │ ├── attention.cpython-36.pyc │ │ ├── attention.cpython-37.pyc │ │ ├── attention.cpython-39.pyc │ │ ├── convolution.cpython-36.pyc │ │ ├── convolution.cpython-37.pyc │ │ ├── convolution.cpython-39.pyc │ │ ├── embedding.cpython-36.pyc │ │ ├── embedding.cpython-37.pyc │ │ ├── embedding.cpython-39.pyc │ │ ├── encoder_cat.cpython-36.pyc │ │ ├── encoder_cat.cpython-37.pyc │ │ ├── encoder_cat.cpython-39.pyc │ │ ├── encoder_layer.cpython-36.pyc │ 
│ ├── encoder_layer.cpython-37.pyc │ │ ├── encoder_layer.cpython-39.pyc │ │ ├── positionwise_feed_forward.cpython-36.pyc │ │ ├── positionwise_feed_forward.cpython-37.pyc │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ ├── subsampling.cpython-36.pyc │ │ ├── subsampling.cpython-37.pyc │ │ ├── subsampling.cpython-39.pyc │ │ ├── swish.cpython-36.pyc │ │ ├── swish.cpython-37.pyc │ │ └── swish.cpython-39.pyc │ ├── asr_model.py │ ├── attention.py │ ├── cmvn.py │ ├── convolution.py │ ├── ctc.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_cat.py │ ├── encoder_layer.py │ ├── encoder_weight.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── subsampling.py │ └── swish.py │ └── utils │ ├── __pycache__ │ ├── common.cpython-36.pyc │ ├── common.cpython-37.pyc │ ├── common.cpython-39.pyc │ ├── mask.cpython-36.pyc │ ├── mask.cpython-37.pyc │ └── mask.cpython-39.pyc │ ├── checkpoint.py │ ├── cmvn.py │ ├── common.py │ ├── ctc_util.py │ ├── executor.py │ ├── mask.py │ └── scheduler.py ├── optimizer ├── adam.py ├── adamP.py ├── adamW.py └── sgd.py ├── protocols ├── ASVspoof2019.LA.asv.dev.gi.trl.txt ├── ASVspoof2019.LA.asv.eval.female.trn.txt ├── ASVspoof2019.LA.asv.eval.gi.trl.txt ├── ASVspoof2019.LA.asv.eval.male.trn.txt ├── ASVspoof2019.LA.cm.dev.trl.txt ├── ASVspoof2019.LA.cm.eval.trl.txt ├── ASVspoof2019.LA.cm.train.trn.txt └── ASVspoof2019.LA.cm.train_dev.trn.txt ├── requirements.txt ├── scheduler └── cosine_annealing_warmup_restarts.py ├── spk_meta ├── spk_meta_dev.pk ├── spk_meta_eval.pk └── spk_meta_trn.pk ├── trainSASVNet.py ├── tuneThreshold.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 sasv-challenge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /img/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/img/overview.png -------------------------------------------------------------------------------- /stage1/VoxCeleb2/README.md: -------------------------------------------------------------------------------- 1 | # Stage 1 2 | 3 | This repository is built on the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) and [ska-tdnn](https://github.com/msh9184/ska-tdnn) repositories. 4 | 5 | ## Dependencies 6 | If you use an Anaconda virtual environment, 7 | ``` 8 | conda create -n sasv python=3.9 cudatoolkit=11.3 9 | conda activate sasv 10 | ``` 11 | Then install all dependency packages: 12 | ``` 13 | pip3 install -r requirements.txt 14 | ``` 15 | 16 | 17 | ## Data Preparation 18 | The [VoxCeleb](https://mm.kaist.ac.kr/datasets/voxceleb/) datasets are used for these experiments. 19 | The train list should contain the file path and the speaker identity, for instance: 20 | ``` 21 | id00012/21Uxsk56VDQ/00001.wav id00012 22 | id00012/21Uxsk56VDQ/00002.wav id00012 23 | ... 24 | id09272/u7VNkYraCw0/00026.wav id09272 25 | id09272/u7VNkYraCw0/00027.wav id09272 26 | ``` 27 | An example train list for VoxCeleb2 and the test lists for VoxCeleb1-O, VoxCeleb1-E, and VoxCeleb1-H can be downloaded from [train_vox2.txt](https://drive.google.com/file/d/1Y6yjKDULxJ40mhLzeKUzkeAvqNlP0tzX/view?usp=sharing), [veri_test2.txt](https://drive.google.com/file/d/1EUDR5oCPC-zOexhLBHbFQpdnw1IRWq-B/view?usp=sharing), [list_test_all2](https://drive.google.com/file/d/1BgnEugORlSPsi4ZpTjTayAGPqyWTm7S8/view?usp=sharing), and [list_test_hard2](https://drive.google.com/file/d/1p-gbPbDK4dy_SvSRWZ3KP17iZdHqjHQ4/view?usp=sharing), respectively. You can also follow the instructions in the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) repository for downloading and preparing the training, augmentation, and evaluation data. 28 | 29 | For data augmentation by noise addition, you can download the [MUSAN noise corpus](https://www.openslr.org/17/). 30 | After downloading and extracting the files, you can split the audio into short segments for faster random access with the following command: 31 | ```bash 32 | python process_musan.py /path/to/dataset/MUSAN 33 | ``` 34 | where `/path/to/dataset/MUSAN` is your path to the MUSAN corpus. 35 | 36 | For data augmentation by convolution with simulated RIRs, you can download the [Room Impulse Response and Noise Database](https://www.openslr.org/28/). 37 | 38 | 39 | ## Models 40 | Three models are included in this repository. You can select one with the `--model` option: 41 | ``` 42 | ECAPA_TDNN [1] 43 | MFA_Conformer [2] 44 | SKA_TDNN [3] 45 | ``` 46 | [1] B. Desplanques, J. Thienpondt, and K. Demuynck, "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification," in *Proc. INTERSPEECH*, 2020, pp. 3707-3711. 47 | 48 | [2] Y. Zhang, Z. Lv, H. Wu, S. Zhang, P. Hu, Z. Wu, H. Lee, and H. Meng, "MFA-Conformer: Multi-scale Feature Aggregation Conformer for Automatic Speaker Verification," in *Proc. INTERSPEECH*, 2022. 49 | 50 | [3] S. H. Mun, J. Jung, M. H. Han, and N. S. Kim, "Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification," in *Proc. IEEE SLT*, 2022.
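For a quick standalone sanity check outside the training script, the sketch below shows one way an encoder can be instantiated directly through its `MainModel` factory (here `models/ECAPA_TDNN.py`; the other models expose the same factory). This is a minimal, illustrative sketch only: it assumes you run it from `stage1/VoxCeleb2` so that `models` and `utils` are importable, and it feeds random 16 kHz waveforms just to check shapes; actual training and evaluation go through `trainSpeakerNet.py` as shown below.

```python
# Minimal sketch (not part of the training pipeline): build an encoder and
# extract speaker embeddings from raw 16 kHz waveforms.
# Assumes execution from stage1/VoxCeleb2 so that `models` and `utils` resolve.
import torch
from models.ECAPA_TDNN import MainModel

# Default configuration: C=1024 channels, Res2Net scale 8, 192-dim embeddings.
model = MainModel(eca_c=1024, eca_s=8, num_mels=80, num_out=192).eval()

# A batch of four 2-second utterances at 16 kHz (random data for illustration);
# fbank extraction and mean normalisation happen inside the model's forward pass.
wav = torch.randn(4, 32000)

with torch.no_grad():
    emb = model(wav, aug=False)   # -> (4, 192) speaker embeddings

print(emb.shape)
```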
51 | 52 | 53 | ## Training 54 | Distributed Data Parallel (DDP) training example: SKA_TDNN with a vanilla cosine similarity (COS) evaluation every epoch: 55 | ``` 56 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 57 | --max_frames 200 \ 58 | --eval_frames 0 \ 59 | --num_eval 1 \ 60 | --num_spk 100 \ 61 | --num_utt 2 \ 62 | --augment True \ 63 | --optimizer adamW \ 64 | --scheduler cosine_annealing_warmup_restarts \ 65 | --lr_t0 25 \ 66 | --lr_tmul 1.0 \ 67 | --lr_max 1e-3 \ 68 | --lr_min 1e-8 \ 69 | --lr_wstep 10 \ 70 | --lr_gamma 0.5 \ 71 | --margin 0.2 \ 72 | --scale 30 \ 73 | --num_class 5994 \ 74 | --save_path ./save/ska_tdnn \ 75 | --train_list ./list/train_vox2.txt \ 76 | --test_list ./list/veri_test2.txt \ 77 | --train_path /path/to/dataset/VoxCeleb2/dev/wav \ 78 | --test_path /path/to/dataset/VoxCeleb1/test/wav \ 79 | --musan_path /path/to/dataset/MUSAN/musan_split \ 80 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 81 | --model SKA_TDNN \ 82 | --port 8000 \ 83 | --distributed 84 | ``` 85 | 86 | ## Evaluation 87 | Evaluation example using vanilla cosine similarity (COS) on VoxCeleb1-O: 88 | ``` 89 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 90 | --eval \ 91 | --eval_frames 0 \ 92 | --num_eval 1 \ 93 | --initial_model ./save/model/your_model.model \ 94 | --test_list ./list/veri_test2.txt \ 95 | --test_path /path/to/dataset/VoxCeleb1/test/wav \ 96 | --model SKA_TDNN \ 97 | --port 8001 \ 98 | --distributed 99 | ``` 100 | Evaluation example using Test Time Augmentation (TTA) on VoxCeleb1-E: 101 | ``` 102 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 103 | --eval \ 104 | --tta \ 105 | --eval_frames 400 \ 106 | --num_eval 10 \ 107 | --initial_model ./save/model/your_model.model \ 108 | --test_list ./list/list_test_all2 \ 109 | --test_path /path/to/dataset/VoxCeleb1/all/wav \ 110 | --model SKA_TDNN \ 111 | --port 8002 \ 112 | --distributed 113 | ``` 114 | Evaluation example using Score Normalisation (SN) on VoxCeleb1-H: 115 | ``` 116 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 117 | --eval \ 118 | --score_norm \ 119 | --type_coh utt \ 120 | --top_coh_size 20000 \ 121 | --eval_frames 0 \ 122 | --num_eval 1 \ 123 | --initial_model ./save/model/your_model.model \ 124 | --train_list ./list/train_vox2.txt \ 125 | --test_list ./list/list_test_hard2 \ 126 | --train_path /path/to/dataset/VoxCeleb2/dev/wav \ 127 | --test_path /path/to/dataset/VoxCeleb1/all/wav \ 128 | --model SKA_TDNN \ 129 | --port 8003 \ 130 | --distributed 131 | ``` 132 | 133 | 134 | ## Citation 135 | If you utilize this repository, please cite the following papers: 136 | ``` 137 | @inproceedings{chung2020in, 138 | title={In defence of metric learning for speaker recognition}, 139 | author={Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang}, 140 | booktitle={Proc. Interspeech}, 141 | year={2020} 142 | } 143 | ``` 144 | 145 | ``` 146 | @inproceedings{jung2022pushing, 147 | title={Pushing the limits of raw waveform speaker recognition}, 148 | author={Jung, Jee-weon and Kim, You Jin and Heo, Hee-Soo and Lee, Bong-Jin and Kwon, Youngki and Chung, Joon Son}, 149 | booktitle={Proc.
Interspeech}, 150 | year={2022} 151 | } 152 | ``` 153 | 154 | ``` 155 | @inproceedings{mun2022frequency, 156 | title={Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification}, 157 | author={Mun, Sung Hwan and Jung, Jee-weon and Han, Min Hyun and Kim, Nam Soo}, 158 | booktitle={Proc. IEEE SLT}, 159 | year={2022} 160 | } 161 | ``` -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy, math 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, num_out, num_class, margin=0.3, scale=15, easy_margin=False, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = num_out 20 | self.weight = torch.nn.Parameter(torch.FloatTensor(num_class, num_out), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.weight, gain=1) 23 | 24 | self.easy_margin = easy_margin 25 | self.cos_m = math.cos(self.m) 26 | self.sin_m = math.sin(self.m) 27 | 28 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 29 | self.th = math.cos(math.pi - self.m) 30 | self.mm = math.sin(math.pi - self.m) * self.m 31 | 32 | print('Initialised AAMSoftmax margin %.3f scale %.3f'%(self.m,self.s)) 33 | 34 | def forward(self, x, label=None): 35 | 36 | assert x.size()[0] == label.size()[0] 37 | assert x.size()[1] == self.in_feats 38 | 39 | # cos(theta) 40 | cosine = F.linear(F.normalize(x), F.normalize(self.weight)) 41 | # cos(theta + m) 42 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 43 | phi = cosine * self.cos_m - sine * self.sin_m 44 | 45 | if self.easy_margin: 46 | phi = torch.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 49 | 50 | #one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 51 | one_hot = torch.zeros_like(cosine) 52 | one_hot.scatter_(1, label.view(-1, 1), 1) 53 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 54 | output = output * self.s 55 | 56 | loss = self.ce(output, label) 57 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 58 | return loss, prec1 59 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/aamsoftmaxproto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.aamsoftmax as aamsoftmax 7 | import loss.angleproto as angleproto 8 | 9 | class LossFunction(nn.Module): 10 | def __init__(self, **kwargs): 11 | super(LossFunction, self).__init__() 12 | self.test_normalize = True 13 | self.aamsoftmax = aamsoftmax.LossFunction(**kwargs) 14 | self.angleproto = angleproto.LossFunction(**kwargs) 15 | print('Initialised AAMSoftmaxPrototypicalLoss') 16 | 17 | def forward(self, x, label=None): 18 | assert x.size()[1] == 2 19 | nlossS, prec1 = self.aamsoftmax(x.reshape(-1,x.size()[-1]), label.repeat_interleave(2)) 20 | nlossM, _ = self.angleproto(x,None) 21 | return nlossS+nlossM, prec1 22 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/amsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/CoinCheung/pytorch-loss (MIT License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, nOut, nClasses, margin=0.3, scale=15, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = nOut 20 | self.W = torch.nn.Parameter(torch.randn(nOut, nClasses), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.W, gain=1) 23 | 24 | print('Initialised AMSoftmax m=%.3f s=%.3f'%(self.m,self.s)) 25 | 26 | def forward(self, x, label=None): 27 | 28 | assert x.size()[0] == label.size()[0] 29 | assert x.size()[1] == self.in_feats 30 | 31 | x_norm = torch.norm(x, p=2, dim=1, keepdim=True).clamp(min=1e-12) 32 | x_norm = torch.div(x, x_norm) 33 | w_norm = torch.norm(self.W, p=2, dim=0, keepdim=True).clamp(min=1e-12) 34 | w_norm = torch.div(self.W, w_norm) 35 | costh = torch.mm(x_norm, w_norm) 36 | label_view = label.view(-1, 1) 37 | if label_view.is_cuda: label_view = label_view.cpu() 38 | delt_costh = torch.zeros(costh.size()).scatter_(1, label_view, self.m) 39 | if x.is_cuda: delt_costh = delt_costh.cuda() 40 | costh_m = costh - delt_costh 41 | costh_m_s = self.s * costh_m 42 | loss = self.ce(costh_m_s, label) 43 | prec1 = accuracy(costh_m_s.detach(), label.detach(), topk=(1,))[0] 44 | return loss, prec1 45 | 46 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/angleproto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | 12 | def __init__(self, init_w=10.0, init_b=-5.0, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.w = nn.Parameter(torch.tensor(init_w)) 18 | self.b = nn.Parameter(torch.tensor(init_b)) 19 | self.criterion = torch.nn.CrossEntropyLoss() 20 | 21 | print('Initialised AngleProto') 22 | 23 | def forward(self, x, label=None): 24 | 25 | assert x.size()[1] >= 2 26 | 27 | out_anchor = torch.mean(x[:,1:,:],1) 28 | out_positive = x[:,0,:] 29 | stepsize = out_anchor.size()[0] 30 | 31 | cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2)) 32 | torch.clamp(self.w, 1e-6) 33 | cos_sim_matrix = cos_sim_matrix * self.w + self.b 34 | 35 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 36 | nloss = self.criterion(cos_sim_matrix, label) 37 | prec1 = accuracy(cos_sim_matrix.detach(), label.detach(), topk=(1,))[0] 38 | 39 | return nloss, prec1 40 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/ge2e.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | ## Fast re-implementation of the GE2E loss (https://arxiv.org/abs/1710.10467) 4 | ## Numerically checked against https://github.com/cvqluu/GE2E-Loss 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import time, pdb, numpy 10 | from utils import accuracy 11 | 12 | class LossFunction(nn.Module): 13 | 14 | def __init__(self, init_w=10.0, init_b=-5.0, **kwargs): 15 | super(LossFunction, self).__init__() 16 | 17 | self.test_normalize = True 18 | 19 | self.w = nn.Parameter(torch.tensor(init_w)) 20 | self.b = nn.Parameter(torch.tensor(init_b)) 21 | self.criterion = torch.nn.CrossEntropyLoss() 22 | 23 | print('Initialised GE2E') 24 | 25 | def forward(self, x, label=None): 26 | 27 | assert x.size()[1] >= 2 28 | 29 | gsize = x.size()[1] 30 | centroids = torch.mean(x, 1) 31 | stepsize = x.size()[0] 32 | 33 | cos_sim_matrix = [] 34 | 35 | for ii in range(0,gsize): 36 | idx = [*range(0,gsize)] 37 | idx.remove(ii) 38 | exc_centroids = torch.mean(x[:,idx,:], 1) 39 | cos_sim_diag = F.cosine_similarity(x[:,ii,:],exc_centroids) 40 | cos_sim = F.cosine_similarity(x[:,ii,:].unsqueeze(-1),centroids.unsqueeze(-1).transpose(0,2)) 41 | cos_sim[range(0,stepsize),range(0,stepsize)] = cos_sim_diag 42 | cos_sim_matrix.append(torch.clamp(cos_sim,1e-6)) 43 | 44 | cos_sim_matrix = torch.stack(cos_sim_matrix,dim=1) 45 | 46 | torch.clamp(self.w, 1e-6) 47 | cos_sim_matrix = cos_sim_matrix * self.w + self.b 48 | 49 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 50 | nloss = self.criterion(cos_sim_matrix.view(-1,stepsize), torch.repeat_interleave(label,repeats=gsize,dim=0).cuda()) 51 | prec1 = accuracy(cos_sim_matrix.view(-1,stepsize).detach(), torch.repeat_interleave(label,repeats=gsize,dim=0).detach(), topk=(1,))[0] 52 | 53 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/proto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | ## Re-implementation of prototypical networks (https://arxiv.org/abs/1703.05175). 4 | ## Numerically checked against https://github.com/cyvius96/prototypical-network-pytorch 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import time, pdb, numpy 10 | from utils import accuracy 11 | 12 | class LossFunction(nn.Module): 13 | 14 | def __init__(self, **kwargs): 15 | super(LossFunction, self).__init__() 16 | 17 | self.test_normalize = False 18 | 19 | self.criterion = torch.nn.CrossEntropyLoss() 20 | 21 | print('Initialised Prototypical Loss') 22 | 23 | def forward(self, x, label=None): 24 | 25 | assert x.size()[1] >= 2 26 | 27 | out_anchor = torch.mean(x[:,1:,:],1) 28 | out_positive = x[:,0,:] 29 | stepsize = out_anchor.size()[0] 30 | 31 | output = -1 * (F.pairwise_distance(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))**2) 32 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 33 | nloss = self.criterion(output, label) 34 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 35 | 36 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/softmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | def __init__(self, nOut, nClasses, **kwargs): 12 | super(LossFunction, self).__init__() 13 | 14 | self.test_normalize = True 15 | 16 | self.criterion = torch.nn.CrossEntropyLoss() 17 | self.fc = nn.Linear(nOut,nClasses) 18 | 19 | print('Initialised Softmax Loss') 20 | 21 | def forward(self, x, label=None): 22 | 23 | x = self.fc(x) 24 | nloss = self.criterion(x, label) 25 | prec1 = accuracy(x.detach(), label.detach(), topk=(1,))[0] 26 | 27 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/softmaxproto.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.softmax as softmax 7 | import loss.angleproto as angleproto 8 | 9 | class LossFunction(nn.Module): 10 | 11 | def __init__(self, **kwargs): 12 | super(LossFunction, self).__init__() 13 | 14 | self.test_normalize = True 15 | 16 | self.softmax = softmax.LossFunction(**kwargs) 17 | self.angleproto = angleproto.LossFunction(**kwargs) 18 | 19 | print('Initialised SoftmaxPrototypical Loss') 20 | 21 | def forward(self, x, label=None): 22 | 23 | assert x.size()[1] == 2 24 | 25 | nlossS, prec1 = self.softmax(x.reshape(-1,x.size()[-1]), label.repeat_interleave(2)) 26 | 27 | nlossP, _ = self.angleproto(x,None) 28 | 29 | return nlossS+nlossP, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/triplet.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from tuneThreshold import tuneThresholdfromScore 9 | import random 10 | 11 | class LossFunction(nn.Module): 12 | 13 | def __init__(self, hard_rank=0, hard_prob=0, margin=0, **kwargs): 14 | super(LossFunction, self).__init__() 15 | 16 | self.test_normalize = True 17 | 18 | self.hard_rank = hard_rank 19 | self.hard_prob = hard_prob 20 | self.margin = margin 21 | 22 | print('Initialised Triplet Loss') 23 | 24 | def forward(self, x, label=None): 25 | 26 | assert x.size()[1] == 2 27 | 28 | out_anchor = F.normalize(x[:,0,:], p=2, dim=1) 29 | out_positive = F.normalize(x[:,1,:], p=2, dim=1) 30 | stepsize = out_anchor.size()[0] 31 | 32 | output = -1 * (F.pairwise_distance(out_anchor.unsqueeze(-1),out_positive.unsqueeze(-1).transpose(0,2))**2) 33 | 34 | negidx = self.mineHardNegative(output.detach()) 35 | 36 | out_negative = out_positive[negidx,:] 37 | 38 | labelnp = numpy.array([1]*len(out_positive)+[0]*len(out_negative)) 39 | 40 | ## calculate distances 41 | pos_dist = F.pairwise_distance(out_anchor,out_positive) 42 | neg_dist = F.pairwise_distance(out_anchor,out_negative) 43 | 44 | ## loss function 45 | nloss = torch.mean(F.relu(torch.pow(pos_dist, 2) - torch.pow(neg_dist, 2) + self.margin)) 46 | 47 | scores = -1 * torch.cat([pos_dist,neg_dist],dim=0).detach().cpu().numpy() 48 | 49 | errors = tuneThresholdfromScore(scores, labelnp, []); 50 | 51 | return nloss, errors[1] 52 | 53 | ## ===== ===== ===== ===== ===== ===== ===== ===== 54 | ## Hard negative mining 55 | ## ===== ===== ===== ===== ===== ===== ===== ===== 56 | 57 | def mineHardNegative(self, output): 58 | 59 | negidx = [] 60 | 61 | for idx, similarity in enumerate(output): 62 | 63 | simval, simidx = torch.sort(similarity,descending=True) 64 | 65 | if self.hard_rank < 0: 66 | 67 | ## Semi hard negative mining 68 | 69 | semihardidx = simidx[(similarity[idx] - self.margin < simval) & (simval < similarity[idx])] 70 | 71 | if len(semihardidx) == 0: 72 | negidx.append(random.choice(simidx)) 73 | else: 74 | negidx.append(random.choice(semihardidx)) 75 | 76 | else: 77 | 78 | ## Rank based negative mining 79 | 80 | simidx = simidx[simidx!=idx] 81 | 82 | if random.random() < self.hard_prob: 83 | negidx.append(simidx[random.randint(0, self.hard_rank)]) 84 | else: 85 | negidx.append(random.choice(simidx)) 86 | 87 | return negidx -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/ECAPA_TDNN.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | ## Here, log_input forces alternative mfcc implementation with pre-emphasis instead of actual log mfcc 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torchaudio 11 | import pdb 12 | from utils import PreEmphasis 13 | 14 | class SEModule(nn.Module): 15 | def __init__(self, channels, bottleneck=128): 16 | super(SEModule, self).__init__() 17 | self.se = nn.Sequential( 18 | nn.AdaptiveAvgPool1d(1), 19 | nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0), 20 | nn.ReLU(), 21 | #nn.BatchNorm1d(bottleneck), 22 | nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0), 23 | nn.Sigmoid(), 24 | ) 25 | 26 | def forward(self, input): 27 | x = self.se(input) 28 | return input * x 29 | 30 | class Bottle2neck(nn.Module): 31 | 32 | def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8): 33 | super(Bottle2neck, self).__init__() 34 | width = int(math.floor(planes / scale)) 35 | self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1) 36 | self.bn1 = nn.BatchNorm1d(width*scale) 37 | self.nums = scale -1 38 | convs = [] 39 | bns = [] 40 | num_pad = math.floor(kernel_size/2)*dilation 41 | for i in range(self.nums): 42 | convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad)) 43 | bns.append(nn.BatchNorm1d(width)) 44 | self.convs = nn.ModuleList(convs) 45 | self.bns = nn.ModuleList(bns) 46 | self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1) 47 | self.bn3 = nn.BatchNorm1d(planes) 48 | self.relu = nn.ReLU() 49 | self.width = width 50 | self.se = SEModule(planes) 51 | 52 | def forward(self, x): 53 | residual = x 54 | out = self.conv1(x) 55 | out = self.relu(out) 56 | out = self.bn1(out) 57 | 58 | spx = torch.split(out, self.width, 1) 59 | for i in range(self.nums): 60 | if i==0: 61 | sp = spx[i] 62 | else: 63 | sp = sp + spx[i] 64 | sp = self.convs[i](sp) 65 | sp = self.relu(sp) 66 | sp = self.bns[i](sp) 67 | if i==0: 68 | out = sp 69 | else: 70 | out = torch.cat((out, sp), 1) 71 | out = torch.cat((out, spx[self.nums]),1) 72 | 73 | out = self.conv3(out) 74 | out = self.relu(out) 75 | out = self.bn3(out) 76 | 77 | out = self.se(out) 78 | out += residual 79 | return out 80 | 81 | class FbankAug(nn.Module): 82 | 83 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 84 | self.time_mask_width = time_mask_width 85 | self.freq_mask_width = freq_mask_width 86 | super().__init__() 87 | 88 | def mask_along_axis(self, x, dim): 89 | original_size = x.shape 90 | batch, fea, time = x.shape 91 | if dim == 1: 92 | D = fea 93 | width_range = self.freq_mask_width 94 | else: 95 | D = time 96 | width_range = self.time_mask_width 97 | 98 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 99 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 100 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 101 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 102 | mask = mask.any(dim=1) 103 | 104 | if dim == 1: 105 | mask = mask.unsqueeze(2) 106 | else: 107 | mask = mask.unsqueeze(1) 108 | 109 | x = x.masked_fill_(mask, 0.0) 110 | return x.view(*original_size) 111 | 112 | def forward(self, x): 113 | x = self.mask_along_axis(x, dim=2) 114 | x = self.mask_along_axis(x, dim=1) 115 | return x 116 | 117 | class ECAPA_TDNN(nn.Module): 118 | def __init__(self, block, C, model_scale, log_input=True, 
num_mels=80, num_out=192, **kwargs): 119 | self.log_input = log_input 120 | super(ECAPA_TDNN, self).__init__() 121 | self.scale = model_scale 122 | self.conv1 = nn.Conv1d(num_mels, C, kernel_size=5, stride=1, padding=2) 123 | self.relu = nn.ReLU() 124 | self.bn1 = nn.BatchNorm1d(C) 125 | self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale) 126 | self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale) 127 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale) 128 | self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1) 129 | self.attention = nn.Sequential( 130 | nn.Conv1d(4608, 256, kernel_size=1), 131 | nn.ReLU(), 132 | nn.BatchNorm1d(256), 133 | nn.Tanh(), 134 | nn.Conv1d(256, 1536, kernel_size=1), 135 | nn.Softmax(dim=2), 136 | ) 137 | self.torchfbank = torch.nn.Sequential( 138 | PreEmphasis(), 139 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 140 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=num_mels), 141 | ) 142 | self.specaug = FbankAug() 143 | self.bn5 = nn.BatchNorm1d(3072) 144 | self.fc6 = nn.Linear(3072, num_out) 145 | self.bn6 = nn.BatchNorm1d(num_out) 146 | 147 | def forward(self, x, aug): 148 | with torch.no_grad(): 149 | with torch.cuda.amp.autocast(enabled=False): 150 | x = self.torchfbank(x)+1e-6 151 | if self.log_input: 152 | x = x.log() 153 | x = x - torch.mean(x, dim=-1, keepdim=True) 154 | if aug == True: 155 | x = self.specaug(x) 156 | x = self.conv1(x) 157 | x = self.relu(x) 158 | x = self.bn1(x) 159 | x1 = self.layer1(x) 160 | x2 = self.layer2(x+x1) 161 | x3 = self.layer3(x+x1+x2) 162 | x = self.layer4(torch.cat((x1,x2,x3),dim=1)) 163 | x = self.relu(x) 164 | t = x.size()[-1] 165 | global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1) 166 | w = self.attention(global_x) 167 | mu = torch.sum(x * w, dim=2) 168 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 169 | x = torch.cat((mu,sg),1) 170 | x = self.bn5(x) 171 | x = self.fc6(x) 172 | x = self.bn6(x) 173 | return x 174 | 175 | def MainModel(eca_c=1024, eca_s=8, log_input=True, num_mels=80, num_out=192, **kwargs): 176 | model = ECAPA_TDNN(block=Bottle2neck, C=eca_c, model_scale=eca_s, log_input=log_input, num_mels=num_mels, num_out=num_out, **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/MFA_Conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchaudio 4 | from torch import Tensor 5 | from typing import Tuple 6 | from utils import PreEmphasis 7 | from .specaugment import SpecAugment 8 | from .wenet.transformer.encoder_cat import ConformerEncoder 9 | 10 | class Conformer(nn.Module): 11 | def __init__(self, num_mels=80, num_blocks=6, output_size=256, embedding_dim=192, input_layer="conv2d2", pos_enc_layer_type="rel_pos"): 12 | super(Conformer, self).__init__() 13 | print("input_layer: {}".format(input_layer)) 14 | print("pos_enc_layer_type: {}".format(pos_enc_layer_type)) 15 | self.conformer = ConformerEncoder(input_size=num_mels, num_blocks=num_blocks, output_size=output_size, input_layer=input_layer, pos_enc_layer_type=pos_enc_layer_type, ) 16 | self.bn = nn.BatchNorm1d(output_size*num_blocks*2) 17 | self.fc = nn.Linear(output_size*num_blocks*2, embedding_dim) 18 | 19 | 
self.specaug = SpecAugment() 20 | self.torchfbank = torch.nn.Sequential( 21 | PreEmphasis(), 22 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 23 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80), 24 | ) 25 | output_dim = output_size*num_blocks 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(output_dim*3, 256, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(256), 30 | nn.Tanh(), 31 | nn.Conv1d(256, output_dim, kernel_size=1), 32 | nn.Softmax(dim=2), 33 | ) 34 | 35 | def forward(self, x: Tensor, aug=False) -> Tuple[Tensor, bool]: 36 | 37 | with torch.no_grad(): 38 | with torch.cuda.amp.autocast(enabled=False): 39 | x = self.torchfbank(x)+1e-6 40 | x = x.log() 41 | x = x - torch.mean(x, dim=-1, keepdim=True) 42 | if aug == True: 43 | x = self.specaug(x) 44 | x = x.transpose(1,2) 45 | lens = torch.ones(x.shape[0]).to(x.device) 46 | lens = torch.round(lens*x.shape[1]).int() 47 | x, masks = self.conformer(x, lens) 48 | x = x.transpose(1,2) 49 | 50 | # Context dependent ASP 51 | t = x.size()[-1] 52 | global_x = torch.cat((x,torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1) 53 | w = self.attention(global_x) 54 | mu = torch.sum(x * w, dim=2) 55 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 56 | x = torch.cat((mu, sg), dim=1) 57 | 58 | # BN -> FC: embedding 59 | x = self.bn(x) 60 | x = self.fc(x) 61 | 62 | return x 63 | 64 | def MainModel(num_mels=80, num_out=192, **kwargs): 65 | model = Conformer(num_mels=num_mels, embedding_dim=num_out, input_layer="conv2d2") 66 | return model 67 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/specaugment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class SpecAugment(nn.Module): 5 | 6 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 7 | self.time_mask_width = time_mask_width 8 | self.freq_mask_width = freq_mask_width 9 | super(SpecAugment, self).__init__() 10 | 11 | def mask_along_axis(self, x, dim): 12 | original_size = x.shape 13 | batch, fea, time = x.shape 14 | if dim == 1: 15 | D = fea 16 | width_range = self.freq_mask_width 17 | else: 18 | D = time 19 | width_range = self.time_mask_width 20 | 21 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 22 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 23 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 24 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 25 | mask = mask.any(dim=1) 26 | 27 | if dim == 1: 28 | mask = mask.unsqueeze(2) 29 | else: 30 | mask = mask.unsqueeze(1) 31 | 32 | x = x.masked_fill_(mask, 0.0) 33 | return x.view(*original_size) 34 | 35 | def forward(self, x): 36 | x = self.mask_along_axis(x, dim=2) 37 | x = self.mask_along_axis(x, dim=1) 38 | return x 39 | 40 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/.train.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/bin/.train.py.swp 
-------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: di.wu@mobvoi.com (DI WU) 3 | import os 4 | import argparse 5 | import glob 6 | 7 | import yaml 8 | import numpy as np 9 | import torch 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='average model') 13 | parser.add_argument('--dst_model', required=True, help='averaged model') 14 | parser.add_argument('--src_path', 15 | required=True, 16 | help='src model path for average') 17 | parser.add_argument('--val_best', 18 | action="store_true", 19 | help='averaged model') 20 | parser.add_argument('--num', 21 | default=5, 22 | type=int, 23 | help='nums for averaged model') 24 | parser.add_argument('--min_epoch', 25 | default=0, 26 | type=int, 27 | help='min epoch used for averaging model') 28 | parser.add_argument('--max_epoch', 29 | default=65536, # Big enough 30 | type=int, 31 | help='max epoch used for averaging model') 32 | 33 | args = parser.parse_args() 34 | print(args) 35 | checkpoints = [] 36 | val_scores = [] 37 | if args.val_best: 38 | yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) 39 | for y in yamls: 40 | with open(y, 'r') as f: 41 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 42 | loss = dic_yaml['cv_loss'] 43 | epoch = dic_yaml['epoch'] 44 | if epoch >= args.min_epoch and epoch <= args.max_epoch: 45 | val_scores += [[epoch, loss]] 46 | val_scores = np.array(val_scores) 47 | sort_idx = np.argsort(val_scores[:, -1]) 48 | sorted_val_scores = val_scores[sort_idx][::1] 49 | print("best val scores = " + str(sorted_val_scores[:args.num, 1])) 50 | print("selected epochs = " + 51 | str(sorted_val_scores[:args.num, 0].astype(np.int64))) 52 | path_list = [ 53 | args.src_path + '/{}.pt'.format(int(epoch)) 54 | for epoch in sorted_val_scores[:args.num, 0] 55 | ] 56 | else: 57 | path_list = glob.glob('{}/[!avg][!final]*.pt'.format(args.src_path)) 58 | path_list = sorted(path_list, key=os.path.getmtime) 59 | path_list = path_list[-args.num:] 60 | print(path_list) 61 | avg = None 62 | num = args.num 63 | assert num == len(path_list) 64 | for path in path_list: 65 | print('Processing {}'.format(path)) 66 | states = torch.load(path, map_location=torch.device('cpu')) 67 | if avg is None: 68 | avg = states 69 | else: 70 | for k in avg.keys(): 71 | avg[k] += states[k] 72 | # average 73 | for k in avg.keys(): 74 | if avg[k] is not None: 75 | # pytorch 1.6 use true_divide instead of /= 76 | avg[k] = torch.true_divide(avg[k], num) 77 | print('Saving to {}'.format(args.dst_model)) 78 | torch.save(avg, args.dst_model) 79 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import os 19 | 20 | import torch 21 | import yaml 22 | 23 | from wenet.transformer.asr_model import init_asr_model 24 | from wenet.utils.checkpoint import load_checkpoint 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='export your script model') 28 | parser.add_argument('--config', required=True, help='config file') 29 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 30 | parser.add_argument('--output_file', required=True, help='output file') 31 | parser.add_argument('--output_quant_file', 32 | default=None, 33 | help='output quantized model file') 34 | args = parser.parse_args() 35 | # No need gpu for model export 36 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 37 | 38 | with open(args.config, 'r') as fin: 39 | configs = yaml.load(fin, Loader=yaml.FullLoader) 40 | model = init_asr_model(configs) 41 | print(model) 42 | 43 | load_checkpoint(model, args.checkpoint) 44 | # Export jit torch script model 45 | 46 | script_model = torch.jit.script(model) 47 | script_model.save(args.output_file) 48 | print('Export model successfully, see {}'.format(args.output_file)) 49 | 50 | # Export quantized jit torch script model 51 | if args.output_quant_file: 52 | quantized_model = torch.quantization.quantize_dynamic( 53 | model, {torch.nn.Linear}, dtype=torch.qint8 54 | ) 55 | print(quantized_model) 56 | script_quant_model = torch.jit.script(quantized_model) 57 | script_quant_model.save(args.output_quant_file) 58 | print('Export quantized model successfully, ' 59 | 'see {}'.format(args.output_quant_file)) 60 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc 
-------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc -------------------------------------------------------------------------------- 
/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc -------------------------------------------------------------------------------- 
/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | 19 | class GlobalCMVN(torch.nn.Module): 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """ConvolutionModule definition.""" 7 | 8 | from typing import Optional, Tuple 9 | 10 | import torch 11 | from torch import nn 12 | from typeguard import check_argument_types 13 | 14 | 15 | class ConvolutionModule(nn.Module): 16 | """ConvolutionModule in Conformer model.""" 17 | def __init__(self, 18 | channels: int, 19 | kernel_size: int = 15, 20 | activation: nn.Module = nn.ReLU(), 21 | norm: str = "batch_norm", 22 | causal: bool = False, 23 | bias: bool = True): 24 | """Construct an ConvolutionModule object. 25 | Args: 26 | channels (int): The number of channels of conv layers. 27 | kernel_size (int): Kernel size of conv layers. 28 | causal (int): Whether use causal convolution or not 29 | """ 30 | assert check_argument_types() 31 | super().__init__() 32 | 33 | self.pointwise_conv1 = nn.Conv1d( 34 | channels, 35 | 2 * channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0, 39 | bias=bias, 40 | ) 41 | # self.lorder is used to distinguish if it's a causal convolution, 42 | # if self.lorder > 0: it's a causal convolution, the input will be 43 | # padded with self.lorder frames on the left in forward. 44 | # else: it's a symmetrical convolution 45 | if causal: 46 | padding = 0 47 | self.lorder = kernel_size - 1 48 | else: 49 | # kernel_size should be an odd number for none causal convolution 50 | assert (kernel_size - 1) % 2 == 0 51 | padding = (kernel_size - 1) // 2 52 | self.lorder = 0 53 | self.depthwise_conv = nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | stride=1, 58 | padding=padding, 59 | groups=channels, 60 | bias=bias, 61 | ) 62 | 63 | assert norm in ['batch_norm', 'layer_norm'] 64 | if norm == "batch_norm": 65 | self.use_layer_norm = False 66 | self.norm = nn.BatchNorm1d(channels) 67 | else: 68 | self.use_layer_norm = True 69 | self.norm = nn.LayerNorm(channels) 70 | 71 | self.pointwise_conv2 = nn.Conv1d( 72 | channels, 73 | channels, 74 | kernel_size=1, 75 | stride=1, 76 | padding=0, 77 | bias=bias, 78 | ) 79 | self.activation = activation 80 | 81 | def forward( 82 | self, 83 | x: torch.Tensor, 84 | mask_pad: Optional[torch.Tensor] = None, 85 | cache: Optional[torch.Tensor] = None, 86 | ) -> Tuple[torch.Tensor, torch.Tensor]: 87 | """Compute convolution module. 88 | Args: 89 | x (torch.Tensor): Input tensor (#batch, time, channels). 
90 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time) 91 | cache (torch.Tensor): left context cache, it is only 92 | used in causal convolution 93 | Returns: 94 | torch.Tensor: Output tensor (#batch, time, channels). 95 | """ 96 | # exchange the temporal dimension and the feature dimension 97 | x = x.transpose(1, 2) # (#batch, channels, time) 98 | 99 | # mask batch padding 100 | if mask_pad is not None: 101 | x.masked_fill_(~mask_pad, 0.0) 102 | 103 | if self.lorder > 0: 104 | if cache is None: 105 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 106 | else: 107 | assert cache.size(0) == x.size(0) 108 | assert cache.size(1) == x.size(1) 109 | x = torch.cat((cache, x), dim=2) 110 | assert (x.size(2) > self.lorder) 111 | new_cache = x[:, :, -self.lorder:] 112 | else: 113 | # It's better we just return None if no cache is requried, 114 | # However, for JIT export, here we just fake one tensor instead of 115 | # None. 116 | new_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) 117 | 118 | # GLU mechanism 119 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 120 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 121 | 122 | # 1D Depthwise Conv 123 | x = self.depthwise_conv(x) 124 | if self.use_layer_norm: 125 | x = x.transpose(1, 2) 126 | x = self.activation(self.norm(x)) 127 | if self.use_layer_norm: 128 | x = x.transpose(1, 2) 129 | x = self.pointwise_conv2(x) 130 | # mask batch padding 131 | if mask_pad is not None: 132 | x.masked_fill_(~mask_pad, 0.0) 133 | 134 | return x.transpose(1, 2), new_cache 135 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | 6 | class CTC(torch.nn.Module): 7 | """CTC module""" 8 | def __init__( 9 | self, 10 | odim: int, 11 | encoder_output_size: int, 12 | dropout_rate: float = 0.0, 13 | reduce: bool = True, 14 | ): 15 | """ Construct CTC module 16 | Args: 17 | odim: dimension of outputs 18 | encoder_output_size: number of encoder projection units 19 | dropout_rate: dropout rate (0.0 ~ 1.0) 20 | reduce: reduce the CTC loss into a scalar 21 | """ 22 | assert check_argument_types() 23 | super().__init__() 24 | eprojs = encoder_output_size 25 | self.dropout_rate = dropout_rate 26 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 27 | 28 | reduction_type = "sum" if reduce else "none" 29 | self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) 30 | 31 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 32 | ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: 33 | """Calculate CTC loss. 
34 | 35 | Args: 36 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 37 | hlens: batch of lengths of hidden state sequences (B) 38 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 39 | ys_lens: batch of lengths of character sequence (B) 40 | """ 41 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 42 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 43 | # ys_hat: (B, L, D) -> (L, B, D) 44 | ys_hat = ys_hat.transpose(0, 1) 45 | ys_hat = ys_hat.log_softmax(2) 46 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 47 | # Batch-size average 48 | loss = loss / ys_hat.size(1) 49 | return loss 50 | 51 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 52 | """log_softmax of frame activations 53 | 54 | Args: 55 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 56 | Returns: 57 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 58 | """ 59 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 60 | 61 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 62 | """argmax of frame activations 63 | 64 | Args: 65 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 66 | Returns: 67 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 68 | """ 69 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 70 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Decoder self-attention layer definition.""" 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class DecoderLayer(nn.Module): 14 | """Single decoder layer module. 15 | 16 | Args: 17 | size (int): Input dimension. 18 | self_attn (torch.nn.Module): Self-attention module instance. 19 | `MultiHeadedAttention` instance can be used as the argument. 20 | src_attn (torch.nn.Module): Inter-attention module instance. 21 | `MultiHeadedAttention` instance can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward` instance can be used as the argument. 24 | dropout_rate (float): Dropout rate. 25 | normalize_before (bool): 26 | True: use layer_norm before each sub-block. 27 | False: to use layer_norm after each sub-block. 28 | concat_after (bool): Whether to concat attention layer's inpu 29 | and output. 
30 | True: x -> x + linear(concat(x, att(x))) 31 | False: x -> x + att(x) 32 | """ 33 | def __init__( 34 | self, 35 | size: int, 36 | self_attn: nn.Module, 37 | src_attn: nn.Module, 38 | feed_forward: nn.Module, 39 | dropout_rate: float, 40 | normalize_before: bool = True, 41 | concat_after: bool = False, 42 | ): 43 | """Construct an DecoderLayer object.""" 44 | super().__init__() 45 | self.size = size 46 | self.self_attn = self_attn 47 | self.src_attn = src_attn 48 | self.feed_forward = feed_forward 49 | self.norm1 = nn.LayerNorm(size, eps=1e-12) 50 | self.norm2 = nn.LayerNorm(size, eps=1e-12) 51 | self.norm3 = nn.LayerNorm(size, eps=1e-12) 52 | self.dropout = nn.Dropout(dropout_rate) 53 | self.normalize_before = normalize_before 54 | self.concat_after = concat_after 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward( 59 | self, 60 | tgt: torch.Tensor, 61 | tgt_mask: torch.Tensor, 62 | memory: torch.Tensor, 63 | memory_mask: torch.Tensor, 64 | cache: Optional[torch.Tensor] = None 65 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 66 | """Compute decoded features. 67 | 68 | Args: 69 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 70 | tgt_mask (torch.Tensor): Mask for input tensor 71 | (#batch, maxlen_out). 72 | memory (torch.Tensor): Encoded memory 73 | (#batch, maxlen_in, size). 74 | memory_mask (torch.Tensor): Encoded memory mask 75 | (#batch, maxlen_in). 76 | cache (torch.Tensor): cached tensors. 77 | (#batch, maxlen_out - 1, size). 78 | 79 | Returns: 80 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 81 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 82 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 83 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
84 | 85 | """ 86 | residual = tgt 87 | if self.normalize_before: 88 | tgt = self.norm1(tgt) 89 | 90 | if cache is None: 91 | tgt_q = tgt 92 | tgt_q_mask = tgt_mask 93 | else: 94 | # compute only the last frame query keeping dim: max_time_out -> 1 95 | assert cache.shape == ( 96 | tgt.shape[0], 97 | tgt.shape[1] - 1, 98 | self.size, 99 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 100 | tgt_q = tgt[:, -1:, :] 101 | residual = residual[:, -1:, :] 102 | tgt_q_mask = tgt_mask[:, -1:, :] 103 | 104 | if self.concat_after: 105 | tgt_concat = torch.cat( 106 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) 107 | x = residual + self.concat_linear1(tgt_concat) 108 | else: 109 | x = residual + self.dropout( 110 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 111 | if not self.normalize_before: 112 | x = self.norm1(x) 113 | 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | if self.concat_after: 118 | x_concat = torch.cat( 119 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) 120 | x = residual + self.concat_linear2(x_concat) 121 | else: 122 | x = residual + self.dropout( 123 | self.src_attn(x, memory, memory, memory_mask)) 124 | if not self.normalize_before: 125 | x = self.norm2(x) 126 | 127 | residual = x 128 | if self.normalize_before: 129 | x = self.norm3(x) 130 | x = residual + self.dropout(self.feed_forward(x)) 131 | if not self.normalize_before: 132 | x = self.norm3(x) 133 | 134 | if cache is not None: 135 | x = torch.cat([cache, x], dim=1) 136 | 137 | return x, tgt_mask, memory, memory_mask 138 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """Positonal Encoding Module.""" 7 | 8 | import math 9 | from typing import Tuple 10 | 11 | import torch 12 | 13 | 14 | class PositionalEncoding(torch.nn.Module): 15 | """Positional encoding. 16 | 17 | :param int d_model: embedding dim 18 | :param float dropout_rate: dropout rate 19 | :param int max_len: maximum input length 20 | 21 | PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) 22 | PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) 23 | """ 24 | def __init__(self, 25 | d_model: int, 26 | dropout_rate: float, 27 | max_len: int = 50000, 28 | reverse: bool = False): 29 | """Construct an PositionalEncoding object.""" 30 | super().__init__() 31 | self.d_model = d_model 32 | self.xscale = math.sqrt(self.d_model) 33 | self.dropout = torch.nn.Dropout(p=dropout_rate) 34 | self.max_len = max_len 35 | 36 | self.pe = torch.zeros(self.max_len, self.d_model) 37 | position = torch.arange(0, self.max_len, 38 | dtype=torch.float32).unsqueeze(1) 39 | div_term = torch.exp( 40 | torch.arange(0, self.d_model, 2, dtype=torch.float32) * 41 | -(math.log(10000.0) / self.d_model)) 42 | self.pe[:, 0::2] = torch.sin(position * div_term) 43 | self.pe[:, 1::2] = torch.cos(position * div_term) 44 | self.pe = self.pe.unsqueeze(0) 45 | 46 | def forward(self, 47 | x: torch.Tensor, 48 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 49 | """Add positional encoding. 50 | 51 | Args: 52 | x (torch.Tensor): Input. Its shape is (batch, time, ...) 53 | offset (int): position offset 54 | 55 | Returns: 56 | torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) 
57 | torch.Tensor: for compatibility to RelPositionalEncoding 58 | """ 59 | assert offset + x.size(1) < self.max_len 60 | self.pe = self.pe.to(x.device) 61 | pos_emb = self.pe[:, offset:offset + x.size(1)] 62 | x = x * self.xscale + pos_emb 63 | return self.dropout(x), self.dropout(pos_emb) 64 | 65 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 66 | """ For getting encoding in a streaming fashion 67 | 68 | Attention!!!!! 69 | we apply dropout only once at the whole utterance level in a none 70 | streaming way, but will call this function several times with 71 | increasing input size in a streaming scenario, so the dropout will 72 | be applied several times. 73 | 74 | Args: 75 | offset (int): start offset 76 | size (int): requried size of position encoding 77 | 78 | Returns: 79 | torch.Tensor: Corresponding encoding 80 | """ 81 | assert offset + size < self.max_len 82 | return self.dropout(self.pe[:, offset:offset + size]) 83 | 84 | 85 | class RelPositionalEncoding(PositionalEncoding): 86 | """Relative positional encoding module. 87 | See : Appendix B in https://arxiv.org/abs/1901.02860 88 | Args: 89 | d_model (int): Embedding dimension. 90 | dropout_rate (float): Dropout rate. 91 | max_len (int): Maximum input length. 92 | """ 93 | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 100000): 94 | """Initialize class.""" 95 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 96 | 97 | def forward(self, 98 | x: torch.Tensor, 99 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 100 | """Compute positional encoding. 101 | Args: 102 | x (torch.Tensor): Input tensor (batch, time, `*`). 103 | Returns: 104 | torch.Tensor: Encoded tensor (batch, time, `*`). 105 | torch.Tensor: Positional embedding tensor (1, time, `*`). 106 | """ 107 | assert offset + x.size(1) < self.max_len 108 | self.pe = self.pe.to(x.device) 109 | x = x * self.xscale 110 | pos_emb = self.pe[:, offset:offset + x.size(1)] 111 | return self.dropout(x), self.dropout(pos_emb) 112 | 113 | 114 | class NoPositionalEncoding(torch.nn.Module): 115 | """ No position encoding 116 | """ 117 | def __init__(self, d_model: int, dropout_rate: float): 118 | super().__init__() 119 | self.d_model = d_model 120 | self.dropout = torch.nn.Dropout(p=dropout_rate) 121 | 122 | def forward(self, 123 | x: torch.Tensor, 124 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 125 | """ Just return zero vector for interface compatibility 126 | """ 127 | pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) 128 | return self.dropout(x), pos_emb 129 | 130 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 131 | return torch.zeros(1, size, self.d_model) 132 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Label smoothing module.""" 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | """Label-smoothing loss. 
14 | 15 | In a standard CE loss, the label's data distribution is: 16 | [0,1,2] -> 17 | [ 18 | [1.0, 0.0, 0.0], 19 | [0.0, 1.0, 0.0], 20 | [1.0, 0.0, 1.0], 21 | ] 22 | 23 | In the smoothing version CE Loss,some probabilities 24 | are taken from the true label prob (1.0) and are divided 25 | among other labels. 26 | 27 | e.g. 28 | smoothing=0.1 29 | [0,1,2] -> 30 | [ 31 | [0.9, 0.05, 0.05], 32 | [0.05, 0.9, 0.05], 33 | [0.05, 0.05, 0.9], 34 | ] 35 | 36 | Args: 37 | size (int): the number of class 38 | padding_idx (int): padding class id which will be ignored for loss 39 | smoothing (float): smoothing rate (0.0 means the conventional CE) 40 | normalize_length (bool): 41 | normalize loss by sequence length if True 42 | normalize loss by batch size if False 43 | """ 44 | def __init__(self, 45 | size: int, 46 | padding_idx: int, 47 | smoothing: float, 48 | normalize_length: bool = False): 49 | """Construct an LabelSmoothingLoss object.""" 50 | super(LabelSmoothingLoss, self).__init__() 51 | self.criterion = nn.KLDivLoss(reduction="none") 52 | self.padding_idx = padding_idx 53 | self.confidence = 1.0 - smoothing 54 | self.smoothing = smoothing 55 | self.size = size 56 | self.normalize_length = normalize_length 57 | 58 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 59 | """Compute loss between x and target. 60 | 61 | The model outputs and data labels tensors are flatten to 62 | (batch*seqlen, class) shape and a mask is applied to the 63 | padding part which should not be calculated for loss. 64 | 65 | Args: 66 | x (torch.Tensor): prediction (batch, seqlen, class) 67 | target (torch.Tensor): 68 | target signal masked with self.padding_id (batch, seqlen) 69 | Returns: 70 | loss (torch.Tensor) : The KL loss, scalar float value 71 | """ 72 | assert x.size(2) == self.size 73 | batch_size = x.size(0) 74 | x = x.view(-1, self.size) 75 | target = target.view(-1) 76 | # use zeros_like instead of torch.no_grad() for true_dist, 77 | # since no_grad() can not be exported by JIT 78 | true_dist = torch.zeros_like(x) 79 | true_dist.fill_(self.smoothing / (self.size - 1)) 80 | ignore = target == self.padding_idx # (B,) 81 | total = len(target) - ignore.sum().item() 82 | target = target.masked_fill(ignore, 0) # avoid -1 index 83 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 84 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 85 | denom = total if self.normalize_length else batch_size 86 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 87 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Positionwise feed forward layer definition.""" 7 | 8 | import torch 9 | 10 | 11 | class PositionwiseFeedForward(torch.nn.Module): 12 | """Positionwise feed forward layer. 13 | 14 | FeedForward are appied on each position of the sequence. 15 | The output dim is same with the input dim. 16 | 17 | Args: 18 | idim (int): Input dimenstion. 19 | hidden_units (int): The number of hidden units. 20 | dropout_rate (float): Dropout rate. 
21 | activation (torch.nn.Module): Activation function 22 | """ 23 | def __init__(self, 24 | idim: int, 25 | hidden_units: int, 26 | dropout_rate: float, 27 | activation: torch.nn.Module = torch.nn.ReLU()): 28 | """Construct a PositionwiseFeedForward object.""" 29 | super(PositionwiseFeedForward, self).__init__() 30 | self.w_1 = torch.nn.Linear(idim, hidden_units) 31 | self.activation = activation 32 | self.dropout = torch.nn.Dropout(dropout_rate) 33 | self.w_2 = torch.nn.Linear(hidden_units, idim) 34 | 35 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 36 | """Forward function. 37 | 38 | Args: 39 | xs: input tensor (B, L, D) 40 | Returns: 41 | output tensor, (B, L, D) 42 | """ 43 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 44 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | """Swish() activation function for Conformer.""" 8 | 9 | import torch 10 | 11 | 12 | class Swish(torch.nn.Module): 13 | """Construct an Swish object.""" 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | """Return Swish activation function.""" 16 | return x * torch.sigmoid(x) 17 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | import os 6 | import re 7 | 8 | import yaml 9 | import torch 10 | 11 | 12 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 13 | if torch.cuda.is_available(): 14 | logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) 15 | checkpoint = torch.load(path) 16 | else: 17 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) 18 | checkpoint = torch.load(path, map_location='cpu') 19 | model.load_state_dict(checkpoint) 20 | info_path = re.sub('.pt$', '.yaml', path) 21 | configs = {} 22 | if os.path.exists(info_path): 23 | with open(info_path, 'r') as fin: 24 | configs = yaml.load(fin, Loader=yaml.FullLoader) 25 | return configs 26 | 27 | 28 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 29 | ''' 30 | Args: 31 | infos (dict or None): any info you want to save. 32 | ''' 33 | logging.info('Checkpoint: save to checkpoint %s' % path) 34 | if isinstance(model, torch.nn.DataParallel): 35 | state_dict = model.module.state_dict() 36 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 37 | state_dict = model.module.state_dict() 38 | else: 39 | state_dict = model.state_dict() 40 | torch.save(state_dict, path) 41 | info_path = re.sub('.pt$', '.yaml', path) 42 | if infos is None: 43 | infos = {} 44 | with open(info_path, 'w') as fout: 45 | data = yaml.dump(infos) 46 | fout.write(data) 47 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
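# _load_kaldi_cmvn below reports unsupported binary stats files via logging.error
# and sys.exit, so these two imports are needed in addition to the ones that follow.
import logging
import sys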
15 | 16 | import json 17 | import math 18 | 19 | import numpy as np 20 | 21 | 22 | def _load_json_cmvn(json_cmvn_file): 23 | """ Load the json format cmvn stats file and calculate cmvn 24 | 25 | Args: 26 | json_cmvn_file: cmvn stats file in json format 27 | 28 | Returns: 29 | a numpy array of [means, vars] 30 | """ 31 | with open(json_cmvn_file) as f: 32 | cmvn_stats = json.load(f) 33 | 34 | means = cmvn_stats['mean_stat'] 35 | variance = cmvn_stats['var_stat'] 36 | count = cmvn_stats['frame_num'] 37 | for i in range(len(means)): 38 | means[i] /= count 39 | variance[i] = variance[i] / count - means[i] * means[i] 40 | if variance[i] < 1.0e-20: 41 | variance[i] = 1.0e-20 42 | variance[i] = 1.0 / math.sqrt(variance[i]) 43 | cmvn = np.array([means, variance]) 44 | return cmvn 45 | 46 | 47 | def _load_kaldi_cmvn(kaldi_cmvn_file): 48 | """ Load the kaldi format cmvn stats file and calculate cmvn 49 | 50 | Args: 51 | kaldi_cmvn_file: kaldi text style global cmvn file, which 52 | is generated by: 53 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 54 | 55 | Returns: 56 | a numpy array of [means, vars] 57 | """ 58 | means = [] 59 | variance = [] 60 | with open(kaldi_cmvn_file, 'r') as fid: 61 | # kaldi binary file start with '\0B' 62 | if fid.read(2) == '\0B': 63 | logging.error('kaldi cmvn binary file is not supported, please ' 64 | 'recompute it by: compute-cmvn-stats --binary=false ' 65 | ' scp:feats.scp global_cmvn') 66 | sys.exit(1) 67 | fid.seek(0) 68 | arr = fid.read().split() 69 | assert (arr[0] == '[') 70 | assert (arr[-2] == '0') 71 | assert (arr[-1] == ']') 72 | feat_dim = int((len(arr) - 2 - 2) / 2) 73 | for i in range(1, feat_dim + 1): 74 | means.append(float(arr[i])) 75 | count = float(arr[feat_dim + 1]) 76 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 77 | variance.append(float(arr[i])) 78 | 79 | for i in range(len(means)): 80 | means[i] /= count 81 | variance[i] = variance[i] / count - means[i] * means[i] 82 | if variance[i] < 1.0e-20: 83 | variance[i] = 1.0e-20 84 | variance[i] = 1.0 / math.sqrt(variance[i]) 85 | cmvn = np.array([means, variance]) 86 | return cmvn 87 | 88 | 89 | def load_cmvn(cmvn_file, is_json): 90 | if is_json: 91 | cmvn = _load_json_cmvn(cmvn_file) 92 | else: 93 | cmvn = _load_kaldi_cmvn(cmvn_file) 94 | return cmvn[0], cmvn[1] 95 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/common.py: -------------------------------------------------------------------------------- 1 | """Unility functions for Transformer.""" 2 | 3 | import math 4 | from typing import Tuple, List 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | IGNORE_ID = -1 10 | 11 | 12 | def pad_list(xs: List[torch.Tensor], pad_value: int): 13 | """Perform padding for the list of tensors. 14 | 15 | Args: 16 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 17 | pad_value (float): Value for padding. 18 | 19 | Returns: 20 | Tensor: Padded tensor (B, Tmax, `*`). 
21 | 22 | Examples: 23 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 24 | >>> x 25 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 26 | >>> pad_list(x, 0) 27 | tensor([[1., 1., 1., 1.], 28 | [1., 1., 0., 0.], 29 | [1., 0., 0., 0.]]) 30 | 31 | """ 32 | n_batch = len(xs) 33 | max_len = max([x.size(0) for x in xs]) 34 | pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) 35 | pad = pad.fill_(pad_value) 36 | for i in range(n_batch): 37 | pad[i, :xs[i].size(0)] = xs[i] 38 | 39 | return pad 40 | 41 | 42 | def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, 43 | ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: 44 | """Add and labels. 45 | 46 | Args: 47 | ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) 48 | sos (int): index of 49 | eos (int): index of 50 | ignore_id (int): index of padding 51 | 52 | Returns: 53 | ys_in (torch.Tensor) : (B, Lmax + 1) 54 | ys_out (torch.Tensor) : (B, Lmax + 1) 55 | 56 | Examples: 57 | >>> sos_id = 10 58 | >>> eos_id = 11 59 | >>> ignore_id = -1 60 | >>> ys_pad 61 | tensor([[ 1, 2, 3, 4, 5], 62 | [ 4, 5, 6, -1, -1], 63 | [ 7, 8, 9, -1, -1]], dtype=torch.int32) 64 | >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) 65 | >>> ys_in 66 | tensor([[10, 1, 2, 3, 4, 5], 67 | [10, 4, 5, 6, 11, 11], 68 | [10, 7, 8, 9, 11, 11]]) 69 | >>> ys_out 70 | tensor([[ 1, 2, 3, 4, 5, 11], 71 | [ 4, 5, 6, 11, -1, -1], 72 | [ 7, 8, 9, 11, -1, -1]]) 73 | """ 74 | _sos = torch.tensor([sos], 75 | dtype=torch.long, 76 | requires_grad=False, 77 | device=ys_pad.device) 78 | _eos = torch.tensor([eos], 79 | dtype=torch.long, 80 | requires_grad=False, 81 | device=ys_pad.device) 82 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 83 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 84 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 85 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 86 | 87 | 88 | def reverse_pad_list(ys_pad: torch.Tensor, 89 | ys_lens: torch.Tensor, 90 | pad_value: float = -1.0) -> torch.Tensor: 91 | """Reverse padding for the list of tensors. 92 | 93 | Args: 94 | ys_pad (tensor): The padded tensor (B, Tokenmax). 95 | ys_lens (tensor): The lens of token seqs (B) 96 | pad_value (int): Value for padding. 97 | 98 | Returns: 99 | Tensor: Padded tensor (B, Tokenmax). 100 | 101 | Examples: 102 | >>> x 103 | tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) 104 | >>> pad_list(x, 0) 105 | tensor([[4, 3, 2, 1], 106 | [7, 6, 5, 0], 107 | [9, 8, 0, 0]]) 108 | 109 | """ 110 | r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) 111 | for y, i in zip(ys_pad, ys_lens)], True, 112 | pad_value) 113 | return r_ys_pad 114 | 115 | 116 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 117 | ignore_label: int) -> float: 118 | """Calculate accuracy. 119 | 120 | Args: 121 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 122 | pad_targets (LongTensor): Target label tensors (B, Lmax, D). 123 | ignore_label (int): Ignore label id. 124 | 125 | Returns: 126 | float: Accuracy value (0.0 - 1.0). 
127 | 128 | """ 129 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 130 | pad_outputs.size(1)).argmax(2) 131 | mask = pad_targets != ignore_label 132 | numerator = torch.sum( 133 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 134 | denominator = torch.sum(mask) 135 | return float(numerator) / float(denominator) 136 | 137 | 138 | def get_activation(act): 139 | """Return activation function.""" 140 | # Lazy load to avoid unused import 141 | #from wenet.transformer.swish import Swish 142 | from ..transformer.swish import Swish 143 | 144 | activation_funcs = { 145 | "hardtanh": torch.nn.Hardtanh, 146 | "tanh": torch.nn.Tanh, 147 | "relu": torch.nn.ReLU, 148 | "selu": torch.nn.SELU, 149 | "swish": Swish, 150 | "gelu": torch.nn.GELU 151 | } 152 | 153 | return activation_funcs[act]() 154 | 155 | 156 | def get_subsample(config): 157 | input_layer = config["encoder_conf"]["input_layer"] 158 | assert input_layer in ["conv2d", "conv2d6", "conv2d8"] 159 | if input_layer == "conv2d": 160 | return 4 161 | elif input_layer == "conv2d6": 162 | return 6 163 | elif input_layer == "conv2d8": 164 | return 8 165 | 166 | 167 | def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: 168 | new_hyp: List[int] = [] 169 | cur = 0 170 | while cur < len(hyp): 171 | if hyp[cur] != 0: 172 | new_hyp.append(hyp[cur]) 173 | prev = cur 174 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 175 | cur += 1 176 | return new_hyp 177 | 178 | 179 | def log_add(args: List[int]) -> float: 180 | """ 181 | Stable log add 182 | """ 183 | if all(a == -float('inf') for a in args): 184 | return -float('inf') 185 | a_max = max(args) 186 | lsp = math.log(sum(math.exp(a - a_max) for a in args)) 187 | return a_max + lsp 188 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/ctc_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Di Wu) 3 | 4 | import numpy as np 5 | import torch 6 | 7 | def insert_blank(label, blank_id=0): 8 | """Insert blank token between every two label token.""" 9 | label = np.expand_dims(label, 1) 10 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 11 | label = np.concatenate([blanks, label], axis=1) 12 | label = label.reshape(-1) 13 | label = np.append(label, label[0]) 14 | return label 15 | 16 | def forced_align(ctc_probs: torch.Tensor, 17 | y: torch.Tensor, 18 | blank_id=0) -> list: 19 | """ctc forced alignment. 
20 | 21 | Args: 22 | torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) 23 | torch.Tensor y: id sequence tensor 1d tensor (L) 24 | int blank_id: blank symbol index 25 | Returns: 26 | torch.Tensor: alignment result 27 | """ 28 | y_insert_blank = insert_blank(y, blank_id) 29 | 30 | log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) 31 | log_alpha = log_alpha - float('inf') # log of zero 32 | state_path = (torch.zeros( 33 | (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 34 | ) # state path 35 | 36 | # init start state 37 | log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] 38 | log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] 39 | 40 | for t in range(1, ctc_probs.size(0)): 41 | for s in range(len(y_insert_blank)): 42 | if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ 43 | s] == y_insert_blank[s - 2]: 44 | candidates = torch.tensor( 45 | [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) 46 | prev_state = [s, s - 1] 47 | else: 48 | candidates = torch.tensor([ 49 | log_alpha[t - 1, s], 50 | log_alpha[t - 1, s - 1], 51 | log_alpha[t - 1, s - 2], 52 | ]) 53 | prev_state = [s, s - 1, s - 2] 54 | log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] 55 | state_path[t, s] = prev_state[torch.argmax(candidates)] 56 | 57 | state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) 58 | 59 | candidates = torch.tensor([ 60 | log_alpha[-1, len(y_insert_blank) - 1], 61 | log_alpha[-1, len(y_insert_blank) - 2] 62 | ]) 63 | prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] 64 | state_seq[-1] = prev_state[torch.argmax(candidates)] 65 | for t in range(ctc_probs.size(0) - 2, -1, -1): 66 | state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] 67 | 68 | output_alignment = [] 69 | for t in range(0, ctc_probs.size(0)): 70 | output_alignment.append(y_insert_blank[state_seq[t, 0]]) 71 | 72 | return output_alignment 73 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | from contextlib import nullcontext 6 | # if your python version < 3.7 use the below one 7 | # from contextlib import suppress as nullcontext 8 | import torch 9 | from torch.nn.utils import clip_grad_norm_ 10 | 11 | 12 | class Executor: 13 | def __init__(self): 14 | self.step = 0 15 | 16 | def train(self, model, optimizer, scheduler, data_loader, device, writer, 17 | args, scaler): 18 | ''' Train one epoch 19 | ''' 20 | model.train() 21 | clip = args.get('grad_clip', 50.0) 22 | log_interval = args.get('log_interval', 10) 23 | rank = args.get('rank', 0) 24 | accum_grad = args.get('accum_grad', 1) 25 | is_distributed = args.get('is_distributed', True) 26 | use_amp = args.get('use_amp', False) 27 | logging.info('using accumulate grad, new batch size is {} times' 28 | 'larger than before'.format(accum_grad)) 29 | if use_amp: 30 | assert scaler is not None 31 | num_seen_utts = 0 32 | num_total_batch = len(data_loader) 33 | for batch_idx, batch in enumerate(data_loader): 34 | key, feats, target, feats_lengths, target_lengths = batch 35 | feats = feats.to(device) 36 | target = target.to(device) 37 | feats_lengths = feats_lengths.to(device) 38 | target_lengths = target_lengths.to(device) 39 | num_utts = target_lengths.size(0) 40 | if num_utts == 0: 41 | continue 42 | context = None 43 | # Disable gradient synchronizations across DDP processes. 44 | # Within this context, gradients will be accumulated on module 45 | # variables, which will later be synchronized. 46 | if is_distributed and batch_idx % accum_grad != 0: 47 | context = model.no_sync 48 | # Used for single gpu training and DDP gradient synchronization 49 | # processes. 50 | else: 51 | context = nullcontext 52 | with context(): 53 | # autocast context 54 | # The more details about amp can be found in 55 | # https://pytorch.org/docs/stable/notes/amp_examples.html 56 | with torch.cuda.amp.autocast(scaler is not None): 57 | loss, loss_att, loss_ctc = model(feats, feats_lengths, 58 | target, target_lengths) 59 | loss = loss / accum_grad 60 | if use_amp: 61 | scaler.scale(loss).backward() 62 | else: 63 | loss.backward() 64 | 65 | num_seen_utts += num_utts 66 | if batch_idx % accum_grad == 0: 67 | if rank == 0 and writer is not None: 68 | writer.add_scalar('train_loss', loss, self.step) 69 | # Use mixed precision training 70 | if use_amp: 71 | scaler.unscale_(optimizer) 72 | grad_norm = clip_grad_norm_(model.parameters(), clip) 73 | # Must invoke scaler.update() if unscale_() is used in the 74 | # iteration to avoid the following error: 75 | # RuntimeError: unscale_() has already been called 76 | # on this optimizer since the last update(). 77 | # We don't check grad here since that if the gradient has 78 | # inf/nan values, scaler.step will skip optimizer.step(). 
79 | scaler.step(optimizer) 80 | scaler.update() 81 | else: 82 | grad_norm = clip_grad_norm_(model.parameters(), clip) 83 | if torch.isfinite(grad_norm): 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | scheduler.step() 87 | self.step += 1 88 | if batch_idx % log_interval == 0: 89 | lr = optimizer.param_groups[0]['lr'] 90 | log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( 91 | batch_idx, num_total_batch, 92 | loss.item() * accum_grad) 93 | if loss_att is not None: 94 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 95 | if loss_ctc is not None: 96 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 97 | log_str += 'lr {:.8f} rank {}'.format(lr, rank) 98 | logging.debug(log_str) 99 | 100 | def cv(self, model, data_loader, device, args): 101 | ''' Cross validation on 102 | ''' 103 | model.eval() 104 | log_interval = args.get('log_interval', 10) 105 | # in order to avoid division by 0 106 | num_seen_utts = 1 107 | total_loss = 0.0 108 | num_total_batch = len(data_loader) 109 | with torch.no_grad(): 110 | for batch_idx, batch in enumerate(data_loader): 111 | key, feats, target, feats_lengths, target_lengths = batch 112 | feats = feats.to(device) 113 | target = target.to(device) 114 | feats_lengths = feats_lengths.to(device) 115 | target_lengths = target_lengths.to(device) 116 | num_utts = target_lengths.size(0) 117 | if num_utts == 0: 118 | continue 119 | loss, loss_att, loss_ctc = model(feats, feats_lengths, target, 120 | target_lengths) 121 | if torch.isfinite(loss): 122 | num_seen_utts += num_utts 123 | total_loss += loss.item() * num_utts 124 | if batch_idx % log_interval == 0: 125 | log_str = 'CV Batch {}/{} loss {:.6f} '.format( 126 | batch_idx, num_total_batch, loss.item()) 127 | if loss_att is not None: 128 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 129 | if loss_ctc is not None: 130 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 131 | log_str += 'history loss {:.6f}'.format(total_loss / 132 | num_seen_utts) 133 | logging.debug(log_str) 134 | 135 | return total_loss, num_seen_utts 136 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | from typeguard import check_argument_types 7 | 8 | 9 | class WarmupLR(_LRScheduler): 10 | """The WarmupLR scheduler 11 | 12 | This scheduler is almost same as NoamLR Scheduler except for following 13 | difference: 14 | 15 | NoamLR: 16 | lr = optimizer.lr * model_size ** -0.5 17 | * min(step ** -0.5, step * warmup_step ** -1.5) 18 | WarmupLR: 19 | lr = optimizer.lr * warmup_step ** 0.5 20 | * min(step ** -0.5, step * warmup_step ** -1.5) 21 | 22 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
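    For example, with the default warmup_steps=25000 the second term of the
    min() is the smaller one while step <= warmup_steps, so the lr ramps up
    linearly as optimizer.lr * step / warmup_steps, reaches optimizer.lr
    exactly at step == warmup_steps, and afterwards decays as
    optimizer.lr * (warmup_steps / step) ** 0.5.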
23 | 24 | """ 25 | 26 | def __init__( 27 | self, 28 | optimizer: torch.optim.Optimizer, 29 | warmup_steps: Union[int, float] = 25000, 30 | last_epoch: int = -1, 31 | ): 32 | assert check_argument_types() 33 | self.warmup_steps = warmup_steps 34 | 35 | # __init__() must be invoked before setting field 36 | # because step() is also invoked in __init__() 37 | super().__init__(optimizer, last_epoch) 38 | 39 | def __repr__(self): 40 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 41 | 42 | def get_lr(self): 43 | step_num = self.last_epoch + 1 44 | return [ 45 | lr 46 | * self.warmup_steps ** 0.5 47 | * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) 48 | for lr in self.base_lrs 49 | ] 50 | 51 | def set_step(self, step: int): 52 | self.last_epoch = step 53 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised Adam optimizer') 9 | return torch.optim.Adam(parameters, lr = lr, weight_decay = weight_decay); 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adamP.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | from adamp import AdamP 6 | 7 | def Optimizer(parameters, lr, weight_decay, **kwargs): 8 | print('Initialised AdamP optimizer') 9 | return AdamP(parameters, lr = lr, betas = (0.9, 0.999), weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adamW.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised AdamW optimizer') 9 | return torch.optim.AdamW(parameters, lr = lr, weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised SGD optimizer') 9 | 10 | return torch.optim.SGD(parameters, lr = lr, momentum = 0.9, weight_decay=weight_decay); 11 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/process_musan.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # splits musan clips to chunks of 5 seconds at 3 second interval 4 | # the first argument should be the parent directory of musan_v1 5 | import os 6 | import sys 7 | import glob 8 | from scipy.io import wavfile 9 | 10 | files = glob.glob('%s/musan/*/*/*.wav'%sys.argv[1]) 11 | 12 | audlen = 16000*5 13 | audstr = 16000*3 14 | 15 | for idx,file in enumerate(files): 16 | fs,aud = wavfile.read(file) 17 | writedir = os.path.splitext(file.replace('/musan/','/musan_split/'))[0] 18 | os.makedirs(writedir) 19 | for st in range(0,len(aud)-audlen,audstr): 20 | wavfile.write(writedir + '/%05d.wav'%(st/fs), fs, aud[st:st+audlen]) 21 | 22 | print(idx,file) -------------------------------------------------------------------------------- /stage1/VoxCeleb2/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.12.1+cu113 3 | torchaudio==0.12.1+cu113 4 | numpy 5 | scipy 6 | scikit-learn 7 | tqdm 8 | pyyaml 9 | soundfile -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/cosine_annealing_warmup_restarts.py: -------------------------------------------------------------------------------- 1 | # ref: https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup/blob/master/cosine_annealing_warmup/scheduler.py 2 | #! /usr/bin/python 3 | # -*- encoding: utf-8 -*- 4 | import math 5 | import torch 6 | from torch.optim.lr_scheduler import _LRScheduler 7 | 8 | class CosineAnnealingWarmupRestarts(_LRScheduler): 9 | def __init__(self, 10 | optimizer : torch.optim.Optimizer, 11 | first_cycle_steps : int, 12 | cycle_mult : float = 1., 13 | max_lr : float = 0.1, 14 | min_lr : float = 0.001, 15 | warmup_steps : int = 0, 16 | gamma : float = 1., 17 | last_epoch : int = -1 18 | ): 19 | assert warmup_steps < first_cycle_steps 20 | self.first_cycle_steps = first_cycle_steps # first cycle step size 21 | self.cycle_mult = cycle_mult # cycle steps magnification 22 | self.base_max_lr = max_lr # first max learning rate 23 | self.max_lr = max_lr # max learning rate in the current cycle 24 | self.min_lr = min_lr # min learning rate 25 | self.warmup_steps = warmup_steps # warmup step size 26 | self.gamma = gamma # decrease rate of max learning rate by cycle 27 | self.cur_cycle_steps = first_cycle_steps # first cycle step size 28 | self.cycle = 0 # cycle count 29 | self.step_in_cycle = last_epoch # step size of the current cycle 30 | super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch) 31 | self.init_lr() 32 | 33 | def init_lr(self): 34 | self.base_lrs = [] 35 | for param_group in self.optimizer.param_groups: 36 | param_group['lr'] = self.min_lr 37 | self.base_lrs.append(self.min_lr) 38 | 39 | def get_lr(self): 40 | if self.step_in_cycle == -1: 41 | return self.base_lrs 42 | elif self.step_in_cycle < self.warmup_steps: 43 | return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs] 44 | else: 45 | return [base_lr + (self.max_lr - base_lr) \ 46 | * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \ 47 | / (self.cur_cycle_steps - self.warmup_steps))) / 2 48 | for base_lr in self.base_lrs] 49 | 50 | def step(self, epoch=None): 51 | if epoch is None: 52 | epoch = self.last_epoch + 1 53 | self.step_in_cycle = self.step_in_cycle + 1 54 | if self.step_in_cycle >= self.cur_cycle_steps: 55 | self.cycle += 1 56 | 
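                # warm restart: carry the overflow steps into the next cycle,
                # whose non-warmup portion is scaled by cycle_mult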
self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps 57 | self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps 58 | else: 59 | if epoch >= self.first_cycle_steps: 60 | if self.cycle_mult == 1.: 61 | self.step_in_cycle = epoch % self.first_cycle_steps 62 | self.cycle = epoch // self.first_cycle_steps 63 | else: 64 | n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)) 65 | self.cycle = n 66 | self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)) 67 | self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n) 68 | else: 69 | self.cur_cycle_steps = self.first_cycle_steps 70 | self.step_in_cycle = epoch 71 | 72 | self.max_lr = self.base_max_lr * (self.gamma**self.cycle) 73 | self.last_epoch = math.floor(epoch) 74 | for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): 75 | param_group['lr'] = lr 76 | 77 | 78 | def Scheduler(optimizer, lr_t0, lr_tmul, lr_max, lr_min, lr_wstep, lr_gamma, **kwargs): 79 | sche_fn = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=lr_t0, cycle_mult=lr_tmul, max_lr=lr_max, min_lr=lr_min, warmup_steps=lr_wstep, gamma=lr_gamma) 80 | lr_step = 'epoch' 81 | print('Initialised CosineAnnealingWarmupRestarts scheduler') 82 | return sche_fn, lr_step 83 | #return sche_fn 84 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/cycliclr.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, lr_cyclic_min, lr_cyclic_max, lr_up_size, lr_down_size, lr_mode, **kwargs): 7 | 8 | lr_step = 'epoch' 9 | sche_fn = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=lr_cyclic_min, max_lr=lr_cyclic_max, step_size_up=lr_up_size, step_size_down=lr_down_size, mode=lr_mode, cycle_momentum=False) 10 | print('Initialised cyclic LR scheduler') 11 | return sche_fn, lr_step 12 | #return sche_fn 13 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/exponentiallr.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, **kwargs): 7 | 8 | sche_fn = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9) 9 | lr_step = 'epoch' 10 | print('Initialised exponential LR scheduler') 11 | return sche_fn, lr_step 12 | #return sche_fn 13 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/steplr.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, lr_decay_interval, max_epoch, lr_decay, **kwargs): 7 | 8 | sche_fn = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay_interval, gamma=lr_decay) 9 | #lr_step = 'epoch' 10 | lr_step = 'step' 11 | print('Initialised step LR scheduler') 12 | return sche_fn, lr_step 13 | #return sche_fn 14 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/tuneThreshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | import numpy 4 | from sklearn import metrics 5 | from operator import itemgetter 6 | 7 | def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None): 8 | 9 | fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) 10 | fnr = 1 - tpr 11 | 12 | tunedThreshold = []; 13 | if target_fr: 14 | for tfr in target_fr: 15 | idx = numpy.nanargmin(numpy.absolute((tfr - fnr))) 16 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]); 17 | 18 | for tfa in target_fa: 19 | idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1] 20 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]); 21 | 22 | idxE = numpy.nanargmin(numpy.absolute((fnr - fpr))) 23 | eer = max(fpr[idxE],fnr[idxE])*100 24 | 25 | return (tunedThreshold, eer, fpr, fnr); 26 | 27 | # Creates a list of false-negative rates, a list of false-positive rates 28 | # and a list of decision thresholds that give those error-rates. 29 | def ComputeErrorRates(scores, labels): 30 | 31 | # Sort the scores from smallest to largest, and also get the corresponding 32 | # indexes of the sorted scores. We will treat the sorted scores as the 33 | # thresholds at which the the error-rates are evaluated. 34 | sorted_indexes, thresholds = zip(*sorted( 35 | [(index, threshold) for index, threshold in enumerate(scores)], 36 | key=itemgetter(1))) 37 | sorted_labels = [] 38 | labels = [labels[i] for i in sorted_indexes] 39 | fnrs = [] 40 | fprs = [] 41 | 42 | # At the end of this loop, fnrs[i] is the number of errors made by 43 | # incorrectly rejecting scores less than thresholds[i]. And, fprs[i] 44 | # is the total number of times that we have correctly accepted scores 45 | # greater than thresholds[i]. 46 | for i in range(0, len(labels)): 47 | if i == 0: 48 | fnrs.append(labels[i]) 49 | fprs.append(1 - labels[i]) 50 | else: 51 | fnrs.append(fnrs[i-1] + labels[i]) 52 | fprs.append(fprs[i-1] + 1 - labels[i]) 53 | fnrs_norm = sum(labels) 54 | fprs_norm = len(labels) - fnrs_norm 55 | 56 | # Now divide by the total number of false negative errors to 57 | # obtain the false positive rates across all thresholds 58 | fnrs = [x / float(fnrs_norm) for x in fnrs] 59 | 60 | # Divide by the total number of corret positives to get the 61 | # true positive rate. Subtract these quantities from 1 to 62 | # get the false positive rates. 63 | fprs = [1 - x / float(fprs_norm) for x in fprs] 64 | return fnrs, fprs, thresholds 65 | 66 | # Computes the minimum of the detection cost function. The comments refer to 67 | # equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan. 68 | def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa): 69 | min_c_det = float("inf") 70 | min_c_det_threshold = thresholds[0] 71 | for i in range(0, len(fnrs)): 72 | # See Equation (2). it is a weighted sum of false negative 73 | # and false positive errors. 
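        # i.e. C_det(t) = C_miss * P_miss(t) * P_target + C_fa * P_fa(t) * (1 - P_target),
        # with fnrs[i] = P_miss(t) and fprs[i] = P_fa(t) at threshold t = thresholds[i]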
74 | c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target) 75 | if c_det < min_c_det: 76 | min_c_det = c_det 77 | min_c_det_threshold = thresholds[i] 78 | # See Equations (3) and (4). Now we normalize the cost. 79 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 80 | min_dcf = min_c_det / c_def 81 | return min_dcf, min_c_det_threshold -------------------------------------------------------------------------------- /stage2/README.md: -------------------------------------------------------------------------------- 1 | # Stage 2 2 | 3 | Copy-synthesis training recipe and data generation script will be released soon. 4 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/README.md: -------------------------------------------------------------------------------- 1 | # Stage 3 2 | 3 | This repository is developed based on the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) and [ska-tdnn](https://github.com/msh9184/ska-tdnn). 4 | 5 | ## Dependencies 6 | If you use the Anaconda virtual environment, 7 | ``` 8 | conda create -n sasv python=3.9 cudatoolkit=11.3 9 | conda activate sasv 10 | ``` 11 | Install all dependency packages, 12 | ``` 13 | pip3 install -r requirements.txt 14 | ``` 15 | 16 | ## Models 17 | Three models are included in this repository. You can select the model by the `--model` option: 18 | ``` 19 | ECAPA_TDNN [1] 20 | MFA_Conformer [2] 21 | SKA_TDNN [3] 22 | ``` 23 | 24 | [1] B. Desplanques, J. Thienpondt, and K. Demuynck, "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification," in *Proc. INTERSPEECH*, 2020, pp. 3707-3711. 25 | 26 | [2] Y. Zhang, Z. Lv, H. Wu, S. Zhang, P. Hu, Z. Wu, H. Lee, and H. Meng., “MFA-Conformer: Multi-scale Feature Aggregation Conformer for Automatic Speaker Verification,” in *Proc. INTERSPEECH*, 2022. 27 | 28 | [3] S. H. Mun, J. Jung, M. H. Han, and N. S. Kim, "Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification," in *Proc. IEEE SLT*, 2022. 
29 | 30 | 31 | ## Training 32 | Training example 1: `SKA_TDNN` from scratch using `ASVspoof2019 LA train+dev`, 33 | 34 | ``` 35 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 36 | --max_frames 500 \ 37 | --num_spk 40 \ 38 | --num_utt 2 \ 39 | --batch_size 160 \ 40 | --trainfunc sasv_e2e_v1 \ 41 | --optimizer adamW \ 42 | --scheduler cosine_annealing_warmup_restarts \ 43 | --lr_t0 8 \ 44 | --lr_tmul 1.0 \ 45 | --lr_max 1e-4 \ 46 | --lr_min 0 \ 47 | --lr_wstep 0 \ 48 | --lr_gamma 0.8 \ 49 | --margin 0.2 \ 50 | --scale 30 \ 51 | --num_class 41 \ 52 | --save_path ./save/sasv_baseline_stage3 \ 53 | --train_list ./protocols/ASVspoof2019.LA.cm.train_dev.trn.txt \ 54 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 55 | --train_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA \ 56 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 57 | --spk_meta_train ./spk_meta/spk_meta_trn.pk 58 | --spk_meta_eval ./spk_meta/spk_meta_eval.pk 59 | --musan_path /path/to/dataset/MUSAN/musan_split \ 60 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 61 | --model SKA_TDNN 62 | ``` 63 | 64 | Training example 2: `MFA_Conformer` with pre-trained weight using `ASVspoof2019 LA train`, 65 | ``` 66 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 67 | --max_frames 500 \ 68 | --num_spk 20 \ 69 | --num_utt 2 \ 70 | --batch_size 80 \ 71 | --trainfunc sasv_e2e_v1 \ 72 | --optimizer adamW \ 73 | --scheduler cosine_annealing_warmup_restarts \ 74 | --lr_t0 8 \ 75 | --lr_tmul 1.0 \ 76 | --lr_max 1e-4 \ 77 | --lr_min 0 \ 78 | --lr_wstep 0 \ 79 | --lr_gamma 0.8 \ 80 | --margin 0.2 \ 81 | --scale 30 \ 82 | --num_class 21 \ 83 | --save_path ./save/sasv_baseline_stage3 \ 84 | --train_list ./protocols/ASVspoof2019.LA.cm.train_dev.trn.txt \ 85 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 86 | --train_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA \ 87 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 88 | --spk_meta_train ./spk_meta/spk_meta_trn.pk 89 | --spk_meta_eval ./spk_meta/spk_meta_eval.pk 90 | --musan_path /path/to/dataset/MUSAN/musan_split \ 91 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 92 | --model MFA_Conformer \ 93 | --initial_model /path/to/your_model/pretrained_weight.model 94 | ``` 95 | [In this repository](https://github.com/sasv-challenge/ASVSpoof5-SASVBaseline), you can download several pre-trained weights used in [this paper](https://arxiv.org/pdf/2305.19051.pdf) and fine-tune them using the above command. 96 | 97 | ## Evaluation 98 | Evaluation example: `SKA_TDNN` using `SASV protocol` on the ASVspoof2019 LA eval, 99 | ``` 100 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 101 | --eval \ 102 | --eval_frames 0 \ 103 | --num_eval 1 \ 104 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 105 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 106 | --model SKA_TDNN \ 107 | --initial_model /path/to/your_model/pretrained_weight.model 108 | ``` 109 | 110 | ## Citation 111 | If you utilize this repository, please cite the following paper, 112 | ``` 113 | @inproceedings{chung2020in, 114 | title={In defence of metric learning for speaker recognition}, 115 | author={Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang}, 116 | booktitle={Proc. 
Interspeech}, 117 | year={2020} 118 | } 119 | ``` 120 | 121 | ``` 122 | @inproceedings{jung2022pushing, 123 | title={Pushing the limits of raw waveform speaker recognition}, 124 | author={Jung, Jee-weon and Kim, You Jin and Heo, Hee-Soo and Lee, Bong-Jin and Kwon, Youngki and Chung, Joon Son}, 125 | booktitle={Proc. Interspeech}, 126 | year={2022} 127 | } 128 | ``` 129 | 130 | ``` 131 | @inproceedings{mun2022frequency, 132 | title={Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification}, 133 | author={Mun, Sung Hwan and Jung, Jee-weon and Han, Min Hyun and Kim, Nam Soo}, 134 | booktitle={Proc. IEEE SLT}, 135 | year={2022} 136 | } 137 | ``` 138 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy, math 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, num_out, num_class, margin=0.3, scale=15, easy_margin=False, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = num_out 20 | self.weight = torch.nn.Parameter(torch.FloatTensor(num_class, num_out), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.weight, gain=1) 23 | 24 | self.easy_margin = easy_margin 25 | self.cos_m = math.cos(self.m) 26 | self.sin_m = math.sin(self.m) 27 | 28 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 29 | self.th = math.cos(math.pi - self.m) 30 | self.mm = math.sin(math.pi - self.m) * self.m 31 | 32 | print('Initialized AAMSoftmax margin %.3f scale %.3f'%(self.m,self.s)) 33 | 34 | def forward(self, x, label=None): 35 | 36 | assert x.size()[0] == label.size()[0] 37 | assert x.size()[1] == self.in_feats 38 | 39 | # cos(theta) 40 | cosine = F.linear(F.normalize(x), F.normalize(self.weight)) 41 | # cos(theta + m) 42 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 43 | phi = cosine * self.cos_m - sine * self.sin_m 44 | 45 | if self.easy_margin: 46 | phi = torch.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 49 | 50 | #one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 51 | one_hot = torch.zeros_like(cosine) 52 | one_hot.scatter_(1, label.view(-1, 1), 1) 53 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 54 | output = output * self.s 55 | 56 | loss = self.ce(output, label) 57 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 58 | return loss, prec1 59 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/angleproto_sasv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | 12 | def __init__(self, init_w1=10.0, init_b1=-5.0, init_w2=10.0, init_b2=-5.0, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | self.w1 = nn.Parameter(torch.tensor(init_w1)) 17 | self.b1 = nn.Parameter(torch.tensor(init_b1)) 18 | self.criterion = torch.nn.CrossEntropyLoss() 19 | print('Initialized AngleProto') 20 | 21 | def forward(self, x, label=None, num_bna=0): 22 | assert x.size()[1] >= 2 23 | 24 | out_anchor = x[:, 1, :] 25 | out_positive = x[:, 0, :][ :num_bna] 26 | stepsize = out_positive.size()[0] 27 | 28 | cos_sim_matrix1 = F.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2)) 29 | torch.clamp(self.w1, 1e-6) 30 | cos_sim_matrix1 = cos_sim_matrix1 * self.w1 + self.b1 31 | 32 | out_anchor = x[:, 0, :] 33 | out_positive = x[:, 1, :][ :num_bna] 34 | 35 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 36 | nloss1 = self.criterion(cos_sim_matrix1, label) 37 | nloss = nloss1 38 | 39 | prec1 = accuracy(cos_sim_matrix1.detach(), label.detach(), topk=(1,))[0] 40 | prec = prec1 41 | 42 | return nloss, prec 43 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/sasv_e2e_v1.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.aamsoftmax as aamsoftmax 7 | import loss.angleproto_sasv as angleproto_sasv 8 | 9 | class LossFunction(nn.Module): 10 | def __init__(self, **kwargs): 11 | super(LossFunction, self).__init__() 12 | self.test_normalize = True 13 | self.aamsoftmax = aamsoftmax.LossFunction(**kwargs) 14 | self.angleproto_sasv = angleproto_sasv.LossFunction(**kwargs) 15 | self.num_class = kwargs.get('num_class') 16 | print('Initialized SASV End-to-end v1 Loss Function') 17 | 18 | def forward(self, x, label=None): 19 | assert x.size()[1] == 2 20 | nlossS, prec = self.aamsoftmax(x.reshape(-1, x.size()[-1]), label.repeat_interleave(2)) 21 | 22 | idx_bna = torch.where(label != self.num_class-1) 23 | idx_spf = torch.where(label == self.num_class-1) 24 | x1 = x[idx_bna] 25 | x2 = x[idx_spf] 26 | x = torch.cat((x1, x2)) 27 | nlossM, _ = self.angleproto_sasv(x, None, len(idx_bna[0])) 28 | 29 | return nlossS + nlossM, prec 30 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import numpy 4 | import torch 5 | from scipy.interpolate import interp1d 6 | from scipy.optimize import brentq 7 | from sklearn.metrics import roc_curve 8 | 9 | 10 | def get_all_EERs( 11 | preds: Union[torch.Tensor, List, numpy.ndarray], keys: List 12 | ) -> List[float]: 13 | """ 14 | Calculate all three EERs used in the SASV Challenge 2022. 
15 | preds and keys should be pre-calculated using dev or eval protocol in 16 | either 'protocols/ASVspoof2019.LA.asv.dev.gi.trl.txt' or 17 | 'protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt' 18 | 19 | :param preds: list of scores in tensor 20 | :param keys: list of keys where each element should be one of 21 | ['target', 'nontarget', 'spoof'] 22 | """ 23 | sasv_labels, sv_labels, spf_labels = [], [], [] 24 | sv_preds, spf_preds = [], [] 25 | 26 | for pred, key in zip(preds, keys): 27 | if key == "target": 28 | sasv_labels.append(1) 29 | sv_labels.append(1) 30 | spf_labels.append(1) 31 | sv_preds.append(pred) 32 | spf_preds.append(pred) 33 | 34 | elif key == "nontarget": 35 | sasv_labels.append(0) 36 | sv_labels.append(0) 37 | sv_preds.append(pred) 38 | 39 | elif key == "spoof": 40 | sasv_labels.append(0) 41 | spf_labels.append(0) 42 | spf_preds.append(pred) 43 | else: 44 | raise ValueError( 45 | f"should be one of 'target', 'nontarget', 'spoof', got:{key}" 46 | ) 47 | 48 | fpr, tpr, _ = roc_curve(sasv_labels, preds, pos_label=1) 49 | sasv_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 50 | 51 | fpr, tpr, _ = roc_curve(sv_labels, sv_preds, pos_label=1) 52 | sv_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 53 | 54 | fpr, tpr, _ = roc_curve(spf_labels, spf_preds, pos_label=1) 55 | spf_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 56 | 57 | return sasv_eer*100, sv_eer*100, spf_eer*100 58 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/ECAPA_TDNN.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | ## Here, log_input forces alternative mfcc implementation with pre-emphasis instead of actual log mfcc 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torchaudio 11 | import pdb 12 | from utils import PreEmphasis 13 | 14 | class SEModule(nn.Module): 15 | def __init__(self, channels, bottleneck=128): 16 | super(SEModule, self).__init__() 17 | self.se = nn.Sequential( 18 | nn.AdaptiveAvgPool1d(1), 19 | nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0), 20 | nn.ReLU(), 21 | #nn.BatchNorm1d(bottleneck), 22 | nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0), 23 | nn.Sigmoid(), 24 | ) 25 | 26 | def forward(self, input): 27 | x = self.se(input) 28 | return input * x 29 | 30 | class Bottle2neck(nn.Module): 31 | 32 | def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8): 33 | super(Bottle2neck, self).__init__() 34 | width = int(math.floor(planes / scale)) 35 | self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1) 36 | self.bn1 = nn.BatchNorm1d(width*scale) 37 | self.nums = scale -1 38 | convs = [] 39 | bns = [] 40 | num_pad = math.floor(kernel_size/2)*dilation 41 | for i in range(self.nums): 42 | convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad)) 43 | bns.append(nn.BatchNorm1d(width)) 44 | self.convs = nn.ModuleList(convs) 45 | self.bns = nn.ModuleList(bns) 46 | self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1) 47 | self.bn3 = nn.BatchNorm1d(planes) 48 | self.relu = nn.ReLU() 49 | self.width = width 50 | self.se = SEModule(planes) 51 | 52 | def forward(self, x): 53 | residual = x 54 | out = self.conv1(x) 55 | out = self.relu(out) 56 | out = self.bn1(out) 57 | 58 | spx = torch.split(out, self.width, 1) 59 | for i in 
range(self.nums): 60 | if i==0: 61 | sp = spx[i] 62 | else: 63 | sp = sp + spx[i] 64 | sp = self.convs[i](sp) 65 | sp = self.relu(sp) 66 | sp = self.bns[i](sp) 67 | if i==0: 68 | out = sp 69 | else: 70 | out = torch.cat((out, sp), 1) 71 | out = torch.cat((out, spx[self.nums]),1) 72 | 73 | out = self.conv3(out) 74 | out = self.relu(out) 75 | out = self.bn3(out) 76 | 77 | out = self.se(out) 78 | out += residual 79 | return out 80 | 81 | class FbankAug(nn.Module): 82 | 83 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 84 | self.time_mask_width = time_mask_width 85 | self.freq_mask_width = freq_mask_width 86 | super().__init__() 87 | 88 | def mask_along_axis(self, x, dim): 89 | original_size = x.shape 90 | batch, fea, time = x.shape 91 | if dim == 1: 92 | D = fea 93 | width_range = self.freq_mask_width 94 | else: 95 | D = time 96 | width_range = self.time_mask_width 97 | 98 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 99 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 100 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 101 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 102 | mask = mask.any(dim=1) 103 | 104 | if dim == 1: 105 | mask = mask.unsqueeze(2) 106 | else: 107 | mask = mask.unsqueeze(1) 108 | 109 | x = x.masked_fill_(mask, 0.0) 110 | return x.view(*original_size) 111 | 112 | def forward(self, x): 113 | x = self.mask_along_axis(x, dim=2) 114 | x = self.mask_along_axis(x, dim=1) 115 | return x 116 | 117 | class ECAPA_TDNN(nn.Module): 118 | def __init__(self, block, C, model_scale, log_input=True, num_mels=80, num_out=192, **kwargs): 119 | self.log_input = log_input 120 | super(ECAPA_TDNN, self).__init__() 121 | self.scale = model_scale 122 | self.conv1 = nn.Conv1d(num_mels, C, kernel_size=5, stride=1, padding=2) 123 | self.relu = nn.ReLU() 124 | self.bn1 = nn.BatchNorm1d(C) 125 | self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale) 126 | self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale) 127 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale) 128 | self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1) 129 | self.attention = nn.Sequential( 130 | nn.Conv1d(4608, 256, kernel_size=1), 131 | nn.ReLU(), 132 | nn.BatchNorm1d(256), 133 | nn.Tanh(), 134 | nn.Conv1d(256, 1536, kernel_size=1), 135 | nn.Softmax(dim=2), 136 | ) 137 | self.torchfbank = torch.nn.Sequential( 138 | PreEmphasis(), 139 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 140 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=num_mels), 141 | ) 142 | self.specaug = FbankAug() 143 | self.bn5 = nn.BatchNorm1d(3072) 144 | self.fc6 = nn.Linear(3072, num_out) 145 | self.bn6 = nn.BatchNorm1d(num_out) 146 | 147 | def forward(self, x, aug): 148 | with torch.no_grad(): 149 | with torch.cuda.amp.autocast(enabled=False): 150 | x = self.torchfbank(x)+1e-6 151 | if self.log_input: 152 | x = x.log() 153 | x = x - torch.mean(x, dim=-1, keepdim=True) 154 | if aug == True: 155 | x = self.specaug(x) 156 | x = self.conv1(x) 157 | x = self.relu(x) 158 | x = self.bn1(x) 159 | x1 = self.layer1(x) 160 | x2 = self.layer2(x+x1) 161 | x3 = self.layer3(x+x1+x2) 162 | x = self.layer4(torch.cat((x1,x2,x3),dim=1)) 163 | x = self.relu(x) 164 | t = x.size()[-1] 165 | global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), 
torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1) 166 | w = self.attention(global_x) 167 | mu = torch.sum(x * w, dim=2) 168 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 169 | x = torch.cat((mu,sg),1) 170 | x = self.bn5(x) 171 | x = self.fc6(x) 172 | x = self.bn6(x) 173 | return x 174 | 175 | def MainModel(eca_c=1024, eca_s=8, log_input=True, num_mels=80, num_out=192, **kwargs): 176 | model = ECAPA_TDNN(block=Bottle2neck, C=eca_c, model_scale=eca_s, log_input=log_input, num_mels=num_mels, num_out=num_out, **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/MFA_Conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchaudio 4 | from torch import Tensor 5 | from typing import Tuple 6 | from utils import PreEmphasis 7 | from .specaugment import SpecAugment 8 | from .wenet.transformer.encoder_cat import ConformerEncoder 9 | 10 | class Conformer(nn.Module): 11 | def __init__(self, num_mels=80, num_blocks=6, output_size=256, embedding_dim=192, input_layer="conv2d2", pos_enc_layer_type="rel_pos"): 12 | super(Conformer, self).__init__() 13 | print("input_layer: {}".format(input_layer)) 14 | print("pos_enc_layer_type: {}".format(pos_enc_layer_type)) 15 | self.conformer = ConformerEncoder(input_size=num_mels, num_blocks=num_blocks, output_size=output_size, input_layer=input_layer, pos_enc_layer_type=pos_enc_layer_type, ) 16 | self.bn = nn.BatchNorm1d(output_size*num_blocks*2) 17 | self.fc = nn.Linear(output_size*num_blocks*2, embedding_dim) 18 | 19 | self.specaug = SpecAugment() 20 | self.torchfbank = torch.nn.Sequential( 21 | PreEmphasis(), 22 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 23 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80), 24 | ) 25 | output_dim = output_size*num_blocks 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(output_dim*3, 256, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(256), 30 | nn.Tanh(), 31 | nn.Conv1d(256, output_dim, kernel_size=1), 32 | nn.Softmax(dim=2), 33 | ) 34 | 35 | def forward(self, x: Tensor, aug=False) -> Tuple[Tensor, bool]: 36 | 37 | with torch.no_grad(): 38 | with torch.cuda.amp.autocast(enabled=False): 39 | x = self.torchfbank(x)+1e-6 40 | x = x.log() 41 | x = x - torch.mean(x, dim=-1, keepdim=True) 42 | if aug == True: 43 | x = self.specaug(x) 44 | x = x.transpose(1,2) 45 | lens = torch.ones(x.shape[0]).to(x.device) 46 | lens = torch.round(lens*x.shape[1]).int() 47 | x, masks = self.conformer(x, lens) 48 | x = x.transpose(1,2) 49 | 50 | # Context dependent ASP 51 | t = x.size()[-1] 52 | global_x = torch.cat((x,torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1) 53 | w = self.attention(global_x) 54 | mu = torch.sum(x * w, dim=2) 55 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 56 | x = torch.cat((mu, sg), dim=1) 57 | 58 | # BN -> FC: embedding 59 | x = self.bn(x) 60 | x = self.fc(x) 61 | 62 | return x 63 | 64 | def MainModel(num_mels=80, num_out=192, **kwargs): 65 | model = Conformer(num_mels=num_mels, embedding_dim=num_out, input_layer="conv2d2") 66 | return model 67 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/specaugment.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class SpecAugment(nn.Module): 5 | 6 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 7 | self.time_mask_width = time_mask_width 8 | self.freq_mask_width = freq_mask_width 9 | super(SpecAugment, self).__init__() 10 | 11 | def mask_along_axis(self, x, dim): 12 | original_size = x.shape 13 | batch, fea, time = x.shape 14 | if dim == 1: 15 | D = fea 16 | width_range = self.freq_mask_width 17 | else: 18 | D = time 19 | width_range = self.time_mask_width 20 | 21 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 22 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 23 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 24 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 25 | mask = mask.any(dim=1) 26 | 27 | if dim == 1: 28 | mask = mask.unsqueeze(2) 29 | else: 30 | mask = mask.unsqueeze(1) 31 | 32 | x = x.masked_fill_(mask, 0.0) 33 | return x.view(*original_size) 34 | 35 | def forward(self, x): 36 | x = self.mask_along_axis(x, dim=2) 37 | x = self.mask_along_axis(x, dim=1) 38 | return x 39 | 40 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/.train.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/bin/.train.py.swp -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: di.wu@mobvoi.com (DI WU) 3 | import os 4 | import argparse 5 | import glob 6 | 7 | import yaml 8 | import numpy as np 9 | import torch 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='average model') 13 | parser.add_argument('--dst_model', required=True, help='averaged model') 14 | parser.add_argument('--src_path', 15 | required=True, 16 | help='src model path for average') 17 | parser.add_argument('--val_best', 18 | action="store_true", 19 | help='averaged model') 20 | parser.add_argument('--num', 21 | default=5, 22 | type=int, 23 | help='nums for averaged model') 24 | parser.add_argument('--min_epoch', 25 | default=0, 26 | type=int, 27 | help='min epoch used for averaging model') 28 | parser.add_argument('--max_epoch', 29 | default=65536, # Big enough 30 | type=int, 31 | help='max epoch used for averaging model') 32 | 33 | args = parser.parse_args() 34 | print(args) 35 | checkpoints = [] 36 | val_scores = [] 37 | if args.val_best: 38 | yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) 39 | for y in yamls: 40 | with open(y, 'r') as f: 41 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 42 | loss = dic_yaml['cv_loss'] 43 | epoch = dic_yaml['epoch'] 44 | if epoch >= args.min_epoch and epoch <= args.max_epoch: 45 | val_scores += [[epoch, loss]] 46 | val_scores = np.array(val_scores) 47 | sort_idx = np.argsort(val_scores[:, -1]) 48 | sorted_val_scores = val_scores[sort_idx][::1] 49 | print("best val scores = " + str(sorted_val_scores[:args.num, 1])) 50 | print("selected epochs = " + 51 | str(sorted_val_scores[:args.num, 0].astype(np.int64))) 52 | path_list = [ 53 | args.src_path + '/{}.pt'.format(int(epoch)) 54 | for epoch in sorted_val_scores[:args.num, 0] 55 | ] 56 | else: 57 | path_list = glob.glob('{}/[!avg][!final]*.pt'.format(args.src_path)) 58 | path_list = sorted(path_list, key=os.path.getmtime) 59 | path_list = path_list[-args.num:] 60 | print(path_list) 61 | avg = None 62 | num = args.num 63 | assert num == len(path_list) 64 | for path in path_list: 65 | print('Processing {}'.format(path)) 66 | states = torch.load(path, map_location=torch.device('cpu')) 67 | if avg is None: 68 | avg = states 69 | else: 70 | for k in avg.keys(): 71 | avg[k] += states[k] 72 | # average 73 | for k in avg.keys(): 74 | if avg[k] is not None: 75 | # pytorch 1.6 use true_divide instead of /= 76 | avg[k] = torch.true_divide(avg[k], num) 77 | print('Saving to {}'.format(args.dst_model)) 78 | torch.save(avg, args.dst_model) 79 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import os 19 | 20 | import torch 21 | import yaml 22 | 23 | from wenet.transformer.asr_model import init_asr_model 24 | from wenet.utils.checkpoint import load_checkpoint 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='export your script model') 28 | parser.add_argument('--config', required=True, help='config file') 29 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 30 | parser.add_argument('--output_file', required=True, help='output file') 31 | parser.add_argument('--output_quant_file', 32 | default=None, 33 | help='output quantized model file') 34 | args = parser.parse_args() 35 | # No need gpu for model export 36 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 37 | 38 | with open(args.config, 'r') as fin: 39 | configs = yaml.load(fin, Loader=yaml.FullLoader) 40 | model = init_asr_model(configs) 41 | print(model) 42 | 43 | load_checkpoint(model, args.checkpoint) 44 | # Export jit torch script model 45 | 46 | script_model = torch.jit.script(model) 47 | script_model.save(args.output_file) 48 | print('Export model successfully, see {}'.format(args.output_file)) 49 | 50 | # Export quantized jit torch script model 51 | if args.output_quant_file: 52 | quantized_model = torch.quantization.quantize_dynamic( 53 | model, {torch.nn.Linear}, dtype=torch.qint8 54 | ) 55 | print(quantized_model) 56 | script_quant_model = torch.jit.script(quantized_model) 57 | script_quant_model.save(args.output_quant_file) 58 | print('Export quantized model successfully, ' 59 | 'see {}'.format(args.output_quant_file)) 60 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc -------------------------------------------------------------------------------- 
/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc -------------------------------------------------------------------------------- 
/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc 
-------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | 19 | class GlobalCMVN(torch.nn.Module): 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """ConvolutionModule definition.""" 7 | 8 | from typing import Optional, Tuple 9 | 10 | import torch 11 | from torch import nn 12 | from typeguard import check_argument_types 13 | 14 | 15 | class ConvolutionModule(nn.Module): 16 | """ConvolutionModule in Conformer model.""" 17 | def __init__(self, 18 | channels: int, 19 | kernel_size: int = 15, 20 | activation: nn.Module = nn.ReLU(), 21 | norm: str = "batch_norm", 22 | causal: bool = False, 23 | bias: bool = True): 24 | """Construct an ConvolutionModule object. 25 | Args: 26 | channels (int): The number of channels of conv layers. 27 | kernel_size (int): Kernel size of conv layers. 28 | causal (int): Whether use causal convolution or not 29 | """ 30 | assert check_argument_types() 31 | super().__init__() 32 | 33 | self.pointwise_conv1 = nn.Conv1d( 34 | channels, 35 | 2 * channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0, 39 | bias=bias, 40 | ) 41 | # self.lorder is used to distinguish if it's a causal convolution, 42 | # if self.lorder > 0: it's a causal convolution, the input will be 43 | # padded with self.lorder frames on the left in forward. 44 | # else: it's a symmetrical convolution 45 | if causal: 46 | padding = 0 47 | self.lorder = kernel_size - 1 48 | else: 49 | # kernel_size should be an odd number for none causal convolution 50 | assert (kernel_size - 1) % 2 == 0 51 | padding = (kernel_size - 1) // 2 52 | self.lorder = 0 53 | self.depthwise_conv = nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | stride=1, 58 | padding=padding, 59 | groups=channels, 60 | bias=bias, 61 | ) 62 | 63 | assert norm in ['batch_norm', 'layer_norm'] 64 | if norm == "batch_norm": 65 | self.use_layer_norm = False 66 | self.norm = nn.BatchNorm1d(channels) 67 | else: 68 | self.use_layer_norm = True 69 | self.norm = nn.LayerNorm(channels) 70 | 71 | self.pointwise_conv2 = nn.Conv1d( 72 | channels, 73 | channels, 74 | kernel_size=1, 75 | stride=1, 76 | padding=0, 77 | bias=bias, 78 | ) 79 | self.activation = activation 80 | 81 | def forward( 82 | self, 83 | x: torch.Tensor, 84 | mask_pad: Optional[torch.Tensor] = None, 85 | cache: Optional[torch.Tensor] = None, 86 | ) -> Tuple[torch.Tensor, torch.Tensor]: 87 | """Compute convolution module. 88 | Args: 89 | x (torch.Tensor): Input tensor (#batch, time, channels). 
90 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time) 91 | cache (torch.Tensor): left context cache, it is only 92 | used in causal convolution 93 | Returns: 94 | torch.Tensor: Output tensor (#batch, time, channels). 95 | """ 96 | # exchange the temporal dimension and the feature dimension 97 | x = x.transpose(1, 2) # (#batch, channels, time) 98 | 99 | # mask batch padding 100 | if mask_pad is not None: 101 | x.masked_fill_(~mask_pad, 0.0) 102 | 103 | if self.lorder > 0: 104 | if cache is None: 105 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 106 | else: 107 | assert cache.size(0) == x.size(0) 108 | assert cache.size(1) == x.size(1) 109 | x = torch.cat((cache, x), dim=2) 110 | assert (x.size(2) > self.lorder) 111 | new_cache = x[:, :, -self.lorder:] 112 | else: 113 | # It's better we just return None if no cache is requried, 114 | # However, for JIT export, here we just fake one tensor instead of 115 | # None. 116 | new_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) 117 | 118 | # GLU mechanism 119 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 120 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 121 | 122 | # 1D Depthwise Conv 123 | x = self.depthwise_conv(x) 124 | if self.use_layer_norm: 125 | x = x.transpose(1, 2) 126 | x = self.activation(self.norm(x)) 127 | if self.use_layer_norm: 128 | x = x.transpose(1, 2) 129 | x = self.pointwise_conv2(x) 130 | # mask batch padding 131 | if mask_pad is not None: 132 | x.masked_fill_(~mask_pad, 0.0) 133 | 134 | return x.transpose(1, 2), new_cache 135 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | 6 | class CTC(torch.nn.Module): 7 | """CTC module""" 8 | def __init__( 9 | self, 10 | odim: int, 11 | encoder_output_size: int, 12 | dropout_rate: float = 0.0, 13 | reduce: bool = True, 14 | ): 15 | """ Construct CTC module 16 | Args: 17 | odim: dimension of outputs 18 | encoder_output_size: number of encoder projection units 19 | dropout_rate: dropout rate (0.0 ~ 1.0) 20 | reduce: reduce the CTC loss into a scalar 21 | """ 22 | assert check_argument_types() 23 | super().__init__() 24 | eprojs = encoder_output_size 25 | self.dropout_rate = dropout_rate 26 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 27 | 28 | reduction_type = "sum" if reduce else "none" 29 | self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) 30 | 31 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 32 | ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: 33 | """Calculate CTC loss. 
34 | 35 | Args: 36 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 37 | hlens: batch of lengths of hidden state sequences (B) 38 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 39 | ys_lens: batch of lengths of character sequence (B) 40 | """ 41 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 42 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 43 | # ys_hat: (B, L, D) -> (L, B, D) 44 | ys_hat = ys_hat.transpose(0, 1) 45 | ys_hat = ys_hat.log_softmax(2) 46 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 47 | # Batch-size average 48 | loss = loss / ys_hat.size(1) 49 | return loss 50 | 51 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 52 | """log_softmax of frame activations 53 | 54 | Args: 55 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 56 | Returns: 57 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 58 | """ 59 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 60 | 61 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 62 | """argmax of frame activations 63 | 64 | Args: 65 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 66 | Returns: 67 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 68 | """ 69 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 70 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Decoder self-attention layer definition.""" 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class DecoderLayer(nn.Module): 14 | """Single decoder layer module. 15 | 16 | Args: 17 | size (int): Input dimension. 18 | self_attn (torch.nn.Module): Self-attention module instance. 19 | `MultiHeadedAttention` instance can be used as the argument. 20 | src_attn (torch.nn.Module): Inter-attention module instance. 21 | `MultiHeadedAttention` instance can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward` instance can be used as the argument. 24 | dropout_rate (float): Dropout rate. 25 | normalize_before (bool): 26 | True: use layer_norm before each sub-block. 27 | False: to use layer_norm after each sub-block. 28 | concat_after (bool): Whether to concat attention layer's inpu 29 | and output. 
30 | True: x -> x + linear(concat(x, att(x))) 31 | False: x -> x + att(x) 32 | """ 33 | def __init__( 34 | self, 35 | size: int, 36 | self_attn: nn.Module, 37 | src_attn: nn.Module, 38 | feed_forward: nn.Module, 39 | dropout_rate: float, 40 | normalize_before: bool = True, 41 | concat_after: bool = False, 42 | ): 43 | """Construct an DecoderLayer object.""" 44 | super().__init__() 45 | self.size = size 46 | self.self_attn = self_attn 47 | self.src_attn = src_attn 48 | self.feed_forward = feed_forward 49 | self.norm1 = nn.LayerNorm(size, eps=1e-12) 50 | self.norm2 = nn.LayerNorm(size, eps=1e-12) 51 | self.norm3 = nn.LayerNorm(size, eps=1e-12) 52 | self.dropout = nn.Dropout(dropout_rate) 53 | self.normalize_before = normalize_before 54 | self.concat_after = concat_after 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward( 59 | self, 60 | tgt: torch.Tensor, 61 | tgt_mask: torch.Tensor, 62 | memory: torch.Tensor, 63 | memory_mask: torch.Tensor, 64 | cache: Optional[torch.Tensor] = None 65 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 66 | """Compute decoded features. 67 | 68 | Args: 69 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 70 | tgt_mask (torch.Tensor): Mask for input tensor 71 | (#batch, maxlen_out). 72 | memory (torch.Tensor): Encoded memory 73 | (#batch, maxlen_in, size). 74 | memory_mask (torch.Tensor): Encoded memory mask 75 | (#batch, maxlen_in). 76 | cache (torch.Tensor): cached tensors. 77 | (#batch, maxlen_out - 1, size). 78 | 79 | Returns: 80 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 81 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 82 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 83 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
84 | 85 | """ 86 | residual = tgt 87 | if self.normalize_before: 88 | tgt = self.norm1(tgt) 89 | 90 | if cache is None: 91 | tgt_q = tgt 92 | tgt_q_mask = tgt_mask 93 | else: 94 | # compute only the last frame query keeping dim: max_time_out -> 1 95 | assert cache.shape == ( 96 | tgt.shape[0], 97 | tgt.shape[1] - 1, 98 | self.size, 99 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 100 | tgt_q = tgt[:, -1:, :] 101 | residual = residual[:, -1:, :] 102 | tgt_q_mask = tgt_mask[:, -1:, :] 103 | 104 | if self.concat_after: 105 | tgt_concat = torch.cat( 106 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) 107 | x = residual + self.concat_linear1(tgt_concat) 108 | else: 109 | x = residual + self.dropout( 110 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 111 | if not self.normalize_before: 112 | x = self.norm1(x) 113 | 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | if self.concat_after: 118 | x_concat = torch.cat( 119 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) 120 | x = residual + self.concat_linear2(x_concat) 121 | else: 122 | x = residual + self.dropout( 123 | self.src_attn(x, memory, memory, memory_mask)) 124 | if not self.normalize_before: 125 | x = self.norm2(x) 126 | 127 | residual = x 128 | if self.normalize_before: 129 | x = self.norm3(x) 130 | x = residual + self.dropout(self.feed_forward(x)) 131 | if not self.normalize_before: 132 | x = self.norm3(x) 133 | 134 | if cache is not None: 135 | x = torch.cat([cache, x], dim=1) 136 | 137 | return x, tgt_mask, memory, memory_mask 138 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """Positonal Encoding Module.""" 7 | 8 | import math 9 | from typing import Tuple 10 | 11 | import torch 12 | 13 | 14 | class PositionalEncoding(torch.nn.Module): 15 | """Positional encoding. 16 | 17 | :param int d_model: embedding dim 18 | :param float dropout_rate: dropout rate 19 | :param int max_len: maximum input length 20 | 21 | PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) 22 | PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) 23 | """ 24 | def __init__(self, 25 | d_model: int, 26 | dropout_rate: float, 27 | max_len: int = 50000, 28 | reverse: bool = False): 29 | """Construct an PositionalEncoding object.""" 30 | super().__init__() 31 | self.d_model = d_model 32 | self.xscale = math.sqrt(self.d_model) 33 | self.dropout = torch.nn.Dropout(p=dropout_rate) 34 | self.max_len = max_len 35 | 36 | self.pe = torch.zeros(self.max_len, self.d_model) 37 | position = torch.arange(0, self.max_len, 38 | dtype=torch.float32).unsqueeze(1) 39 | div_term = torch.exp( 40 | torch.arange(0, self.d_model, 2, dtype=torch.float32) * 41 | -(math.log(10000.0) / self.d_model)) 42 | self.pe[:, 0::2] = torch.sin(position * div_term) 43 | self.pe[:, 1::2] = torch.cos(position * div_term) 44 | self.pe = self.pe.unsqueeze(0) 45 | 46 | def forward(self, 47 | x: torch.Tensor, 48 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 49 | """Add positional encoding. 50 | 51 | Args: 52 | x (torch.Tensor): Input. Its shape is (batch, time, ...) 53 | offset (int): position offset 54 | 55 | Returns: 56 | torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) 
57 | torch.Tensor: for compatibility to RelPositionalEncoding 58 | """ 59 | assert offset + x.size(1) < self.max_len 60 | self.pe = self.pe.to(x.device) 61 | pos_emb = self.pe[:, offset:offset + x.size(1)] 62 | x = x * self.xscale + pos_emb 63 | return self.dropout(x), self.dropout(pos_emb) 64 | 65 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 66 | """ For getting encoding in a streaming fashion 67 | 68 | Attention!!!!! 69 | we apply dropout only once at the whole utterance level in a none 70 | streaming way, but will call this function several times with 71 | increasing input size in a streaming scenario, so the dropout will 72 | be applied several times. 73 | 74 | Args: 75 | offset (int): start offset 76 | size (int): requried size of position encoding 77 | 78 | Returns: 79 | torch.Tensor: Corresponding encoding 80 | """ 81 | assert offset + size < self.max_len 82 | return self.dropout(self.pe[:, offset:offset + size]) 83 | 84 | 85 | class RelPositionalEncoding(PositionalEncoding): 86 | """Relative positional encoding module. 87 | See : Appendix B in https://arxiv.org/abs/1901.02860 88 | Args: 89 | d_model (int): Embedding dimension. 90 | dropout_rate (float): Dropout rate. 91 | max_len (int): Maximum input length. 92 | """ 93 | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 100000): 94 | """Initialize class.""" 95 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 96 | 97 | def forward(self, 98 | x: torch.Tensor, 99 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 100 | """Compute positional encoding. 101 | Args: 102 | x (torch.Tensor): Input tensor (batch, time, `*`). 103 | Returns: 104 | torch.Tensor: Encoded tensor (batch, time, `*`). 105 | torch.Tensor: Positional embedding tensor (1, time, `*`). 106 | """ 107 | assert offset + x.size(1) < self.max_len 108 | self.pe = self.pe.to(x.device) 109 | x = x * self.xscale 110 | pos_emb = self.pe[:, offset:offset + x.size(1)] 111 | return self.dropout(x), self.dropout(pos_emb) 112 | 113 | 114 | class NoPositionalEncoding(torch.nn.Module): 115 | """ No position encoding 116 | """ 117 | def __init__(self, d_model: int, dropout_rate: float): 118 | super().__init__() 119 | self.d_model = d_model 120 | self.dropout = torch.nn.Dropout(p=dropout_rate) 121 | 122 | def forward(self, 123 | x: torch.Tensor, 124 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 125 | """ Just return zero vector for interface compatibility 126 | """ 127 | pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) 128 | return self.dropout(x), pos_emb 129 | 130 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 131 | return torch.zeros(1, size, self.d_model) 132 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Label smoothing module.""" 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | """Label-smoothing loss. 
14 | 15 | In a standard CE loss, the label's data distribution is: 16 | [0,1,2] -> 17 | [ 18 | [1.0, 0.0, 0.0], 19 | [0.0, 1.0, 0.0], 20 | [0.0, 0.0, 1.0], 21 | ] 22 | 23 | In the smoothed version of the CE loss, some probability mass 24 | is taken from the true label prob (1.0) and is divided 25 | among the other labels. 26 | 27 | e.g. 28 | smoothing=0.1 29 | [0,1,2] -> 30 | [ 31 | [0.9, 0.05, 0.05], 32 | [0.05, 0.9, 0.05], 33 | [0.05, 0.05, 0.9], 34 | ] 35 | 36 | Args: 37 | size (int): the number of classes 38 | padding_idx (int): padding class id which will be ignored for loss 39 | smoothing (float): smoothing rate (0.0 means the conventional CE) 40 | normalize_length (bool): 41 | normalize loss by sequence length if True 42 | normalize loss by batch size if False 43 | """ 44 | def __init__(self, 45 | size: int, 46 | padding_idx: int, 47 | smoothing: float, 48 | normalize_length: bool = False): 49 | """Construct a LabelSmoothingLoss object.""" 50 | super(LabelSmoothingLoss, self).__init__() 51 | self.criterion = nn.KLDivLoss(reduction="none") 52 | self.padding_idx = padding_idx 53 | self.confidence = 1.0 - smoothing 54 | self.smoothing = smoothing 55 | self.size = size 56 | self.normalize_length = normalize_length 57 | 58 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 59 | """Compute loss between x and target. 60 | 61 | The model output and data label tensors are flattened to 62 | (batch*seqlen, class) shape and a mask is applied to the 63 | padding part, which should not be included in the loss. 64 | 65 | Args: 66 | x (torch.Tensor): prediction (batch, seqlen, class) 67 | target (torch.Tensor): 68 | target signal masked with self.padding_idx (batch, seqlen) 69 | Returns: 70 | loss (torch.Tensor) : The KL loss, scalar float value 71 | """ 72 | assert x.size(2) == self.size 73 | batch_size = x.size(0) 74 | x = x.view(-1, self.size) 75 | target = target.view(-1) 76 | # use zeros_like instead of torch.no_grad() for true_dist, 77 | # since no_grad() can not be exported by JIT 78 | true_dist = torch.zeros_like(x) 79 | true_dist.fill_(self.smoothing / (self.size - 1)) 80 | ignore = target == self.padding_idx # (B,) 81 | total = len(target) - ignore.sum().item() 82 | target = target.masked_fill(ignore, 0) # avoid -1 index 83 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 84 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 85 | denom = total if self.normalize_length else batch_size 86 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 87 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Positionwise feed forward layer definition.""" 7 | 8 | import torch 9 | 10 | 11 | class PositionwiseFeedForward(torch.nn.Module): 12 | """Positionwise feed forward layer. 13 | 14 | The feed forward layer is applied to each position of the sequence. 15 | The output dim is the same as the input dim. 16 | 17 | Args: 18 | idim (int): Input dimension. 19 | hidden_units (int): The number of hidden units. 20 | dropout_rate (float): Dropout rate.
21 | activation (torch.nn.Module): Activation function 22 | """ 23 | def __init__(self, 24 | idim: int, 25 | hidden_units: int, 26 | dropout_rate: float, 27 | activation: torch.nn.Module = torch.nn.ReLU()): 28 | """Construct a PositionwiseFeedForward object.""" 29 | super(PositionwiseFeedForward, self).__init__() 30 | self.w_1 = torch.nn.Linear(idim, hidden_units) 31 | self.activation = activation 32 | self.dropout = torch.nn.Dropout(dropout_rate) 33 | self.w_2 = torch.nn.Linear(hidden_units, idim) 34 | 35 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 36 | """Forward function. 37 | 38 | Args: 39 | xs: input tensor (B, L, D) 40 | Returns: 41 | output tensor, (B, L, D) 42 | """ 43 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 44 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | """Swish() activation function for Conformer.""" 8 | 9 | import torch 10 | 11 | 12 | class Swish(torch.nn.Module): 13 | """Construct an Swish object.""" 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | """Return Swish activation function.""" 16 | return x * torch.sigmoid(x) 17 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | import os 6 | import re 7 | 8 | import yaml 9 | import torch 10 | 11 | 12 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 13 | if torch.cuda.is_available(): 14 | logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) 15 | checkpoint = torch.load(path) 16 | else: 17 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) 18 | checkpoint = torch.load(path, map_location='cpu') 19 | model.load_state_dict(checkpoint) 20 | info_path = re.sub('.pt$', '.yaml', path) 21 | configs = {} 22 | if os.path.exists(info_path): 23 | with open(info_path, 'r') as fin: 24 | configs = yaml.load(fin, Loader=yaml.FullLoader) 25 | return configs 26 | 27 | 28 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 29 | ''' 30 | Args: 31 | infos (dict or None): any info you want to save. 32 | ''' 33 | logging.info('Checkpoint: save to checkpoint %s' % path) 34 | if isinstance(model, torch.nn.DataParallel): 35 | state_dict = model.module.state_dict() 36 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 37 | state_dict = model.module.state_dict() 38 | else: 39 | state_dict = model.state_dict() 40 | torch.save(state_dict, path) 41 | info_path = re.sub('.pt$', '.yaml', path) 42 | if infos is None: 43 | infos = {} 44 | with open(info_path, 'w') as fout: 45 | data = yaml.dump(infos) 46 | fout.write(data) 47 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
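Before the CMVN utilities continue below, here is a minimal usage sketch for the checkpoint helpers in checkpoint.py above. The model, paths, and info values are hypothetical placeholders chosen for illustration; only save_checkpoint and load_checkpoint come from the file itself.

import torch

# A tiny stand-in model (hypothetical; any torch.nn.Module works).
model = torch.nn.Linear(80, 192)

# save_checkpoint() writes the state_dict to the .pt path and dumps the optional
# info dict to a side-car YAML file (exp/model_5.yaml in this case).
save_checkpoint(model, 'exp/model_5.pt', infos={'epoch': 5, 'cv_loss': 1.23})

# load_checkpoint() restores the weights in place and returns whatever was stored
# in the YAML file, or an empty dict if no such file exists.
configs = load_checkpoint(model, 'exp/model_5.pt')
print(configs)  # -> {'epoch': 5, 'cv_loss': 1.23}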
15 | 16 | import json 17 | import math 18 | 19 | import numpy as np 20 | 21 | 22 | def _load_json_cmvn(json_cmvn_file): 23 | """ Load the json format cmvn stats file and calculate cmvn 24 | 25 | Args: 26 | json_cmvn_file: cmvn stats file in json format 27 | 28 | Returns: 29 | a numpy array of [means, vars] 30 | """ 31 | with open(json_cmvn_file) as f: 32 | cmvn_stats = json.load(f) 33 | 34 | means = cmvn_stats['mean_stat'] 35 | variance = cmvn_stats['var_stat'] 36 | count = cmvn_stats['frame_num'] 37 | for i in range(len(means)): 38 | means[i] /= count 39 | variance[i] = variance[i] / count - means[i] * means[i] 40 | if variance[i] < 1.0e-20: 41 | variance[i] = 1.0e-20 42 | variance[i] = 1.0 / math.sqrt(variance[i]) 43 | cmvn = np.array([means, variance]) 44 | return cmvn 45 | 46 | 47 | def _load_kaldi_cmvn(kaldi_cmvn_file): 48 | """ Load the kaldi format cmvn stats file and calculate cmvn 49 | 50 | Args: 51 | kaldi_cmvn_file: kaldi text style global cmvn file, which 52 | is generated by: 53 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 54 | 55 | Returns: 56 | a numpy array of [means, vars] 57 | """ 58 | means = [] 59 | variance = [] 60 | with open(kaldi_cmvn_file, 'r') as fid: 61 | # kaldi binary file start with '\0B' 62 | if fid.read(2) == '\0B': 63 | logging.error('kaldi cmvn binary file is not supported, please ' 64 | 'recompute it by: compute-cmvn-stats --binary=false ' 65 | ' scp:feats.scp global_cmvn') 66 | sys.exit(1) 67 | fid.seek(0) 68 | arr = fid.read().split() 69 | assert (arr[0] == '[') 70 | assert (arr[-2] == '0') 71 | assert (arr[-1] == ']') 72 | feat_dim = int((len(arr) - 2 - 2) / 2) 73 | for i in range(1, feat_dim + 1): 74 | means.append(float(arr[i])) 75 | count = float(arr[feat_dim + 1]) 76 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 77 | variance.append(float(arr[i])) 78 | 79 | for i in range(len(means)): 80 | means[i] /= count 81 | variance[i] = variance[i] / count - means[i] * means[i] 82 | if variance[i] < 1.0e-20: 83 | variance[i] = 1.0e-20 84 | variance[i] = 1.0 / math.sqrt(variance[i]) 85 | cmvn = np.array([means, variance]) 86 | return cmvn 87 | 88 | 89 | def load_cmvn(cmvn_file, is_json): 90 | if is_json: 91 | cmvn = _load_json_cmvn(cmvn_file) 92 | else: 93 | cmvn = _load_kaldi_cmvn(cmvn_file) 94 | return cmvn[0], cmvn[1] 95 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/common.py: -------------------------------------------------------------------------------- 1 | """Unility functions for Transformer.""" 2 | 3 | import math 4 | from typing import Tuple, List 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | IGNORE_ID = -1 10 | 11 | 12 | def pad_list(xs: List[torch.Tensor], pad_value: int): 13 | """Perform padding for the list of tensors. 14 | 15 | Args: 16 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 17 | pad_value (float): Value for padding. 18 | 19 | Returns: 20 | Tensor: Padded tensor (B, Tmax, `*`). 
21 | 22 | Examples: 23 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 24 | >>> x 25 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 26 | >>> pad_list(x, 0) 27 | tensor([[1., 1., 1., 1.], 28 | [1., 1., 0., 0.], 29 | [1., 0., 0., 0.]]) 30 | 31 | """ 32 | n_batch = len(xs) 33 | max_len = max([x.size(0) for x in xs]) 34 | pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) 35 | pad = pad.fill_(pad_value) 36 | for i in range(n_batch): 37 | pad[i, :xs[i].size(0)] = xs[i] 38 | 39 | return pad 40 | 41 | 42 | def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, 43 | ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: 44 | """Add <sos> and <eos> labels. 45 | 46 | Args: 47 | ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) 48 | sos (int): index of <sos> 49 | eos (int): index of <eos> 50 | ignore_id (int): index of padding 51 | 52 | Returns: 53 | ys_in (torch.Tensor) : (B, Lmax + 1) 54 | ys_out (torch.Tensor) : (B, Lmax + 1) 55 | 56 | Examples: 57 | >>> sos_id = 10 58 | >>> eos_id = 11 59 | >>> ignore_id = -1 60 | >>> ys_pad 61 | tensor([[ 1, 2, 3, 4, 5], 62 | [ 4, 5, 6, -1, -1], 63 | [ 7, 8, 9, -1, -1]], dtype=torch.int32) 64 | >>> ys_in, ys_out = add_sos_eos(ys_pad, sos_id, eos_id, ignore_id) 65 | >>> ys_in 66 | tensor([[10, 1, 2, 3, 4, 5], 67 | [10, 4, 5, 6, 11, 11], 68 | [10, 7, 8, 9, 11, 11]]) 69 | >>> ys_out 70 | tensor([[ 1, 2, 3, 4, 5, 11], 71 | [ 4, 5, 6, 11, -1, -1], 72 | [ 7, 8, 9, 11, -1, -1]]) 73 | """ 74 | _sos = torch.tensor([sos], 75 | dtype=torch.long, 76 | requires_grad=False, 77 | device=ys_pad.device) 78 | _eos = torch.tensor([eos], 79 | dtype=torch.long, 80 | requires_grad=False, 81 | device=ys_pad.device) 82 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 83 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 84 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 85 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 86 | 87 | 88 | def reverse_pad_list(ys_pad: torch.Tensor, 89 | ys_lens: torch.Tensor, 90 | pad_value: float = -1.0) -> torch.Tensor: 91 | """Reverse padding for the list of tensors. 92 | 93 | Args: 94 | ys_pad (tensor): The padded tensor (B, Tokenmax). 95 | ys_lens (tensor): The lengths of the token sequences (B). 96 | pad_value (float): Value for padding. 97 | 98 | Returns: 99 | Tensor: Padded tensor (B, Tokenmax). 100 | 101 | Examples: 102 | >>> x 103 | tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) 104 | >>> reverse_pad_list(x, torch.tensor([4, 3, 2]), 0.0) 105 | tensor([[4, 3, 2, 1], 106 | [7, 6, 5, 0], 107 | [9, 8, 0, 0]]) 108 | 109 | """ 110 | r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) 111 | for y, i in zip(ys_pad, ys_lens)], True, 112 | pad_value) 113 | return r_ys_pad 114 | 115 | 116 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 117 | ignore_label: int) -> float: 118 | """Calculate accuracy. 119 | 120 | Args: 121 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 122 | pad_targets (LongTensor): Target label tensors (B, Lmax). 123 | ignore_label (int): Ignore label id. 124 | 125 | Returns: 126 | float: Accuracy value (0.0 - 1.0).
127 | 128 | """ 129 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 130 | pad_outputs.size(1)).argmax(2) 131 | mask = pad_targets != ignore_label 132 | numerator = torch.sum( 133 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 134 | denominator = torch.sum(mask) 135 | return float(numerator) / float(denominator) 136 | 137 | 138 | def get_activation(act): 139 | """Return activation function.""" 140 | # Lazy load to avoid unused import 141 | #from wenet.transformer.swish import Swish 142 | from ..transformer.swish import Swish 143 | 144 | activation_funcs = { 145 | "hardtanh": torch.nn.Hardtanh, 146 | "tanh": torch.nn.Tanh, 147 | "relu": torch.nn.ReLU, 148 | "selu": torch.nn.SELU, 149 | "swish": Swish, 150 | "gelu": torch.nn.GELU 151 | } 152 | 153 | return activation_funcs[act]() 154 | 155 | 156 | def get_subsample(config): 157 | input_layer = config["encoder_conf"]["input_layer"] 158 | assert input_layer in ["conv2d", "conv2d6", "conv2d8"] 159 | if input_layer == "conv2d": 160 | return 4 161 | elif input_layer == "conv2d6": 162 | return 6 163 | elif input_layer == "conv2d8": 164 | return 8 165 | 166 | 167 | def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: 168 | new_hyp: List[int] = [] 169 | cur = 0 170 | while cur < len(hyp): 171 | if hyp[cur] != 0: 172 | new_hyp.append(hyp[cur]) 173 | prev = cur 174 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 175 | cur += 1 176 | return new_hyp 177 | 178 | 179 | def log_add(args: List[int]) -> float: 180 | """ 181 | Stable log add 182 | """ 183 | if all(a == -float('inf') for a in args): 184 | return -float('inf') 185 | a_max = max(args) 186 | lsp = math.log(sum(math.exp(a - a_max) for a in args)) 187 | return a_max + lsp 188 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/ctc_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Di Wu) 3 | 4 | import numpy as np 5 | import torch 6 | 7 | def insert_blank(label, blank_id=0): 8 | """Insert blank token between every two label token.""" 9 | label = np.expand_dims(label, 1) 10 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 11 | label = np.concatenate([blanks, label], axis=1) 12 | label = label.reshape(-1) 13 | label = np.append(label, label[0]) 14 | return label 15 | 16 | def forced_align(ctc_probs: torch.Tensor, 17 | y: torch.Tensor, 18 | blank_id=0) -> list: 19 | """ctc forced alignment. 
20 | 21 | Args: 22 | torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) 23 | torch.Tensor y: id sequence tensor 1d tensor (L) 24 | int blank_id: blank symbol index 25 | Returns: 26 | torch.Tensor: alignment result 27 | """ 28 | y_insert_blank = insert_blank(y, blank_id) 29 | 30 | log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) 31 | log_alpha = log_alpha - float('inf') # log of zero 32 | state_path = (torch.zeros( 33 | (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 34 | ) # state path 35 | 36 | # init start state 37 | log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] 38 | log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] 39 | 40 | for t in range(1, ctc_probs.size(0)): 41 | for s in range(len(y_insert_blank)): 42 | if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ 43 | s] == y_insert_blank[s - 2]: 44 | candidates = torch.tensor( 45 | [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) 46 | prev_state = [s, s - 1] 47 | else: 48 | candidates = torch.tensor([ 49 | log_alpha[t - 1, s], 50 | log_alpha[t - 1, s - 1], 51 | log_alpha[t - 1, s - 2], 52 | ]) 53 | prev_state = [s, s - 1, s - 2] 54 | log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] 55 | state_path[t, s] = prev_state[torch.argmax(candidates)] 56 | 57 | state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) 58 | 59 | candidates = torch.tensor([ 60 | log_alpha[-1, len(y_insert_blank) - 1], 61 | log_alpha[-1, len(y_insert_blank) - 2] 62 | ]) 63 | prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] 64 | state_seq[-1] = prev_state[torch.argmax(candidates)] 65 | for t in range(ctc_probs.size(0) - 2, -1, -1): 66 | state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] 67 | 68 | output_alignment = [] 69 | for t in range(0, ctc_probs.size(0)): 70 | output_alignment.append(y_insert_blank[state_seq[t, 0]]) 71 | 72 | return output_alignment 73 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | from contextlib import nullcontext 6 | # if your python version < 3.7 use the below one 7 | # from contextlib import suppress as nullcontext 8 | import torch 9 | from torch.nn.utils import clip_grad_norm_ 10 | 11 | 12 | class Executor: 13 | def __init__(self): 14 | self.step = 0 15 | 16 | def train(self, model, optimizer, scheduler, data_loader, device, writer, 17 | args, scaler): 18 | ''' Train one epoch 19 | ''' 20 | model.train() 21 | clip = args.get('grad_clip', 50.0) 22 | log_interval = args.get('log_interval', 10) 23 | rank = args.get('rank', 0) 24 | accum_grad = args.get('accum_grad', 1) 25 | is_distributed = args.get('is_distributed', True) 26 | use_amp = args.get('use_amp', False) 27 | logging.info('using accumulate grad, new batch size is {} times' 28 | 'larger than before'.format(accum_grad)) 29 | if use_amp: 30 | assert scaler is not None 31 | num_seen_utts = 0 32 | num_total_batch = len(data_loader) 33 | for batch_idx, batch in enumerate(data_loader): 34 | key, feats, target, feats_lengths, target_lengths = batch 35 | feats = feats.to(device) 36 | target = target.to(device) 37 | feats_lengths = feats_lengths.to(device) 38 | target_lengths = target_lengths.to(device) 39 | num_utts = target_lengths.size(0) 40 | if num_utts == 0: 41 | continue 42 | context = None 43 | # Disable gradient synchronizations across DDP processes. 44 | # Within this context, gradients will be accumulated on module 45 | # variables, which will later be synchronized. 46 | if is_distributed and batch_idx % accum_grad != 0: 47 | context = model.no_sync 48 | # Used for single gpu training and DDP gradient synchronization 49 | # processes. 50 | else: 51 | context = nullcontext 52 | with context(): 53 | # autocast context 54 | # The more details about amp can be found in 55 | # https://pytorch.org/docs/stable/notes/amp_examples.html 56 | with torch.cuda.amp.autocast(scaler is not None): 57 | loss, loss_att, loss_ctc = model(feats, feats_lengths, 58 | target, target_lengths) 59 | loss = loss / accum_grad 60 | if use_amp: 61 | scaler.scale(loss).backward() 62 | else: 63 | loss.backward() 64 | 65 | num_seen_utts += num_utts 66 | if batch_idx % accum_grad == 0: 67 | if rank == 0 and writer is not None: 68 | writer.add_scalar('train_loss', loss, self.step) 69 | # Use mixed precision training 70 | if use_amp: 71 | scaler.unscale_(optimizer) 72 | grad_norm = clip_grad_norm_(model.parameters(), clip) 73 | # Must invoke scaler.update() if unscale_() is used in the 74 | # iteration to avoid the following error: 75 | # RuntimeError: unscale_() has already been called 76 | # on this optimizer since the last update(). 77 | # We don't check grad here since that if the gradient has 78 | # inf/nan values, scaler.step will skip optimizer.step(). 
79 | scaler.step(optimizer) 80 | scaler.update() 81 | else: 82 | grad_norm = clip_grad_norm_(model.parameters(), clip) 83 | if torch.isfinite(grad_norm): 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | scheduler.step() 87 | self.step += 1 88 | if batch_idx % log_interval == 0: 89 | lr = optimizer.param_groups[0]['lr'] 90 | log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( 91 | batch_idx, num_total_batch, 92 | loss.item() * accum_grad) 93 | if loss_att is not None: 94 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 95 | if loss_ctc is not None: 96 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 97 | log_str += 'lr {:.8f} rank {}'.format(lr, rank) 98 | logging.debug(log_str) 99 | 100 | def cv(self, model, data_loader, device, args): 101 | ''' Cross validation on 102 | ''' 103 | model.eval() 104 | log_interval = args.get('log_interval', 10) 105 | # in order to avoid division by 0 106 | num_seen_utts = 1 107 | total_loss = 0.0 108 | num_total_batch = len(data_loader) 109 | with torch.no_grad(): 110 | for batch_idx, batch in enumerate(data_loader): 111 | key, feats, target, feats_lengths, target_lengths = batch 112 | feats = feats.to(device) 113 | target = target.to(device) 114 | feats_lengths = feats_lengths.to(device) 115 | target_lengths = target_lengths.to(device) 116 | num_utts = target_lengths.size(0) 117 | if num_utts == 0: 118 | continue 119 | loss, loss_att, loss_ctc = model(feats, feats_lengths, target, 120 | target_lengths) 121 | if torch.isfinite(loss): 122 | num_seen_utts += num_utts 123 | total_loss += loss.item() * num_utts 124 | if batch_idx % log_interval == 0: 125 | log_str = 'CV Batch {}/{} loss {:.6f} '.format( 126 | batch_idx, num_total_batch, loss.item()) 127 | if loss_att is not None: 128 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 129 | if loss_ctc is not None: 130 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 131 | log_str += 'history loss {:.6f}'.format(total_loss / 132 | num_seen_utts) 133 | logging.debug(log_str) 134 | 135 | return total_loss, num_seen_utts 136 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | from typeguard import check_argument_types 7 | 8 | 9 | class WarmupLR(_LRScheduler): 10 | """The WarmupLR scheduler 11 | 12 | This scheduler is almost same as NoamLR Scheduler except for following 13 | difference: 14 | 15 | NoamLR: 16 | lr = optimizer.lr * model_size ** -0.5 17 | * min(step ** -0.5, step * warmup_step ** -1.5) 18 | WarmupLR: 19 | lr = optimizer.lr * warmup_step ** 0.5 20 | * min(step ** -0.5, step * warmup_step ** -1.5) 21 | 22 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
23 | 24 | """ 25 | 26 | def __init__( 27 | self, 28 | optimizer: torch.optim.Optimizer, 29 | warmup_steps: Union[int, float] = 25000, 30 | last_epoch: int = -1, 31 | ): 32 | assert check_argument_types() 33 | self.warmup_steps = warmup_steps 34 | 35 | # __init__() must be invoked before setting field 36 | # because step() is also invoked in __init__() 37 | super().__init__(optimizer, last_epoch) 38 | 39 | def __repr__(self): 40 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 41 | 42 | def get_lr(self): 43 | step_num = self.last_epoch + 1 44 | return [ 45 | lr 46 | * self.warmup_steps ** 0.5 47 | * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) 48 | for lr in self.base_lrs 49 | ] 50 | 51 | def set_step(self, step: int): 52 | self.last_epoch = step 53 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised Adam optimizer') 9 | return torch.optim.Adam(parameters, lr = lr, weight_decay = weight_decay); 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adamP.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | from adamp import AdamP 6 | 7 | def Optimizer(parameters, lr, weight_decay, **kwargs): 8 | print('Initialised AdamP optimizer') 9 | return AdamP(parameters, lr = lr, betas = (0.9, 0.999), weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adamW.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised AdamW optimizer') 9 | return torch.optim.AdamW(parameters, lr = lr, weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised SGD optimizer') 9 | 10 | return torch.optim.SGD(parameters, lr = lr, momentum = 0.9, weight_decay=weight_decay); 11 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/protocols/ASVspoof2019.LA.asv.eval.female.trn.txt: -------------------------------------------------------------------------------- 1 | LA_0026 LA_E_A6067886,LA_E_A6399397,LA_E_A7328076,LA_E_A7522472,LA_E_A8382737,LA_E_A8628133,LA_E_A8936529,LA_E_A9162657,LA_E_A9477785,LA_E_A9847831,LA_E_A9970430 2 | LA_0041 LA_E_A2600639,LA_E_A2615005,LA_E_A3009558,LA_E_A3893238,LA_E_A4308598,LA_E_A5901043,LA_E_A6995425,LA_E_A8885354,LA_E_A9864327,LA_E_A9903225,LA_E_A9921939 3 | LA_0043 LA_E_A2159621,LA_E_A2750783,LA_E_A2883190,LA_E_A3950726,LA_E_A6019368,LA_E_A6548293,LA_E_A6887259,LA_E_A7782002,LA_E_A8331933,LA_E_A8639897,LA_E_A9500389 4 | LA_0012 LA_E_A1053965,LA_E_A1795401,LA_E_A2440720,LA_E_A2829678,LA_E_A4042686,LA_E_A4233081,LA_E_A5462934,LA_E_A5982169,LA_E_A6508387,LA_E_A7343806,LA_E_A8704258 5 | LA_0031 LA_E_A1478121,LA_E_A2460512,LA_E_A2926096,LA_E_A3041661,LA_E_A3554530,LA_E_A3598858,LA_E_A4171094,LA_E_A4538545,LA_E_A6798483,LA_E_A7032162,LA_E_A7210101 6 | LA_0037 LA_E_A1051956,LA_E_A1196355,LA_E_A2695639,LA_E_A3555619,LA_E_A3654052,LA_E_A3789634,LA_E_A4791598,LA_E_A5467066,LA_E_A5912013,LA_E_A6211829,LA_E_A9327727 7 | LA_0008 LA_E_A1280994,LA_E_A2012637,LA_E_A2281694,LA_E_A3406491,LA_E_A3583360,LA_E_A3917123,LA_E_A5239949,LA_E_A5939507,LA_E_A6514798,LA_E_A9527561,LA_E_A9776482 8 | LA_0029 LA_E_A1987953,LA_E_A2171329,LA_E_A2217302,LA_E_A2864595,LA_E_A4536568,LA_E_A4720897,LA_E_A6298434,LA_E_A7162720,LA_E_A7935803,LA_E_A8288389,LA_E_A9426249 9 | LA_0004 LA_E_A1350785,LA_E_A2578761,LA_E_A3090180,LA_E_A4176263,LA_E_A4801136,LA_E_A5523997,LA_E_A5740231,LA_E_A6371783,LA_E_A7554553,LA_E_A8796242,LA_E_A9929520 10 | LA_0045 LA_E_A1383315,LA_E_A2359362,LA_E_A3521251,LA_E_A3574364,LA_E_A3821822,LA_E_A6030971,LA_E_A6066016,LA_E_A6756990,LA_E_A7643994,LA_E_A8520532,LA_E_A8820512 11 | LA_0010 LA_E_A3134888,LA_E_A3904853,LA_E_A4006253,LA_E_A4351518,LA_E_A6136654,LA_E_A6138183,LA_E_A6795707,LA_E_A7714540,LA_E_A7831194,LA_E_A8013850,LA_E_A8512745 12 | LA_0034 LA_E_A3254492,LA_E_A3353969,LA_E_A3384384,LA_E_A4969775,LA_E_A5459864,LA_E_A5902299,LA_E_A8288456,LA_E_A8437596,LA_E_A8639383,LA_E_A9210248,LA_E_A9760674 13 | LA_0033 LA_E_A2278657,LA_E_A3618104,LA_E_A3820212,LA_E_A6157632,LA_E_A6455734,LA_E_A6691385,LA_E_A7273228,LA_E_A7511208,LA_E_A7969490,LA_E_A8804831,LA_E_A9024376 14 | LA_0042 LA_E_A1198708,LA_E_A2193411,LA_E_A4244162,LA_E_A5124438,LA_E_A5815457,LA_E_A6081776,LA_E_A6924100,LA_E_A7506556,LA_E_A7888496,LA_E_A9618297,LA_E_A9829952 15 | LA_0035 LA_E_A1585336,LA_E_A2526555,LA_E_A2940472,LA_E_A3836347,LA_E_A4034855,LA_E_A4336680,LA_E_A4435680,LA_E_A6082825,LA_E_A6703766,LA_E_A6711472,LA_E_A7735424 16 | LA_0027 LA_E_A2777383,LA_E_A2992932,LA_E_A3321288,LA_E_A3345148,LA_E_A3778747,LA_E_A6684049,LA_E_A8100239,LA_E_A8239552,LA_E_A9038375,LA_E_A9435429,LA_E_A9515193 17 | LA_0014 LA_E_A1585392,LA_E_A3658404,LA_E_A3770777,LA_E_A4006695,LA_E_A5854979,LA_E_A6602358,LA_E_A7811753,LA_E_A8603666,LA_E_A9184573,LA_E_A9884360,LA_E_A9929223 18 | LA_0024 LA_E_A3614213,LA_E_A4614013,LA_E_A4759484,LA_E_A5641333,LA_E_A6256166,LA_E_A6801379,LA_E_A7238447,LA_E_A7361812,LA_E_A7830058,LA_E_A8972377,LA_E_A9143306 19 | LA_0016 
LA_E_A1249655,LA_E_A3719322,LA_E_A4901704,LA_E_A5038438,LA_E_A5839270,LA_E_A6842353,LA_E_A7666759,LA_E_A7929078,LA_E_A7931622,LA_E_A8034727,LA_E_A9015457 20 | LA_0017 LA_E_A2900556,LA_E_A3225162,LA_E_A3469512,LA_E_A3519492,LA_E_A3822959,LA_E_A4628943,LA_E_A4646149,LA_E_A6677949,LA_E_A7665790,LA_E_A8557348,LA_E_A9898607 21 | LA_0019 LA_E_A1798025,LA_E_A1967771,LA_E_A2242096,LA_E_A5065809,LA_E_A6428663,LA_E_A6744935,LA_E_A7454132,LA_E_A8228964,LA_E_A8882063,LA_E_A9150661,LA_E_A9687807 22 | LA_0006 LA_E_A1871167,LA_E_A2856296,LA_E_A3183649,LA_E_A5037636,LA_E_A5890944,LA_E_A6542679,LA_E_A7145264,LA_E_A7504228,LA_E_A7662019,LA_E_A7961119,LA_E_A8497664 23 | LA_0039 LA_E_A1581105,LA_E_A3423971,LA_E_A3926347,LA_E_A4064171,LA_E_A4542113,LA_E_A4798522,LA_E_A5271858,LA_E_A6236286,LA_E_A7588072,LA_E_A8574000,LA_E_A9406597 24 | LA_0009 LA_E_A1204442,LA_E_A1390066,LA_E_A2232628,LA_E_A2677276,LA_E_A4267005,LA_E_A6283872,LA_E_A6317736,LA_E_A8023214,LA_E_A8460072,LA_E_A8566051,LA_E_A8871185 25 | LA_0047 LA_E_A2673534,LA_E_A3911996,LA_E_A7620458,LA_E_A7781578,LA_E_A7882315,LA_E_A8076326,LA_E_A9217821,LA_E_A9598155,LA_E_A9693501,LA_E_A9831653,LA_E_A9940557 26 | LA_0022 LA_E_A1478713,LA_E_A1598631,LA_E_A1639326,LA_E_A4722070,LA_E_A6663116,LA_E_A7227229,LA_E_A7869414,LA_E_A8042701,LA_E_A9686199,LA_E_A9827503,LA_E_A9975938 27 | LA_0020 LA_E_A1584444,LA_E_A1952273,LA_E_A2683985,LA_E_A2921021,LA_E_A3033742,LA_E_A3374020,LA_E_A4668744,LA_E_A6438857,LA_E_A6716830,LA_E_A7975193,LA_E_A8158713 28 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/protocols/ASVspoof2019.LA.asv.eval.male.trn.txt: -------------------------------------------------------------------------------- 1 | LA_0007 LA_E_A1579796,LA_E_A1669549,LA_E_A2092700,LA_E_A2287839,LA_E_A3734175,LA_E_A4085221,LA_E_A4165051,LA_E_A5069362,LA_E_A5863027,LA_E_A6161186,LA_E_A6315891,LA_E_A6479863,LA_E_A6565965,LA_E_A7384060,LA_E_A8093886,LA_E_A8763189,LA_E_A9002777,LA_E_A9429276,LA_E_A9569588 2 | LA_0003 LA_E_A2038846,LA_E_A3217904,LA_E_A3998756,LA_E_A4032623,LA_E_A4070913,LA_E_A4500897,LA_E_A4850706,LA_E_A5052570,LA_E_A5475189,LA_E_A6131259,LA_E_A6533314,LA_E_A7406609,LA_E_A7570344,LA_E_A7626807,LA_E_A8643969,LA_E_A8963027,LA_E_A9136617,LA_E_A9665991,LA_E_A9967055 3 | LA_0015 LA_E_A1096168,LA_E_A1942801,LA_E_A3022922,LA_E_A3666614,LA_E_A4126859,LA_E_A4704011,LA_E_A4922158,LA_E_A5080077,LA_E_A5555947,LA_E_A5752283,LA_E_A6140229,LA_E_A6479043,LA_E_A6801198,LA_E_A7216041,LA_E_A7344985,LA_E_A7455161,LA_E_A7915419,LA_E_A9103743,LA_E_A9352992 4 | LA_0005 LA_E_A1061661,LA_E_A1151528,LA_E_A1357547,LA_E_A1552302,LA_E_A1805932,LA_E_A1982652,LA_E_A2562108,LA_E_A2758018,LA_E_A4401481,LA_E_A4626256,LA_E_A5518426,LA_E_A5938161,LA_E_A6003322,LA_E_A6467075,LA_E_A8309685,LA_E_A8311618,LA_E_A8599621,LA_E_A8945195,LA_E_A9175975 5 | LA_0048 LA_E_A1286158,LA_E_A2120927,LA_E_A2329958,LA_E_A2636110,LA_E_A2645632,LA_E_A2917108,LA_E_A3109618,LA_E_A3573642,LA_E_A3971551,LA_E_A4727672,LA_E_A5591943,LA_E_A5764619,LA_E_A6442982,LA_E_A6681433,LA_E_A7385173,LA_E_A8136322,LA_E_A8439465,LA_E_A8533861,LA_E_A8968933 6 | LA_0038 LA_E_A1116626,LA_E_A1174652,LA_E_A1378276,LA_E_A1620277,LA_E_A2266714,LA_E_A2452306,LA_E_A2777403,LA_E_A4251249,LA_E_A4278406,LA_E_A4473171,LA_E_A4702188,LA_E_A4994837,LA_E_A5683769,LA_E_A6631075,LA_E_A7036528,LA_E_A7814286,LA_E_A8945198,LA_E_A9414340,LA_E_A9471582 7 | LA_0032 
LA_E_A1323399,LA_E_A1790388,LA_E_A1974946,LA_E_A2004276,LA_E_A2116071,LA_E_A3263506,LA_E_A4712026,LA_E_A5584172,LA_E_A5835629,LA_E_A6845132,LA_E_A7589780,LA_E_A7785728,LA_E_A7992084,LA_E_A8182193,LA_E_A8650561,LA_E_A8942083,LA_E_A9290365,LA_E_A9407859,LA_E_A9538728 8 | LA_0046 LA_E_A2466976,LA_E_A2708278,LA_E_A3324898,LA_E_A3424299,LA_E_A3727759,LA_E_A4717497,LA_E_A4877678,LA_E_A5090657,LA_E_A6084667,LA_E_A6278681,LA_E_A6963337,LA_E_A7299292,LA_E_A7344026,LA_E_A7444716,LA_E_A7965023,LA_E_A8189181,LA_E_A8746471,LA_E_A9211497,LA_E_A9244386 9 | LA_0018 LA_E_A1222015,LA_E_A1230488,LA_E_A1315575,LA_E_A1643822,LA_E_A1707724,LA_E_A1737837,LA_E_A2840122,LA_E_A2952996,LA_E_A3868494,LA_E_A4039519,LA_E_A5209228,LA_E_A6894935,LA_E_A8525661,LA_E_A8817156,LA_E_A8961129,LA_E_A9262197,LA_E_A9386932,LA_E_A9585735,LA_E_A9592918 10 | LA_0013 LA_E_A1254130,LA_E_A1554007,LA_E_A1781636,LA_E_A2608252,LA_E_A3276842,LA_E_A3750761,LA_E_A4070304,LA_E_A4707964,LA_E_A5038290,LA_E_A5464563,LA_E_A6151518,LA_E_A6168904,LA_E_A8428454,LA_E_A8489144,LA_E_A9058515,LA_E_A9369469,LA_E_A9529647,LA_E_A9712403,LA_E_A9965149 11 | LA_0036 LA_E_A1478180,LA_E_A2389854,LA_E_A3071586,LA_E_A3429891,LA_E_A3527473,LA_E_A3599121,LA_E_A3618518,LA_E_A4409611,LA_E_A4484976,LA_E_A4545756,LA_E_A5861351,LA_E_A6082924,LA_E_A7146309,LA_E_A8502724,LA_E_A8557047,LA_E_A8731251,LA_E_A9137383,LA_E_A9679505,LA_E_A9936942 12 | LA_0023 LA_E_A1071592,LA_E_A1251439,LA_E_A3097813,LA_E_A3106218,LA_E_A3226899,LA_E_A4341221,LA_E_A4391250,LA_E_A4493336,LA_E_A5208875,LA_E_A5227434,LA_E_A5820641,LA_E_A6536184,LA_E_A6588986,LA_E_A7069053,LA_E_A7353069,LA_E_A7695856,LA_E_A8192912,LA_E_A8848010,LA_E_A9509852 13 | LA_0030 LA_E_A1225426,LA_E_A1723876,LA_E_A2079871,LA_E_A2542905,LA_E_A3024244,LA_E_A3707417,LA_E_A4060012,LA_E_A4091866,LA_E_A4748816,LA_E_A5206867,LA_E_A5646760,LA_E_A5705432,LA_E_A6299358,LA_E_A6419571,LA_E_A7089762,LA_E_A7208817,LA_E_A8161790,LA_E_A8341026,LA_E_A8940970 14 | LA_0002 LA_E_A1235554,LA_E_A1469990,LA_E_A1831517,LA_E_A1853447,LA_E_A1935359,LA_E_A2186276,LA_E_A3024508,LA_E_A3451001,LA_E_A4621792,LA_E_A4969008,LA_E_A5304363,LA_E_A6514235,LA_E_A6819813,LA_E_A6842395,LA_E_A8179954,LA_E_A8256043,LA_E_A8972701,LA_E_A9233994,LA_E_A9236736 15 | LA_0040 LA_E_A1709351,LA_E_A3754389,LA_E_A3976416,LA_E_A4523735,LA_E_A4571870,LA_E_A4713889,LA_E_A5779272,LA_E_A5952468,LA_E_A5956752,LA_E_A6986555,LA_E_A7814135,LA_E_A8009345,LA_E_A8011520,LA_E_A8283071,LA_E_A8897121,LA_E_A9015000,LA_E_A9046210,LA_E_A9555184,LA_E_A9618678 16 | LA_0028 LA_E_A2083414,LA_E_A2201003,LA_E_A2288128,LA_E_A2321693,LA_E_A2453228,LA_E_A2530101,LA_E_A2591871,LA_E_A2681646,LA_E_A3004056,LA_E_A3583344,LA_E_A4717284,LA_E_A5122263,LA_E_A5247656,LA_E_A5742757,LA_E_A5998686,LA_E_A7065441,LA_E_A7350428,LA_E_A9276815,LA_E_A9917416 17 | LA_0011 LA_E_A1163470,LA_E_A1376388,LA_E_A1452392,LA_E_A1659783,LA_E_A1923413,LA_E_A2291547,LA_E_A3362381,LA_E_A3892117,LA_E_A4237020,LA_E_A4698529,LA_E_A4919423,LA_E_A8390905,LA_E_A8691184,LA_E_A8832569,LA_E_A8910908,LA_E_A9344861,LA_E_A9425880,LA_E_A9646908,LA_E_A9997819 18 | LA_0001 LA_E_A1160049,LA_E_A2302299,LA_E_A2559248,LA_E_A2611901,LA_E_A3168085,LA_E_A3558732,LA_E_A3587142,LA_E_A3990835,LA_E_A4301313,LA_E_A4969429,LA_E_A5117382,LA_E_A5384841,LA_E_A5746825,LA_E_A7509220,LA_E_A7658339,LA_E_A7870154,LA_E_A8299065,LA_E_A8612771,LA_E_A8960118 19 | LA_0044 
LA_E_A1806024,LA_E_A2090824,LA_E_A2291355,LA_E_A2455422,LA_E_A3157518,LA_E_A3810036,LA_E_A4030688,LA_E_A4368613,LA_E_A5939534,LA_E_A6156462,LA_E_A6308549,LA_E_A6728526,LA_E_A6957322,LA_E_A7103478,LA_E_A7501718,LA_E_A7865122,LA_E_A8444324,LA_E_A9236430,LA_E_A9472289 20 | LA_0021 LA_E_A1257083,LA_E_A1365554,LA_E_A1582740,LA_E_A2130868,LA_E_A2199656,LA_E_A2834548,LA_E_A3176274,LA_E_A3291031,LA_E_A5097897,LA_E_A5759463,LA_E_A6646147,LA_E_A6854846,LA_E_A7005142,LA_E_A7542238,LA_E_A8132158,LA_E_A8533788,LA_E_A9316550,LA_E_A9431669,LA_E_A9803663 21 | LA_0025 LA_E_A1598214,LA_E_A1661584,LA_E_A1895385,LA_E_A2078766,LA_E_A3024856,LA_E_A3865801,LA_E_A4030769,LA_E_A4269240,LA_E_A4707130,LA_E_A5390276,LA_E_A6262999,LA_E_A6517068,LA_E_A6562971,LA_E_A6650474,LA_E_A6741138,LA_E_A6976406,LA_E_A7054590,LA_E_A7559134,LA_E_A8796096 22 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.12.1+cu113 3 | torchaudio==0.12.1+cu113 4 | numpy==1.24.4 5 | scipy 6 | scikit-learn 7 | tqdm 8 | pyyaml 9 | soundfile 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/scheduler/cosine_annealing_warmup_restarts.py: -------------------------------------------------------------------------------- 1 | # ref: https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup/blob/master/cosine_annealing_warmup/scheduler.py 2 | #! /usr/bin/python 3 | # -*- encoding: utf-8 -*- 4 | import math 5 | import torch 6 | from torch.optim.lr_scheduler import _LRScheduler 7 | 8 | class CosineAnnealingWarmupRestarts(_LRScheduler): 9 | def __init__(self, 10 | optimizer : torch.optim.Optimizer, 11 | first_cycle_steps : int, 12 | cycle_mult : float = 1., 13 | max_lr : float = 0.1, 14 | min_lr : float = 0.001, 15 | warmup_steps : int = 0, 16 | gamma : float = 1., 17 | last_epoch : int = -1 18 | ): 19 | assert warmup_steps < first_cycle_steps 20 | self.first_cycle_steps = first_cycle_steps # first cycle step size 21 | self.cycle_mult = cycle_mult # cycle steps magnification 22 | self.base_max_lr = max_lr # first max learning rate 23 | self.max_lr = max_lr # max learning rate in the current cycle 24 | self.min_lr = min_lr # min learning rate 25 | self.warmup_steps = warmup_steps # warmup step size 26 | self.gamma = gamma # decrease rate of max learning rate by cycle 27 | self.cur_cycle_steps = first_cycle_steps # first cycle step size 28 | self.cycle = 0 # cycle count 29 | self.step_in_cycle = last_epoch # step size of the current cycle 30 | super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch) 31 | self.init_lr() 32 | 33 | def init_lr(self): 34 | self.base_lrs = [] 35 | for param_group in self.optimizer.param_groups: 36 | param_group['lr'] = self.min_lr 37 | self.base_lrs.append(self.min_lr) 38 | 39 | def get_lr(self): 40 | if self.step_in_cycle == -1: 41 | return self.base_lrs 42 | elif self.step_in_cycle < self.warmup_steps: 43 | return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs] 44 | else: 45 | return [base_lr + (self.max_lr - base_lr) \ 46 | * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \ 47 | / (self.cur_cycle_steps - self.warmup_steps))) / 2 48 | for base_lr in self.base_lrs] 49 | 50 | def step(self, epoch=None): 51 | if epoch is None: 52 | epoch = 
self.last_epoch + 1 53 | self.step_in_cycle = self.step_in_cycle + 1 54 | if self.step_in_cycle >= self.cur_cycle_steps: 55 | self.cycle += 1 56 | self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps 57 | self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps 58 | else: 59 | if epoch >= self.first_cycle_steps: 60 | if self.cycle_mult == 1.: 61 | self.step_in_cycle = epoch % self.first_cycle_steps 62 | self.cycle = epoch // self.first_cycle_steps 63 | else: 64 | n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)) 65 | self.cycle = n 66 | self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)) 67 | self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n) 68 | else: 69 | self.cur_cycle_steps = self.first_cycle_steps 70 | self.step_in_cycle = epoch 71 | 72 | self.max_lr = self.base_max_lr * (self.gamma**self.cycle) 73 | self.last_epoch = math.floor(epoch) 74 | for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): 75 | param_group['lr'] = lr 76 | 77 | 78 | def Scheduler(optimizer, lr_t0, lr_tmul, lr_max, lr_min, lr_wstep, lr_gamma, **kwargs): 79 | sche_fn = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=lr_t0, cycle_mult=lr_tmul, max_lr=lr_max, min_lr=lr_min, warmup_steps=lr_wstep, gamma=lr_gamma) 80 | lr_step = 'epoch' 81 | print('Initialised CosineAnnealingWarmupRestarts scheduler') 82 | return sche_fn, lr_step 83 | #return sche_fn 84 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_dev.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_dev.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_eval.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_eval.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_trn.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_trn.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/tuneThreshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | import numpy 4 | from sklearn import metrics 5 | from operator import itemgetter 6 | 7 | def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None): 8 | 9 | fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) 10 | fnr = 1 - tpr 11 | 12 | tunedThreshold = [] 13 | if target_fr: 14 | for tfr in target_fr: 15 | idx = numpy.nanargmin(numpy.absolute((tfr - fnr))) 16 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]) 17 | 18 | for tfa in target_fa: 19 | idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1] 20 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]) 21 | 22 | idxE = 
numpy.nanargmin(numpy.absolute((fnr - fpr))) 23 | eer = max(fpr[idxE],fnr[idxE])*100 24 | 25 | return (tunedThreshold, eer, fpr, fnr) 26 | 27 | # Creates a list of false-negative rates, a list of false-positive rates 28 | # and a list of decision thresholds that give those error-rates. 29 | def ComputeErrorRates(scores, labels): 30 | 31 | # Sort the scores from smallest to largest, and also get the corresponding 32 | # indexes of the sorted scores. We will treat the sorted scores as the 33 | # thresholds at which the the error-rates are evaluated. 34 | sorted_indexes, thresholds = zip(*sorted( 35 | [(index, threshold) for index, threshold in enumerate(scores)], 36 | key=itemgetter(1))) 37 | sorted_labels = [] 38 | labels = [labels[i] for i in sorted_indexes] 39 | fnrs = [] 40 | fprs = [] 41 | 42 | # At the end of this loop, fnrs[i] is the number of errors made by 43 | # incorrectly rejecting scores less than thresholds[i]. And, fprs[i] 44 | # is the total number of times that we have correctly accepted scores 45 | # greater than thresholds[i]. 46 | for i in range(0, len(labels)): 47 | if i == 0: 48 | fnrs.append(labels[i]) 49 | fprs.append(1 - labels[i]) 50 | else: 51 | fnrs.append(fnrs[i-1] + labels[i]) 52 | fprs.append(fprs[i-1] + 1 - labels[i]) 53 | fnrs_norm = sum(labels) 54 | fprs_norm = len(labels) - fnrs_norm 55 | 56 | # Now divide by the total number of false negative errors to 57 | # obtain the false positive rates across all thresholds 58 | fnrs = [x / float(fnrs_norm) for x in fnrs] 59 | 60 | # Divide by the total number of corret positives to get the 61 | # true positive rate. Subtract these quantities from 1 to 62 | # get the false positive rates. 63 | fprs = [1 - x / float(fprs_norm) for x in fprs] 64 | return fnrs, fprs, thresholds 65 | 66 | # Computes the minimum of the detection cost function. The comments refer to 67 | # equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan. 68 | def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa): 69 | min_c_det = float("inf") 70 | min_c_det_threshold = thresholds[0] 71 | for i in range(0, len(fnrs)): 72 | # See Equation (2). it is a weighted sum of false negative 73 | # and false positive errors. 74 | c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target) 75 | if c_det < min_c_det: 76 | min_c_det = c_det 77 | min_c_det_threshold = thresholds[i] 78 | # See Equations (3) and (4). Now we normalize the cost. 79 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 80 | min_dcf = min_c_det / c_def 81 | return min_dcf, min_c_det_threshold --------------------------------------------------------------------------------
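To make the threshold-tuning utilities above concrete, the following is a minimal sketch of driving tuneThreshold.py end to end. The scores, labels, and operating-point parameters (a 1% false-acceptance target, p_target=0.05, unit costs) are illustrative assumptions, not values prescribed by the repository; in practice the scores would come from evaluating a trained model on a trial list.

# Run from stage3/ASVspoof2019/ so tuneThreshold.py is importable.
from tuneThreshold import tuneThresholdfromScore, ComputeErrorRates, ComputeMinDcf

# Toy trial scores with ground-truth labels (1 = target, 0 = non-target).
scores = [0.92, 0.81, 0.75, 0.63, 0.42, 0.31, 0.22, 0.10]
labels = [1, 1, 0, 1, 0, 1, 0, 0]

# EER (in percent) plus a threshold tuned for a 1% false-acceptance rate.
tuned, eer, fpr, fnr = tuneThresholdfromScore(scores, labels, target_fa=[0.01])
print('EER = {:.2f}%'.format(eer))

# minDCF at an assumed operating point (p_target, c_miss, c_fa).
fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
min_dcf, dcf_threshold = ComputeMinDcf(fnrs, fprs, thresholds,
                                       p_target=0.05, c_miss=1, c_fa=1)
print('minDCF = {:.4f} at threshold {:.3f}'.format(min_dcf, dcf_threshold))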