├── LICENSE ├── README.md ├── img └── overview.png ├── stage1 └── VoxCeleb2 │ ├── DatasetLoader.py │ ├── README.md │ ├── SpeakerNet.py │ ├── loss │ ├── aamsoftmax.py │ ├── aamsoftmaxproto.py │ ├── amsoftmax.py │ ├── angleproto.py │ ├── ge2e.py │ ├── proto.py │ ├── softmax.py │ ├── softmaxproto.py │ └── triplet.py │ ├── models │ ├── ECAPA_TDNN.py │ ├── MFA_Conformer.py │ ├── SKA_TDNN.py │ ├── specaugment.py │ └── wenet │ │ ├── bin │ │ ├── .train.py.swp │ │ ├── alignment.py │ │ ├── average_model.py │ │ ├── export_jit.py │ │ ├── recognize.py │ │ └── train.py │ │ ├── dataset │ │ ├── dataset.py │ │ ├── kaldi_io.py │ │ └── wav_distortion.py │ │ ├── transformer │ │ ├── __pycache__ │ │ │ ├── attention.cpython-36.pyc │ │ │ ├── attention.cpython-37.pyc │ │ │ ├── attention.cpython-39.pyc │ │ │ ├── convolution.cpython-36.pyc │ │ │ ├── convolution.cpython-37.pyc │ │ │ ├── convolution.cpython-39.pyc │ │ │ ├── embedding.cpython-36.pyc │ │ │ ├── embedding.cpython-37.pyc │ │ │ ├── embedding.cpython-39.pyc │ │ │ ├── encoder_cat.cpython-36.pyc │ │ │ ├── encoder_cat.cpython-37.pyc │ │ │ ├── encoder_cat.cpython-39.pyc │ │ │ ├── encoder_layer.cpython-36.pyc │ │ │ ├── encoder_layer.cpython-37.pyc │ │ │ ├── encoder_layer.cpython-39.pyc │ │ │ ├── positionwise_feed_forward.cpython-36.pyc │ │ │ ├── positionwise_feed_forward.cpython-37.pyc │ │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ │ ├── subsampling.cpython-36.pyc │ │ │ ├── subsampling.cpython-37.pyc │ │ │ ├── subsampling.cpython-39.pyc │ │ │ ├── swish.cpython-36.pyc │ │ │ ├── swish.cpython-37.pyc │ │ │ └── swish.cpython-39.pyc │ │ ├── asr_model.py │ │ ├── attention.py │ │ ├── cmvn.py │ │ ├── convolution.py │ │ ├── ctc.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_cat.py │ │ ├── encoder_layer.py │ │ ├── encoder_weight.py │ │ ├── label_smoothing_loss.py │ │ ├── positionwise_feed_forward.py │ │ ├── subsampling.py │ │ └── swish.py │ │ └── utils │ │ ├── __pycache__ │ │ ├── common.cpython-36.pyc │ │ ├── common.cpython-37.pyc │ │ ├── common.cpython-39.pyc │ │ ├── mask.cpython-36.pyc │ │ ├── mask.cpython-37.pyc │ │ └── mask.cpython-39.pyc │ │ ├── checkpoint.py │ │ ├── cmvn.py │ │ ├── common.py │ │ ├── ctc_util.py │ │ ├── executor.py │ │ ├── mask.py │ │ └── scheduler.py │ ├── optimizer │ ├── adam.py │ ├── adamP.py │ ├── adamW.py │ └── sgd.py │ ├── process_musan.py │ ├── requirements.txt │ ├── scheduler │ ├── cosine_annealing_warmup_restarts.py │ ├── cycliclr.py │ ├── exponentiallr.py │ └── steplr.py │ ├── trainSpeakerNet.py │ ├── tuneThreshold.py │ └── utils.py ├── stage2 └── README.md └── stage3 └── ASVspoof2019 ├── DatasetLoader.py ├── README.md ├── SASVNet.py ├── loss ├── aamsoftmax.py ├── angleproto_sasv.py └── sasv_e2e_v1.py ├── metrics.py ├── models ├── ECAPA_TDNN.py ├── MFA_Conformer.py ├── SKA_TDNN.py ├── specaugment.py └── wenet │ ├── bin │ ├── .train.py.swp │ ├── alignment.py │ ├── average_model.py │ ├── export_jit.py │ ├── recognize.py │ └── train.py │ ├── dataset │ ├── dataset.py │ ├── kaldi_io.py │ └── wav_distortion.py │ ├── transformer │ ├── __pycache__ │ │ ├── attention.cpython-36.pyc │ │ ├── attention.cpython-37.pyc │ │ ├── attention.cpython-39.pyc │ │ ├── convolution.cpython-36.pyc │ │ ├── convolution.cpython-37.pyc │ │ ├── convolution.cpython-39.pyc │ │ ├── embedding.cpython-36.pyc │ │ ├── embedding.cpython-37.pyc │ │ ├── embedding.cpython-39.pyc │ │ ├── encoder_cat.cpython-36.pyc │ │ ├── encoder_cat.cpython-37.pyc │ │ ├── encoder_cat.cpython-39.pyc │ │ ├── encoder_layer.cpython-36.pyc │ 
│ ├── encoder_layer.cpython-37.pyc │ │ ├── encoder_layer.cpython-39.pyc │ │ ├── positionwise_feed_forward.cpython-36.pyc │ │ ├── positionwise_feed_forward.cpython-37.pyc │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ ├── subsampling.cpython-36.pyc │ │ ├── subsampling.cpython-37.pyc │ │ ├── subsampling.cpython-39.pyc │ │ ├── swish.cpython-36.pyc │ │ ├── swish.cpython-37.pyc │ │ └── swish.cpython-39.pyc │ ├── asr_model.py │ ├── attention.py │ ├── cmvn.py │ ├── convolution.py │ ├── ctc.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_cat.py │ ├── encoder_layer.py │ ├── encoder_weight.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── subsampling.py │ └── swish.py │ └── utils │ ├── __pycache__ │ ├── common.cpython-36.pyc │ ├── common.cpython-37.pyc │ ├── common.cpython-39.pyc │ ├── mask.cpython-36.pyc │ ├── mask.cpython-37.pyc │ └── mask.cpython-39.pyc │ ├── checkpoint.py │ ├── cmvn.py │ ├── common.py │ ├── ctc_util.py │ ├── executor.py │ ├── mask.py │ └── scheduler.py ├── optimizer ├── adam.py ├── adamP.py ├── adamW.py └── sgd.py ├── protocols ├── ASVspoof2019.LA.asv.dev.gi.trl.txt ├── ASVspoof2019.LA.asv.eval.female.trn.txt ├── ASVspoof2019.LA.asv.eval.gi.trl.txt ├── ASVspoof2019.LA.asv.eval.male.trn.txt ├── ASVspoof2019.LA.cm.dev.trl.txt ├── ASVspoof2019.LA.cm.eval.trl.txt ├── ASVspoof2019.LA.cm.train.trn.txt └── ASVspoof2019.LA.cm.train_dev.trn.txt ├── requirements.txt ├── scheduler └── cosine_annealing_warmup_restarts.py ├── spk_meta ├── spk_meta_dev.pk ├── spk_meta_eval.pk └── spk_meta_trn.pk ├── trainSASVNet.py ├── tuneThreshold.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 sasv-challenge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /img/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/img/overview.png -------------------------------------------------------------------------------- /stage1/VoxCeleb2/README.md: -------------------------------------------------------------------------------- 1 | # Stage 1 2 | 3 | This repository is built on the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) and [ska-tdnn](https://github.com/msh9184/ska-tdnn) repositories. 4 | 5 | ## Dependencies 6 | If you use an Anaconda virtual environment, 7 | ``` 8 | conda create -n sasv python=3.9 cudatoolkit=11.3 9 | conda activate sasv 10 | ``` 11 | Then install all dependency packages: 12 | ``` 13 | pip3 install -r requirements.txt 14 | ``` 15 | 16 | 17 | ## Data Preparation 18 | The [VoxCeleb](https://mm.kaist.ac.kr/datasets/voxceleb/) datasets are used for these experiments. 19 | The train list should contain the file path and the speaker identity, for instance: 20 | ``` 21 | id00012/21Uxsk56VDQ/00001.wav id00012 22 | id00012/21Uxsk56VDQ/00002.wav id00012 23 | ... 24 | id09272/u7VNkYraCw0/00026.wav id09272 25 | id09272/u7VNkYraCw0/00027.wav id09272 26 | ``` 27 | An example train list for VoxCeleb2 and the test lists for VoxCeleb1-O, VoxCeleb1-E, and VoxCeleb1-H can be downloaded from [train_vox2.txt](https://drive.google.com/file/d/1Y6yjKDULxJ40mhLzeKUzkeAvqNlP0tzX/view?usp=sharing), [veri_test2.txt](https://drive.google.com/file/d/1EUDR5oCPC-zOexhLBHbFQpdnw1IRWq-B/view?usp=sharing), [list_test_all2](https://drive.google.com/file/d/1BgnEugORlSPsi4ZpTjTayAGPqyWTm7S8/view?usp=sharing), and [list_test_hard2](https://drive.google.com/file/d/1p-gbPbDK4dy_SvSRWZ3KP17iZdHqjHQ4/view?usp=sharing), respectively. You can also follow the instructions in the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) repository for downloading and preparing the training, augmentation, and evaluation data. 28 | 29 | For data augmentation by noise addition, you can download the [MUSAN noise corpus](https://www.openslr.org/17/). 30 | After downloading and extracting the files, you can split the audio into short segments for faster random access with the following command: 31 | ```bash 32 | python process_musan.py /path/to/dataset/MUSAN 33 | ``` 34 | where `/path/to/dataset/MUSAN` is your path to the MUSAN corpus. 35 | 36 | For data augmentation by convolution with simulated RIRs, you can download the [Room Impulse Response and Noise Database](https://www.openslr.org/28/). 37 | 38 | 39 | ## Models 40 | Three models are included in this repository. You can select one with the `--model` option: 41 | ``` 42 | ECAPA_TDNN [1] 43 | MFA_Conformer [2] 44 | SKA_TDNN [3] 45 | ``` 46 | [1] B. Desplanques, J. Thienpondt, and K. Demuynck, "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification," in *Proc. INTERSPEECH*, 2020, pp. 3707-3711. 47 | 48 | [2] Y. Zhang, Z. Lv, H. Wu, S. Zhang, P. Hu, Z. Wu, H. Lee, and H. Meng, "MFA-Conformer: Multi-scale Feature Aggregation Conformer for Automatic Speaker Verification," in *Proc. INTERSPEECH*, 2022. 49 | 50 | [3] S. H. Mun, J. Jung, M. H. Han, and N. S. Kim, "Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification," in *Proc. IEEE SLT*, 2022.
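For a quick standalone sanity check outside the training script, the sketch below shows one way an encoder can be instantiated directly through its `MainModel` factory (here `models/ECAPA_TDNN.py`; the other models expose the same factory). This is a minimal, illustrative sketch only: it assumes you run it from `stage1/VoxCeleb2` so that `models` and `utils` are importable, and it feeds random 16 kHz waveforms just to check shapes; actual training and evaluation go through `trainSpeakerNet.py` as shown below.

```python
# Minimal sketch (not part of the training pipeline): build an encoder and
# extract speaker embeddings from raw 16 kHz waveforms.
# Assumes execution from stage1/VoxCeleb2 so that `models` and `utils` resolve.
import torch
from models.ECAPA_TDNN import MainModel

# Default configuration: C=1024 channels, Res2Net scale 8, 192-dim embeddings.
model = MainModel(eca_c=1024, eca_s=8, num_mels=80, num_out=192).eval()

# A batch of four 2-second utterances at 16 kHz (random data for illustration);
# fbank extraction and mean normalisation happen inside the model's forward pass.
wav = torch.randn(4, 32000)

with torch.no_grad():
    emb = model(wav, aug=False)   # -> (4, 192) speaker embeddings

print(emb.shape)
```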
51 | 52 | 53 | ## Training 54 | Distributed Data Parallel (DDP) training example: SKA_TDNN with a vanilla cosine similarity (COS) evaluation every epoch: 55 | ``` 56 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 57 | --max_frames 200 \ 58 | --eval_frames 0 \ 59 | --num_eval 1 \ 60 | --num_spk 100 \ 61 | --num_utt 2 \ 62 | --augment True \ 63 | --optimizer adamW \ 64 | --scheduler cosine_annealing_warmup_restarts \ 65 | --lr_t0 25 \ 66 | --lr_tmul 1.0 \ 67 | --lr_max 1e-3 \ 68 | --lr_min 1e-8 \ 69 | --lr_wstep 10 \ 70 | --lr_gamma 0.5 \ 71 | --margin 0.2 \ 72 | --scale 30 \ 73 | --num_class 5994 \ 74 | --save_path ./save/ska_tdnn \ 75 | --train_list ./list/train_vox2.txt \ 76 | --test_list ./list/veri_test2.txt \ 77 | --train_path /path/to/dataset/VoxCeleb2/dev/wav \ 78 | --test_path /path/to/dataset/VoxCeleb1/test/wav \ 79 | --musan_path /path/to/dataset/MUSAN/musan_split \ 80 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 81 | --model SKA_TDNN \ 82 | --port 8000 \ 83 | --distributed 84 | ``` 85 | 86 | ## Evaluation 87 | Evaluation example using vanilla cosine similarity (COS) on VoxCeleb1-O: 88 | ``` 89 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 90 | --eval \ 91 | --eval_frames 0 \ 92 | --num_eval 1 \ 93 | --initial_model ./save/model/your_model.model \ 94 | --test_list ./list/veri_test2.txt \ 95 | --test_path /path/to/dataset/VoxCeleb1/test/wav \ 96 | --model SKA_TDNN \ 97 | --port 8001 \ 98 | --distributed 99 | ``` 100 | Evaluation example using Test Time Augmentation (TTA) on VoxCeleb1-E: 101 | ``` 102 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 103 | --eval \ 104 | --tta \ 105 | --eval_frames 400 \ 106 | --num_eval 10 \ 107 | --initial_model ./save/model/your_model.model \ 108 | --test_list ./list/list_test_all2 \ 109 | --test_path /path/to/dataset/VoxCeleb1/all/wav \ 110 | --model SKA_TDNN \ 111 | --port 8002 \ 112 | --distributed 113 | ``` 114 | Evaluation example using Score Normalisation (SN) on VoxCeleb1-H: 115 | ``` 116 | CUDA_VISIBLE_DEVICES=0,1,2,3 python trainSpeakerNet.py \ 117 | --eval \ 118 | --score_norm \ 119 | --type_coh utt \ 120 | --top_coh_size 20000 \ 121 | --eval_frames 0 \ 122 | --num_eval 1 \ 123 | --initial_model ./save/model/your_model.model \ 124 | --train_list ./list/train_vox2.txt \ 125 | --test_list ./list/list_test_hard2 \ 126 | --train_path /path/to/dataset/VoxCeleb2/dev/wav \ 127 | --test_path /path/to/dataset/VoxCeleb1/all/wav \ 128 | --model SKA_TDNN \ 129 | --port 8003 \ 130 | --distributed 131 | ``` 132 | 133 | 134 | ## Citation 135 | If you utilize this repository, please cite the following papers: 136 | ``` 137 | @inproceedings{chung2020in, 138 | title={In defence of metric learning for speaker recognition}, 139 | author={Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang}, 140 | booktitle={Proc. Interspeech}, 141 | year={2020} 142 | } 143 | ``` 144 | 145 | ``` 146 | @inproceedings{jung2022pushing, 147 | title={Pushing the limits of raw waveform speaker recognition}, 148 | author={Jung, Jee-weon and Kim, You Jin and Heo, Hee-Soo and Lee, Bong-Jin and Kwon, Youngki and Chung, Joon Son}, 149 | booktitle={Proc.
Interspeech}, 150 | year={2022} 151 | } 152 | ``` 153 | 154 | ``` 155 | @inproceedings{mun2022frequency, 156 | title={Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification}, 157 | author={Mun, Sung Hwan and Jung, Jee-weon and Han, Min Hyun and Kim, Nam Soo}, 158 | booktitle={Proc. IEEE SLT}, 159 | year={2022} 160 | } 161 | ``` -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy, math 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, num_out, num_class, margin=0.3, scale=15, easy_margin=False, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = num_out 20 | self.weight = torch.nn.Parameter(torch.FloatTensor(num_class, num_out), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.weight, gain=1) 23 | 24 | self.easy_margin = easy_margin 25 | self.cos_m = math.cos(self.m) 26 | self.sin_m = math.sin(self.m) 27 | 28 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 29 | self.th = math.cos(math.pi - self.m) 30 | self.mm = math.sin(math.pi - self.m) * self.m 31 | 32 | print('Initialised AAMSoftmax margin %.3f scale %.3f'%(self.m,self.s)) 33 | 34 | def forward(self, x, label=None): 35 | 36 | assert x.size()[0] == label.size()[0] 37 | assert x.size()[1] == self.in_feats 38 | 39 | # cos(theta) 40 | cosine = F.linear(F.normalize(x), F.normalize(self.weight)) 41 | # cos(theta + m) 42 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 43 | phi = cosine * self.cos_m - sine * self.sin_m 44 | 45 | if self.easy_margin: 46 | phi = torch.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 49 | 50 | #one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 51 | one_hot = torch.zeros_like(cosine) 52 | one_hot.scatter_(1, label.view(-1, 1), 1) 53 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 54 | output = output * self.s 55 | 56 | loss = self.ce(output, label) 57 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 58 | return loss, prec1 59 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/aamsoftmaxproto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.aamsoftmax as aamsoftmax 7 | import loss.angleproto as angleproto 8 | 9 | class LossFunction(nn.Module): 10 | def __init__(self, **kwargs): 11 | super(LossFunction, self).__init__() 12 | self.test_normalize = True 13 | self.aamsoftmax = aamsoftmax.LossFunction(**kwargs) 14 | self.angleproto = angleproto.LossFunction(**kwargs) 15 | print('Initialised AAMSoftmaxPrototypicalLoss') 16 | 17 | def forward(self, x, label=None): 18 | assert x.size()[1] == 2 19 | nlossS, prec1 = self.aamsoftmax(x.reshape(-1,x.size()[-1]), label.repeat_interleave(2)) 20 | nlossM, _ = self.angleproto(x,None) 21 | return nlossS+nlossM, prec1 22 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/amsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/CoinCheung/pytorch-loss (MIT License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, nOut, nClasses, margin=0.3, scale=15, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = nOut 20 | self.W = torch.nn.Parameter(torch.randn(nOut, nClasses), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.W, gain=1) 23 | 24 | print('Initialised AMSoftmax m=%.3f s=%.3f'%(self.m,self.s)) 25 | 26 | def forward(self, x, label=None): 27 | 28 | assert x.size()[0] == label.size()[0] 29 | assert x.size()[1] == self.in_feats 30 | 31 | x_norm = torch.norm(x, p=2, dim=1, keepdim=True).clamp(min=1e-12) 32 | x_norm = torch.div(x, x_norm) 33 | w_norm = torch.norm(self.W, p=2, dim=0, keepdim=True).clamp(min=1e-12) 34 | w_norm = torch.div(self.W, w_norm) 35 | costh = torch.mm(x_norm, w_norm) 36 | label_view = label.view(-1, 1) 37 | if label_view.is_cuda: label_view = label_view.cpu() 38 | delt_costh = torch.zeros(costh.size()).scatter_(1, label_view, self.m) 39 | if x.is_cuda: delt_costh = delt_costh.cuda() 40 | costh_m = costh - delt_costh 41 | costh_m_s = self.s * costh_m 42 | loss = self.ce(costh_m_s, label) 43 | prec1 = accuracy(costh_m_s.detach(), label.detach(), topk=(1,))[0] 44 | return loss, prec1 45 | 46 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/angleproto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | 12 | def __init__(self, init_w=10.0, init_b=-5.0, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.w = nn.Parameter(torch.tensor(init_w)) 18 | self.b = nn.Parameter(torch.tensor(init_b)) 19 | self.criterion = torch.nn.CrossEntropyLoss() 20 | 21 | print('Initialised AngleProto') 22 | 23 | def forward(self, x, label=None): 24 | 25 | assert x.size()[1] >= 2 26 | 27 | out_anchor = torch.mean(x[:,1:,:],1) 28 | out_positive = x[:,0,:] 29 | stepsize = out_anchor.size()[0] 30 | 31 | cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2)) 32 | torch.clamp(self.w, 1e-6) 33 | cos_sim_matrix = cos_sim_matrix * self.w + self.b 34 | 35 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 36 | nloss = self.criterion(cos_sim_matrix, label) 37 | prec1 = accuracy(cos_sim_matrix.detach(), label.detach(), topk=(1,))[0] 38 | 39 | return nloss, prec1 40 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/ge2e.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | ## Fast re-implementation of the GE2E loss (https://arxiv.org/abs/1710.10467) 4 | ## Numerically checked against https://github.com/cvqluu/GE2E-Loss 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import time, pdb, numpy 10 | from utils import accuracy 11 | 12 | class LossFunction(nn.Module): 13 | 14 | def __init__(self, init_w=10.0, init_b=-5.0, **kwargs): 15 | super(LossFunction, self).__init__() 16 | 17 | self.test_normalize = True 18 | 19 | self.w = nn.Parameter(torch.tensor(init_w)) 20 | self.b = nn.Parameter(torch.tensor(init_b)) 21 | self.criterion = torch.nn.CrossEntropyLoss() 22 | 23 | print('Initialised GE2E') 24 | 25 | def forward(self, x, label=None): 26 | 27 | assert x.size()[1] >= 2 28 | 29 | gsize = x.size()[1] 30 | centroids = torch.mean(x, 1) 31 | stepsize = x.size()[0] 32 | 33 | cos_sim_matrix = [] 34 | 35 | for ii in range(0,gsize): 36 | idx = [*range(0,gsize)] 37 | idx.remove(ii) 38 | exc_centroids = torch.mean(x[:,idx,:], 1) 39 | cos_sim_diag = F.cosine_similarity(x[:,ii,:],exc_centroids) 40 | cos_sim = F.cosine_similarity(x[:,ii,:].unsqueeze(-1),centroids.unsqueeze(-1).transpose(0,2)) 41 | cos_sim[range(0,stepsize),range(0,stepsize)] = cos_sim_diag 42 | cos_sim_matrix.append(torch.clamp(cos_sim,1e-6)) 43 | 44 | cos_sim_matrix = torch.stack(cos_sim_matrix,dim=1) 45 | 46 | torch.clamp(self.w, 1e-6) 47 | cos_sim_matrix = cos_sim_matrix * self.w + self.b 48 | 49 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 50 | nloss = self.criterion(cos_sim_matrix.view(-1,stepsize), torch.repeat_interleave(label,repeats=gsize,dim=0).cuda()) 51 | prec1 = accuracy(cos_sim_matrix.view(-1,stepsize).detach(), torch.repeat_interleave(label,repeats=gsize,dim=0).detach(), topk=(1,))[0] 52 | 53 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/proto.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | ## Re-implementation of prototypical networks (https://arxiv.org/abs/1703.05175). 4 | ## Numerically checked against https://github.com/cyvius96/prototypical-network-pytorch 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import time, pdb, numpy 10 | from utils import accuracy 11 | 12 | class LossFunction(nn.Module): 13 | 14 | def __init__(self, **kwargs): 15 | super(LossFunction, self).__init__() 16 | 17 | self.test_normalize = False 18 | 19 | self.criterion = torch.nn.CrossEntropyLoss() 20 | 21 | print('Initialised Prototypical Loss') 22 | 23 | def forward(self, x, label=None): 24 | 25 | assert x.size()[1] >= 2 26 | 27 | out_anchor = torch.mean(x[:,1:,:],1) 28 | out_positive = x[:,0,:] 29 | stepsize = out_anchor.size()[0] 30 | 31 | output = -1 * (F.pairwise_distance(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))**2) 32 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 33 | nloss = self.criterion(output, label) 34 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 35 | 36 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/softmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | def __init__(self, nOut, nClasses, **kwargs): 12 | super(LossFunction, self).__init__() 13 | 14 | self.test_normalize = True 15 | 16 | self.criterion = torch.nn.CrossEntropyLoss() 17 | self.fc = nn.Linear(nOut,nClasses) 18 | 19 | print('Initialised Softmax Loss') 20 | 21 | def forward(self, x, label=None): 22 | 23 | x = self.fc(x) 24 | nloss = self.criterion(x, label) 25 | prec1 = accuracy(x.detach(), label.detach(), topk=(1,))[0] 26 | 27 | return nloss, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/softmaxproto.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.softmax as softmax 7 | import loss.angleproto as angleproto 8 | 9 | class LossFunction(nn.Module): 10 | 11 | def __init__(self, **kwargs): 12 | super(LossFunction, self).__init__() 13 | 14 | self.test_normalize = True 15 | 16 | self.softmax = softmax.LossFunction(**kwargs) 17 | self.angleproto = angleproto.LossFunction(**kwargs) 18 | 19 | print('Initialised SoftmaxPrototypical Loss') 20 | 21 | def forward(self, x, label=None): 22 | 23 | assert x.size()[1] == 2 24 | 25 | nlossS, prec1 = self.softmax(x.reshape(-1,x.size()[-1]), label.repeat_interleave(2)) 26 | 27 | nlossP, _ = self.angleproto(x,None) 28 | 29 | return nlossS+nlossP, prec1 -------------------------------------------------------------------------------- /stage1/VoxCeleb2/loss/triplet.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from tuneThreshold import tuneThresholdfromScore 9 | import random 10 | 11 | class LossFunction(nn.Module): 12 | 13 | def __init__(self, hard_rank=0, hard_prob=0, margin=0, **kwargs): 14 | super(LossFunction, self).__init__() 15 | 16 | self.test_normalize = True 17 | 18 | self.hard_rank = hard_rank 19 | self.hard_prob = hard_prob 20 | self.margin = margin 21 | 22 | print('Initialised Triplet Loss') 23 | 24 | def forward(self, x, label=None): 25 | 26 | assert x.size()[1] == 2 27 | 28 | out_anchor = F.normalize(x[:,0,:], p=2, dim=1) 29 | out_positive = F.normalize(x[:,1,:], p=2, dim=1) 30 | stepsize = out_anchor.size()[0] 31 | 32 | output = -1 * (F.pairwise_distance(out_anchor.unsqueeze(-1),out_positive.unsqueeze(-1).transpose(0,2))**2) 33 | 34 | negidx = self.mineHardNegative(output.detach()) 35 | 36 | out_negative = out_positive[negidx,:] 37 | 38 | labelnp = numpy.array([1]*len(out_positive)+[0]*len(out_negative)) 39 | 40 | ## calculate distances 41 | pos_dist = F.pairwise_distance(out_anchor,out_positive) 42 | neg_dist = F.pairwise_distance(out_anchor,out_negative) 43 | 44 | ## loss function 45 | nloss = torch.mean(F.relu(torch.pow(pos_dist, 2) - torch.pow(neg_dist, 2) + self.margin)) 46 | 47 | scores = -1 * torch.cat([pos_dist,neg_dist],dim=0).detach().cpu().numpy() 48 | 49 | errors = tuneThresholdfromScore(scores, labelnp, []); 50 | 51 | return nloss, errors[1] 52 | 53 | ## ===== ===== ===== ===== ===== ===== ===== ===== 54 | ## Hard negative mining 55 | ## ===== ===== ===== ===== ===== ===== ===== ===== 56 | 57 | def mineHardNegative(self, output): 58 | 59 | negidx = [] 60 | 61 | for idx, similarity in enumerate(output): 62 | 63 | simval, simidx = torch.sort(similarity,descending=True) 64 | 65 | if self.hard_rank < 0: 66 | 67 | ## Semi hard negative mining 68 | 69 | semihardidx = simidx[(similarity[idx] - self.margin < simval) & (simval < similarity[idx])] 70 | 71 | if len(semihardidx) == 0: 72 | negidx.append(random.choice(simidx)) 73 | else: 74 | negidx.append(random.choice(semihardidx)) 75 | 76 | else: 77 | 78 | ## Rank based negative mining 79 | 80 | simidx = simidx[simidx!=idx] 81 | 82 | if random.random() < self.hard_prob: 83 | negidx.append(simidx[random.randint(0, self.hard_rank)]) 84 | else: 85 | negidx.append(random.choice(simidx)) 86 | 87 | return negidx -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/ECAPA_TDNN.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | ## Here, log_input forces alternative mfcc implementation with pre-emphasis instead of actual log mfcc 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torchaudio 11 | import pdb 12 | from utils import PreEmphasis 13 | 14 | class SEModule(nn.Module): 15 | def __init__(self, channels, bottleneck=128): 16 | super(SEModule, self).__init__() 17 | self.se = nn.Sequential( 18 | nn.AdaptiveAvgPool1d(1), 19 | nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0), 20 | nn.ReLU(), 21 | #nn.BatchNorm1d(bottleneck), 22 | nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0), 23 | nn.Sigmoid(), 24 | ) 25 | 26 | def forward(self, input): 27 | x = self.se(input) 28 | return input * x 29 | 30 | class Bottle2neck(nn.Module): 31 | 32 | def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8): 33 | super(Bottle2neck, self).__init__() 34 | width = int(math.floor(planes / scale)) 35 | self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1) 36 | self.bn1 = nn.BatchNorm1d(width*scale) 37 | self.nums = scale -1 38 | convs = [] 39 | bns = [] 40 | num_pad = math.floor(kernel_size/2)*dilation 41 | for i in range(self.nums): 42 | convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad)) 43 | bns.append(nn.BatchNorm1d(width)) 44 | self.convs = nn.ModuleList(convs) 45 | self.bns = nn.ModuleList(bns) 46 | self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1) 47 | self.bn3 = nn.BatchNorm1d(planes) 48 | self.relu = nn.ReLU() 49 | self.width = width 50 | self.se = SEModule(planes) 51 | 52 | def forward(self, x): 53 | residual = x 54 | out = self.conv1(x) 55 | out = self.relu(out) 56 | out = self.bn1(out) 57 | 58 | spx = torch.split(out, self.width, 1) 59 | for i in range(self.nums): 60 | if i==0: 61 | sp = spx[i] 62 | else: 63 | sp = sp + spx[i] 64 | sp = self.convs[i](sp) 65 | sp = self.relu(sp) 66 | sp = self.bns[i](sp) 67 | if i==0: 68 | out = sp 69 | else: 70 | out = torch.cat((out, sp), 1) 71 | out = torch.cat((out, spx[self.nums]),1) 72 | 73 | out = self.conv3(out) 74 | out = self.relu(out) 75 | out = self.bn3(out) 76 | 77 | out = self.se(out) 78 | out += residual 79 | return out 80 | 81 | class FbankAug(nn.Module): 82 | 83 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 84 | self.time_mask_width = time_mask_width 85 | self.freq_mask_width = freq_mask_width 86 | super().__init__() 87 | 88 | def mask_along_axis(self, x, dim): 89 | original_size = x.shape 90 | batch, fea, time = x.shape 91 | if dim == 1: 92 | D = fea 93 | width_range = self.freq_mask_width 94 | else: 95 | D = time 96 | width_range = self.time_mask_width 97 | 98 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 99 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 100 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 101 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 102 | mask = mask.any(dim=1) 103 | 104 | if dim == 1: 105 | mask = mask.unsqueeze(2) 106 | else: 107 | mask = mask.unsqueeze(1) 108 | 109 | x = x.masked_fill_(mask, 0.0) 110 | return x.view(*original_size) 111 | 112 | def forward(self, x): 113 | x = self.mask_along_axis(x, dim=2) 114 | x = self.mask_along_axis(x, dim=1) 115 | return x 116 | 117 | class ECAPA_TDNN(nn.Module): 118 | def __init__(self, block, C, model_scale, log_input=True, 
num_mels=80, num_out=192, **kwargs): 119 | self.log_input = log_input 120 | super(ECAPA_TDNN, self).__init__() 121 | self.scale = model_scale 122 | self.conv1 = nn.Conv1d(num_mels, C, kernel_size=5, stride=1, padding=2) 123 | self.relu = nn.ReLU() 124 | self.bn1 = nn.BatchNorm1d(C) 125 | self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale) 126 | self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale) 127 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale) 128 | self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1) 129 | self.attention = nn.Sequential( 130 | nn.Conv1d(4608, 256, kernel_size=1), 131 | nn.ReLU(), 132 | nn.BatchNorm1d(256), 133 | nn.Tanh(), 134 | nn.Conv1d(256, 1536, kernel_size=1), 135 | nn.Softmax(dim=2), 136 | ) 137 | self.torchfbank = torch.nn.Sequential( 138 | PreEmphasis(), 139 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 140 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=num_mels), 141 | ) 142 | self.specaug = FbankAug() 143 | self.bn5 = nn.BatchNorm1d(3072) 144 | self.fc6 = nn.Linear(3072, num_out) 145 | self.bn6 = nn.BatchNorm1d(num_out) 146 | 147 | def forward(self, x, aug): 148 | with torch.no_grad(): 149 | with torch.cuda.amp.autocast(enabled=False): 150 | x = self.torchfbank(x)+1e-6 151 | if self.log_input: 152 | x = x.log() 153 | x = x - torch.mean(x, dim=-1, keepdim=True) 154 | if aug == True: 155 | x = self.specaug(x) 156 | x = self.conv1(x) 157 | x = self.relu(x) 158 | x = self.bn1(x) 159 | x1 = self.layer1(x) 160 | x2 = self.layer2(x+x1) 161 | x3 = self.layer3(x+x1+x2) 162 | x = self.layer4(torch.cat((x1,x2,x3),dim=1)) 163 | x = self.relu(x) 164 | t = x.size()[-1] 165 | global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1) 166 | w = self.attention(global_x) 167 | mu = torch.sum(x * w, dim=2) 168 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 169 | x = torch.cat((mu,sg),1) 170 | x = self.bn5(x) 171 | x = self.fc6(x) 172 | x = self.bn6(x) 173 | return x 174 | 175 | def MainModel(eca_c=1024, eca_s=8, log_input=True, num_mels=80, num_out=192, **kwargs): 176 | model = ECAPA_TDNN(block=Bottle2neck, C=eca_c, model_scale=eca_s, log_input=log_input, num_mels=num_mels, num_out=num_out, **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/MFA_Conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchaudio 4 | from torch import Tensor 5 | from typing import Tuple 6 | from utils import PreEmphasis 7 | from .specaugment import SpecAugment 8 | from .wenet.transformer.encoder_cat import ConformerEncoder 9 | 10 | class Conformer(nn.Module): 11 | def __init__(self, num_mels=80, num_blocks=6, output_size=256, embedding_dim=192, input_layer="conv2d2", pos_enc_layer_type="rel_pos"): 12 | super(Conformer, self).__init__() 13 | print("input_layer: {}".format(input_layer)) 14 | print("pos_enc_layer_type: {}".format(pos_enc_layer_type)) 15 | self.conformer = ConformerEncoder(input_size=num_mels, num_blocks=num_blocks, output_size=output_size, input_layer=input_layer, pos_enc_layer_type=pos_enc_layer_type, ) 16 | self.bn = nn.BatchNorm1d(output_size*num_blocks*2) 17 | self.fc = nn.Linear(output_size*num_blocks*2, embedding_dim) 18 | 19 | 
self.specaug = SpecAugment() 20 | self.torchfbank = torch.nn.Sequential( 21 | PreEmphasis(), 22 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 23 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80), 24 | ) 25 | output_dim = output_size*num_blocks 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(output_dim*3, 256, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(256), 30 | nn.Tanh(), 31 | nn.Conv1d(256, output_dim, kernel_size=1), 32 | nn.Softmax(dim=2), 33 | ) 34 | 35 | def forward(self, x: Tensor, aug=False) -> Tuple[Tensor, bool]: 36 | 37 | with torch.no_grad(): 38 | with torch.cuda.amp.autocast(enabled=False): 39 | x = self.torchfbank(x)+1e-6 40 | x = x.log() 41 | x = x - torch.mean(x, dim=-1, keepdim=True) 42 | if aug == True: 43 | x = self.specaug(x) 44 | x = x.transpose(1,2) 45 | lens = torch.ones(x.shape[0]).to(x.device) 46 | lens = torch.round(lens*x.shape[1]).int() 47 | x, masks = self.conformer(x, lens) 48 | x = x.transpose(1,2) 49 | 50 | # Context dependent ASP 51 | t = x.size()[-1] 52 | global_x = torch.cat((x,torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1) 53 | w = self.attention(global_x) 54 | mu = torch.sum(x * w, dim=2) 55 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 56 | x = torch.cat((mu, sg), dim=1) 57 | 58 | # BN -> FC: embedding 59 | x = self.bn(x) 60 | x = self.fc(x) 61 | 62 | return x 63 | 64 | def MainModel(num_mels=80, num_out=192, **kwargs): 65 | model = Conformer(num_mels=num_mels, embedding_dim=num_out, input_layer="conv2d2") 66 | return model 67 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/specaugment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class SpecAugment(nn.Module): 5 | 6 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 7 | self.time_mask_width = time_mask_width 8 | self.freq_mask_width = freq_mask_width 9 | super(SpecAugment, self).__init__() 10 | 11 | def mask_along_axis(self, x, dim): 12 | original_size = x.shape 13 | batch, fea, time = x.shape 14 | if dim == 1: 15 | D = fea 16 | width_range = self.freq_mask_width 17 | else: 18 | D = time 19 | width_range = self.time_mask_width 20 | 21 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 22 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 23 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 24 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 25 | mask = mask.any(dim=1) 26 | 27 | if dim == 1: 28 | mask = mask.unsqueeze(2) 29 | else: 30 | mask = mask.unsqueeze(1) 31 | 32 | x = x.masked_fill_(mask, 0.0) 33 | return x.view(*original_size) 34 | 35 | def forward(self, x): 36 | x = self.mask_along_axis(x, dim=2) 37 | x = self.mask_along_axis(x, dim=1) 38 | return x 39 | 40 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/.train.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/bin/.train.py.swp 
-------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: di.wu@mobvoi.com (DI WU) 3 | import os 4 | import argparse 5 | import glob 6 | 7 | import yaml 8 | import numpy as np 9 | import torch 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='average model') 13 | parser.add_argument('--dst_model', required=True, help='averaged model') 14 | parser.add_argument('--src_path', 15 | required=True, 16 | help='src model path for average') 17 | parser.add_argument('--val_best', 18 | action="store_true", 19 | help='averaged model') 20 | parser.add_argument('--num', 21 | default=5, 22 | type=int, 23 | help='nums for averaged model') 24 | parser.add_argument('--min_epoch', 25 | default=0, 26 | type=int, 27 | help='min epoch used for averaging model') 28 | parser.add_argument('--max_epoch', 29 | default=65536, # Big enough 30 | type=int, 31 | help='max epoch used for averaging model') 32 | 33 | args = parser.parse_args() 34 | print(args) 35 | checkpoints = [] 36 | val_scores = [] 37 | if args.val_best: 38 | yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) 39 | for y in yamls: 40 | with open(y, 'r') as f: 41 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 42 | loss = dic_yaml['cv_loss'] 43 | epoch = dic_yaml['epoch'] 44 | if epoch >= args.min_epoch and epoch <= args.max_epoch: 45 | val_scores += [[epoch, loss]] 46 | val_scores = np.array(val_scores) 47 | sort_idx = np.argsort(val_scores[:, -1]) 48 | sorted_val_scores = val_scores[sort_idx][::1] 49 | print("best val scores = " + str(sorted_val_scores[:args.num, 1])) 50 | print("selected epochs = " + 51 | str(sorted_val_scores[:args.num, 0].astype(np.int64))) 52 | path_list = [ 53 | args.src_path + '/{}.pt'.format(int(epoch)) 54 | for epoch in sorted_val_scores[:args.num, 0] 55 | ] 56 | else: 57 | path_list = glob.glob('{}/[!avg][!final]*.pt'.format(args.src_path)) 58 | path_list = sorted(path_list, key=os.path.getmtime) 59 | path_list = path_list[-args.num:] 60 | print(path_list) 61 | avg = None 62 | num = args.num 63 | assert num == len(path_list) 64 | for path in path_list: 65 | print('Processing {}'.format(path)) 66 | states = torch.load(path, map_location=torch.device('cpu')) 67 | if avg is None: 68 | avg = states 69 | else: 70 | for k in avg.keys(): 71 | avg[k] += states[k] 72 | # average 73 | for k in avg.keys(): 74 | if avg[k] is not None: 75 | # pytorch 1.6 use true_divide instead of /= 76 | avg[k] = torch.true_divide(avg[k], num) 77 | print('Saving to {}'.format(args.dst_model)) 78 | torch.save(avg, args.dst_model) 79 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import os 19 | 20 | import torch 21 | import yaml 22 | 23 | from wenet.transformer.asr_model import init_asr_model 24 | from wenet.utils.checkpoint import load_checkpoint 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='export your script model') 28 | parser.add_argument('--config', required=True, help='config file') 29 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 30 | parser.add_argument('--output_file', required=True, help='output file') 31 | parser.add_argument('--output_quant_file', 32 | default=None, 33 | help='output quantized model file') 34 | args = parser.parse_args() 35 | # No need gpu for model export 36 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 37 | 38 | with open(args.config, 'r') as fin: 39 | configs = yaml.load(fin, Loader=yaml.FullLoader) 40 | model = init_asr_model(configs) 41 | print(model) 42 | 43 | load_checkpoint(model, args.checkpoint) 44 | # Export jit torch script model 45 | 46 | script_model = torch.jit.script(model) 47 | script_model.save(args.output_file) 48 | print('Export model successfully, see {}'.format(args.output_file)) 49 | 50 | # Export quantized jit torch script model 51 | if args.output_quant_file: 52 | quantized_model = torch.quantization.quantize_dynamic( 53 | model, {torch.nn.Linear}, dtype=torch.qint8 54 | ) 55 | print(quantized_model) 56 | script_quant_model = torch.jit.script(quantized_model) 57 | script_quant_model.save(args.output_quant_file) 58 | print('Export quantized model successfully, ' 59 | 'see {}'.format(args.output_quant_file)) 60 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc 
-------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc -------------------------------------------------------------------------------- 
/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc -------------------------------------------------------------------------------- 
/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | 19 | class GlobalCMVN(torch.nn.Module): 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """ConvolutionModule definition.""" 7 | 8 | from typing import Optional, Tuple 9 | 10 | import torch 11 | from torch import nn 12 | from typeguard import check_argument_types 13 | 14 | 15 | class ConvolutionModule(nn.Module): 16 | """ConvolutionModule in Conformer model.""" 17 | def __init__(self, 18 | channels: int, 19 | kernel_size: int = 15, 20 | activation: nn.Module = nn.ReLU(), 21 | norm: str = "batch_norm", 22 | causal: bool = False, 23 | bias: bool = True): 24 | """Construct an ConvolutionModule object. 25 | Args: 26 | channels (int): The number of channels of conv layers. 27 | kernel_size (int): Kernel size of conv layers. 28 | causal (int): Whether use causal convolution or not 29 | """ 30 | assert check_argument_types() 31 | super().__init__() 32 | 33 | self.pointwise_conv1 = nn.Conv1d( 34 | channels, 35 | 2 * channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0, 39 | bias=bias, 40 | ) 41 | # self.lorder is used to distinguish if it's a causal convolution, 42 | # if self.lorder > 0: it's a causal convolution, the input will be 43 | # padded with self.lorder frames on the left in forward. 44 | # else: it's a symmetrical convolution 45 | if causal: 46 | padding = 0 47 | self.lorder = kernel_size - 1 48 | else: 49 | # kernel_size should be an odd number for none causal convolution 50 | assert (kernel_size - 1) % 2 == 0 51 | padding = (kernel_size - 1) // 2 52 | self.lorder = 0 53 | self.depthwise_conv = nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | stride=1, 58 | padding=padding, 59 | groups=channels, 60 | bias=bias, 61 | ) 62 | 63 | assert norm in ['batch_norm', 'layer_norm'] 64 | if norm == "batch_norm": 65 | self.use_layer_norm = False 66 | self.norm = nn.BatchNorm1d(channels) 67 | else: 68 | self.use_layer_norm = True 69 | self.norm = nn.LayerNorm(channels) 70 | 71 | self.pointwise_conv2 = nn.Conv1d( 72 | channels, 73 | channels, 74 | kernel_size=1, 75 | stride=1, 76 | padding=0, 77 | bias=bias, 78 | ) 79 | self.activation = activation 80 | 81 | def forward( 82 | self, 83 | x: torch.Tensor, 84 | mask_pad: Optional[torch.Tensor] = None, 85 | cache: Optional[torch.Tensor] = None, 86 | ) -> Tuple[torch.Tensor, torch.Tensor]: 87 | """Compute convolution module. 88 | Args: 89 | x (torch.Tensor): Input tensor (#batch, time, channels). 
90 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time) 91 | cache (torch.Tensor): left context cache, it is only 92 | used in causal convolution 93 | Returns: 94 | torch.Tensor: Output tensor (#batch, time, channels). 95 | """ 96 | # exchange the temporal dimension and the feature dimension 97 | x = x.transpose(1, 2) # (#batch, channels, time) 98 | 99 | # mask batch padding 100 | if mask_pad is not None: 101 | x.masked_fill_(~mask_pad, 0.0) 102 | 103 | if self.lorder > 0: 104 | if cache is None: 105 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 106 | else: 107 | assert cache.size(0) == x.size(0) 108 | assert cache.size(1) == x.size(1) 109 | x = torch.cat((cache, x), dim=2) 110 | assert (x.size(2) > self.lorder) 111 | new_cache = x[:, :, -self.lorder:] 112 | else: 113 | # It's better we just return None if no cache is requried, 114 | # However, for JIT export, here we just fake one tensor instead of 115 | # None. 116 | new_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) 117 | 118 | # GLU mechanism 119 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 120 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 121 | 122 | # 1D Depthwise Conv 123 | x = self.depthwise_conv(x) 124 | if self.use_layer_norm: 125 | x = x.transpose(1, 2) 126 | x = self.activation(self.norm(x)) 127 | if self.use_layer_norm: 128 | x = x.transpose(1, 2) 129 | x = self.pointwise_conv2(x) 130 | # mask batch padding 131 | if mask_pad is not None: 132 | x.masked_fill_(~mask_pad, 0.0) 133 | 134 | return x.transpose(1, 2), new_cache 135 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | 6 | class CTC(torch.nn.Module): 7 | """CTC module""" 8 | def __init__( 9 | self, 10 | odim: int, 11 | encoder_output_size: int, 12 | dropout_rate: float = 0.0, 13 | reduce: bool = True, 14 | ): 15 | """ Construct CTC module 16 | Args: 17 | odim: dimension of outputs 18 | encoder_output_size: number of encoder projection units 19 | dropout_rate: dropout rate (0.0 ~ 1.0) 20 | reduce: reduce the CTC loss into a scalar 21 | """ 22 | assert check_argument_types() 23 | super().__init__() 24 | eprojs = encoder_output_size 25 | self.dropout_rate = dropout_rate 26 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 27 | 28 | reduction_type = "sum" if reduce else "none" 29 | self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) 30 | 31 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 32 | ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: 33 | """Calculate CTC loss. 
34 | 35 | Args: 36 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 37 | hlens: batch of lengths of hidden state sequences (B) 38 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 39 | ys_lens: batch of lengths of character sequence (B) 40 | """ 41 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 42 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 43 | # ys_hat: (B, L, D) -> (L, B, D) 44 | ys_hat = ys_hat.transpose(0, 1) 45 | ys_hat = ys_hat.log_softmax(2) 46 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 47 | # Batch-size average 48 | loss = loss / ys_hat.size(1) 49 | return loss 50 | 51 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 52 | """log_softmax of frame activations 53 | 54 | Args: 55 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 56 | Returns: 57 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 58 | """ 59 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 60 | 61 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 62 | """argmax of frame activations 63 | 64 | Args: 65 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 66 | Returns: 67 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 68 | """ 69 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 70 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Decoder self-attention layer definition.""" 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class DecoderLayer(nn.Module): 14 | """Single decoder layer module. 15 | 16 | Args: 17 | size (int): Input dimension. 18 | self_attn (torch.nn.Module): Self-attention module instance. 19 | `MultiHeadedAttention` instance can be used as the argument. 20 | src_attn (torch.nn.Module): Inter-attention module instance. 21 | `MultiHeadedAttention` instance can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward` instance can be used as the argument. 24 | dropout_rate (float): Dropout rate. 25 | normalize_before (bool): 26 | True: use layer_norm before each sub-block. 27 | False: to use layer_norm after each sub-block. 28 | concat_after (bool): Whether to concat attention layer's inpu 29 | and output. 
30 | True: x -> x + linear(concat(x, att(x))) 31 | False: x -> x + att(x) 32 | """ 33 | def __init__( 34 | self, 35 | size: int, 36 | self_attn: nn.Module, 37 | src_attn: nn.Module, 38 | feed_forward: nn.Module, 39 | dropout_rate: float, 40 | normalize_before: bool = True, 41 | concat_after: bool = False, 42 | ): 43 | """Construct an DecoderLayer object.""" 44 | super().__init__() 45 | self.size = size 46 | self.self_attn = self_attn 47 | self.src_attn = src_attn 48 | self.feed_forward = feed_forward 49 | self.norm1 = nn.LayerNorm(size, eps=1e-12) 50 | self.norm2 = nn.LayerNorm(size, eps=1e-12) 51 | self.norm3 = nn.LayerNorm(size, eps=1e-12) 52 | self.dropout = nn.Dropout(dropout_rate) 53 | self.normalize_before = normalize_before 54 | self.concat_after = concat_after 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward( 59 | self, 60 | tgt: torch.Tensor, 61 | tgt_mask: torch.Tensor, 62 | memory: torch.Tensor, 63 | memory_mask: torch.Tensor, 64 | cache: Optional[torch.Tensor] = None 65 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 66 | """Compute decoded features. 67 | 68 | Args: 69 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 70 | tgt_mask (torch.Tensor): Mask for input tensor 71 | (#batch, maxlen_out). 72 | memory (torch.Tensor): Encoded memory 73 | (#batch, maxlen_in, size). 74 | memory_mask (torch.Tensor): Encoded memory mask 75 | (#batch, maxlen_in). 76 | cache (torch.Tensor): cached tensors. 77 | (#batch, maxlen_out - 1, size). 78 | 79 | Returns: 80 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 81 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 82 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 83 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
84 | 85 | """ 86 | residual = tgt 87 | if self.normalize_before: 88 | tgt = self.norm1(tgt) 89 | 90 | if cache is None: 91 | tgt_q = tgt 92 | tgt_q_mask = tgt_mask 93 | else: 94 | # compute only the last frame query keeping dim: max_time_out -> 1 95 | assert cache.shape == ( 96 | tgt.shape[0], 97 | tgt.shape[1] - 1, 98 | self.size, 99 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 100 | tgt_q = tgt[:, -1:, :] 101 | residual = residual[:, -1:, :] 102 | tgt_q_mask = tgt_mask[:, -1:, :] 103 | 104 | if self.concat_after: 105 | tgt_concat = torch.cat( 106 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) 107 | x = residual + self.concat_linear1(tgt_concat) 108 | else: 109 | x = residual + self.dropout( 110 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 111 | if not self.normalize_before: 112 | x = self.norm1(x) 113 | 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | if self.concat_after: 118 | x_concat = torch.cat( 119 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) 120 | x = residual + self.concat_linear2(x_concat) 121 | else: 122 | x = residual + self.dropout( 123 | self.src_attn(x, memory, memory, memory_mask)) 124 | if not self.normalize_before: 125 | x = self.norm2(x) 126 | 127 | residual = x 128 | if self.normalize_before: 129 | x = self.norm3(x) 130 | x = residual + self.dropout(self.feed_forward(x)) 131 | if not self.normalize_before: 132 | x = self.norm3(x) 133 | 134 | if cache is not None: 135 | x = torch.cat([cache, x], dim=1) 136 | 137 | return x, tgt_mask, memory, memory_mask 138 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """Positonal Encoding Module.""" 7 | 8 | import math 9 | from typing import Tuple 10 | 11 | import torch 12 | 13 | 14 | class PositionalEncoding(torch.nn.Module): 15 | """Positional encoding. 16 | 17 | :param int d_model: embedding dim 18 | :param float dropout_rate: dropout rate 19 | :param int max_len: maximum input length 20 | 21 | PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) 22 | PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) 23 | """ 24 | def __init__(self, 25 | d_model: int, 26 | dropout_rate: float, 27 | max_len: int = 50000, 28 | reverse: bool = False): 29 | """Construct an PositionalEncoding object.""" 30 | super().__init__() 31 | self.d_model = d_model 32 | self.xscale = math.sqrt(self.d_model) 33 | self.dropout = torch.nn.Dropout(p=dropout_rate) 34 | self.max_len = max_len 35 | 36 | self.pe = torch.zeros(self.max_len, self.d_model) 37 | position = torch.arange(0, self.max_len, 38 | dtype=torch.float32).unsqueeze(1) 39 | div_term = torch.exp( 40 | torch.arange(0, self.d_model, 2, dtype=torch.float32) * 41 | -(math.log(10000.0) / self.d_model)) 42 | self.pe[:, 0::2] = torch.sin(position * div_term) 43 | self.pe[:, 1::2] = torch.cos(position * div_term) 44 | self.pe = self.pe.unsqueeze(0) 45 | 46 | def forward(self, 47 | x: torch.Tensor, 48 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 49 | """Add positional encoding. 50 | 51 | Args: 52 | x (torch.Tensor): Input. Its shape is (batch, time, ...) 53 | offset (int): position offset 54 | 55 | Returns: 56 | torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) 
57 | torch.Tensor: for compatibility to RelPositionalEncoding 58 | """ 59 | assert offset + x.size(1) < self.max_len 60 | self.pe = self.pe.to(x.device) 61 | pos_emb = self.pe[:, offset:offset + x.size(1)] 62 | x = x * self.xscale + pos_emb 63 | return self.dropout(x), self.dropout(pos_emb) 64 | 65 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 66 | """ For getting encoding in a streaming fashion 67 | 68 | Attention!!!!! 69 | we apply dropout only once at the whole utterance level in a none 70 | streaming way, but will call this function several times with 71 | increasing input size in a streaming scenario, so the dropout will 72 | be applied several times. 73 | 74 | Args: 75 | offset (int): start offset 76 | size (int): requried size of position encoding 77 | 78 | Returns: 79 | torch.Tensor: Corresponding encoding 80 | """ 81 | assert offset + size < self.max_len 82 | return self.dropout(self.pe[:, offset:offset + size]) 83 | 84 | 85 | class RelPositionalEncoding(PositionalEncoding): 86 | """Relative positional encoding module. 87 | See : Appendix B in https://arxiv.org/abs/1901.02860 88 | Args: 89 | d_model (int): Embedding dimension. 90 | dropout_rate (float): Dropout rate. 91 | max_len (int): Maximum input length. 92 | """ 93 | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 100000): 94 | """Initialize class.""" 95 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 96 | 97 | def forward(self, 98 | x: torch.Tensor, 99 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 100 | """Compute positional encoding. 101 | Args: 102 | x (torch.Tensor): Input tensor (batch, time, `*`). 103 | Returns: 104 | torch.Tensor: Encoded tensor (batch, time, `*`). 105 | torch.Tensor: Positional embedding tensor (1, time, `*`). 106 | """ 107 | assert offset + x.size(1) < self.max_len 108 | self.pe = self.pe.to(x.device) 109 | x = x * self.xscale 110 | pos_emb = self.pe[:, offset:offset + x.size(1)] 111 | return self.dropout(x), self.dropout(pos_emb) 112 | 113 | 114 | class NoPositionalEncoding(torch.nn.Module): 115 | """ No position encoding 116 | """ 117 | def __init__(self, d_model: int, dropout_rate: float): 118 | super().__init__() 119 | self.d_model = d_model 120 | self.dropout = torch.nn.Dropout(p=dropout_rate) 121 | 122 | def forward(self, 123 | x: torch.Tensor, 124 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 125 | """ Just return zero vector for interface compatibility 126 | """ 127 | pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) 128 | return self.dropout(x), pos_emb 129 | 130 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 131 | return torch.zeros(1, size, self.d_model) 132 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Label smoothing module.""" 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | """Label-smoothing loss. 
14 | 15 | In a standard CE loss, the label's data distribution is: 16 | [0,1,2] -> 17 | [ 18 | [1.0, 0.0, 0.0], 19 | [0.0, 1.0, 0.0], 20 | [1.0, 0.0, 1.0], 21 | ] 22 | 23 | In the smoothing version CE Loss,some probabilities 24 | are taken from the true label prob (1.0) and are divided 25 | among other labels. 26 | 27 | e.g. 28 | smoothing=0.1 29 | [0,1,2] -> 30 | [ 31 | [0.9, 0.05, 0.05], 32 | [0.05, 0.9, 0.05], 33 | [0.05, 0.05, 0.9], 34 | ] 35 | 36 | Args: 37 | size (int): the number of class 38 | padding_idx (int): padding class id which will be ignored for loss 39 | smoothing (float): smoothing rate (0.0 means the conventional CE) 40 | normalize_length (bool): 41 | normalize loss by sequence length if True 42 | normalize loss by batch size if False 43 | """ 44 | def __init__(self, 45 | size: int, 46 | padding_idx: int, 47 | smoothing: float, 48 | normalize_length: bool = False): 49 | """Construct an LabelSmoothingLoss object.""" 50 | super(LabelSmoothingLoss, self).__init__() 51 | self.criterion = nn.KLDivLoss(reduction="none") 52 | self.padding_idx = padding_idx 53 | self.confidence = 1.0 - smoothing 54 | self.smoothing = smoothing 55 | self.size = size 56 | self.normalize_length = normalize_length 57 | 58 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 59 | """Compute loss between x and target. 60 | 61 | The model outputs and data labels tensors are flatten to 62 | (batch*seqlen, class) shape and a mask is applied to the 63 | padding part which should not be calculated for loss. 64 | 65 | Args: 66 | x (torch.Tensor): prediction (batch, seqlen, class) 67 | target (torch.Tensor): 68 | target signal masked with self.padding_id (batch, seqlen) 69 | Returns: 70 | loss (torch.Tensor) : The KL loss, scalar float value 71 | """ 72 | assert x.size(2) == self.size 73 | batch_size = x.size(0) 74 | x = x.view(-1, self.size) 75 | target = target.view(-1) 76 | # use zeros_like instead of torch.no_grad() for true_dist, 77 | # since no_grad() can not be exported by JIT 78 | true_dist = torch.zeros_like(x) 79 | true_dist.fill_(self.smoothing / (self.size - 1)) 80 | ignore = target == self.padding_idx # (B,) 81 | total = len(target) - ignore.sum().item() 82 | target = target.masked_fill(ignore, 0) # avoid -1 index 83 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 84 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 85 | denom = total if self.normalize_length else batch_size 86 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 87 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Positionwise feed forward layer definition.""" 7 | 8 | import torch 9 | 10 | 11 | class PositionwiseFeedForward(torch.nn.Module): 12 | """Positionwise feed forward layer. 13 | 14 | FeedForward are appied on each position of the sequence. 15 | The output dim is same with the input dim. 16 | 17 | Args: 18 | idim (int): Input dimenstion. 19 | hidden_units (int): The number of hidden units. 20 | dropout_rate (float): Dropout rate. 
21 | activation (torch.nn.Module): Activation function 22 | """ 23 | def __init__(self, 24 | idim: int, 25 | hidden_units: int, 26 | dropout_rate: float, 27 | activation: torch.nn.Module = torch.nn.ReLU()): 28 | """Construct a PositionwiseFeedForward object.""" 29 | super(PositionwiseFeedForward, self).__init__() 30 | self.w_1 = torch.nn.Linear(idim, hidden_units) 31 | self.activation = activation 32 | self.dropout = torch.nn.Dropout(dropout_rate) 33 | self.w_2 = torch.nn.Linear(hidden_units, idim) 34 | 35 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 36 | """Forward function. 37 | 38 | Args: 39 | xs: input tensor (B, L, D) 40 | Returns: 41 | output tensor, (B, L, D) 42 | """ 43 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 44 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | """Swish() activation function for Conformer.""" 8 | 9 | import torch 10 | 11 | 12 | class Swish(torch.nn.Module): 13 | """Construct an Swish object.""" 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | """Return Swish activation function.""" 16 | return x * torch.sigmoid(x) 17 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-36.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-37.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage1/VoxCeleb2/models/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | import os 6 | import re 7 | 8 | import yaml 9 | import torch 10 | 11 | 12 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 13 | if torch.cuda.is_available(): 14 | logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) 15 | checkpoint = torch.load(path) 16 | else: 17 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) 18 | checkpoint = torch.load(path, map_location='cpu') 19 | model.load_state_dict(checkpoint) 20 | info_path = re.sub('.pt$', '.yaml', path) 21 | configs = {} 22 | if os.path.exists(info_path): 23 | with open(info_path, 'r') as fin: 24 | configs = yaml.load(fin, Loader=yaml.FullLoader) 25 | return configs 26 | 27 | 28 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 29 | ''' 30 | Args: 31 | infos (dict or None): any info you want to save. 32 | ''' 33 | logging.info('Checkpoint: save to checkpoint %s' % path) 34 | if isinstance(model, torch.nn.DataParallel): 35 | state_dict = model.module.state_dict() 36 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 37 | state_dict = model.module.state_dict() 38 | else: 39 | state_dict = model.state_dict() 40 | torch.save(state_dict, path) 41 | info_path = re.sub('.pt$', '.yaml', path) 42 | if infos is None: 43 | infos = {} 44 | with open(info_path, 'w') as fout: 45 | data = yaml.dump(infos) 46 | fout.write(data) 47 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
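# _load_kaldi_cmvn below reports unsupported binary stats files via logging.error
# and sys.exit, so these two imports are needed in addition to the ones that follow.
import logging
import sys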
15 | 16 | import json 17 | import math 18 | 19 | import numpy as np 20 | 21 | 22 | def _load_json_cmvn(json_cmvn_file): 23 | """ Load the json format cmvn stats file and calculate cmvn 24 | 25 | Args: 26 | json_cmvn_file: cmvn stats file in json format 27 | 28 | Returns: 29 | a numpy array of [means, vars] 30 | """ 31 | with open(json_cmvn_file) as f: 32 | cmvn_stats = json.load(f) 33 | 34 | means = cmvn_stats['mean_stat'] 35 | variance = cmvn_stats['var_stat'] 36 | count = cmvn_stats['frame_num'] 37 | for i in range(len(means)): 38 | means[i] /= count 39 | variance[i] = variance[i] / count - means[i] * means[i] 40 | if variance[i] < 1.0e-20: 41 | variance[i] = 1.0e-20 42 | variance[i] = 1.0 / math.sqrt(variance[i]) 43 | cmvn = np.array([means, variance]) 44 | return cmvn 45 | 46 | 47 | def _load_kaldi_cmvn(kaldi_cmvn_file): 48 | """ Load the kaldi format cmvn stats file and calculate cmvn 49 | 50 | Args: 51 | kaldi_cmvn_file: kaldi text style global cmvn file, which 52 | is generated by: 53 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 54 | 55 | Returns: 56 | a numpy array of [means, vars] 57 | """ 58 | means = [] 59 | variance = [] 60 | with open(kaldi_cmvn_file, 'r') as fid: 61 | # kaldi binary file start with '\0B' 62 | if fid.read(2) == '\0B': 63 | logging.error('kaldi cmvn binary file is not supported, please ' 64 | 'recompute it by: compute-cmvn-stats --binary=false ' 65 | ' scp:feats.scp global_cmvn') 66 | sys.exit(1) 67 | fid.seek(0) 68 | arr = fid.read().split() 69 | assert (arr[0] == '[') 70 | assert (arr[-2] == '0') 71 | assert (arr[-1] == ']') 72 | feat_dim = int((len(arr) - 2 - 2) / 2) 73 | for i in range(1, feat_dim + 1): 74 | means.append(float(arr[i])) 75 | count = float(arr[feat_dim + 1]) 76 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 77 | variance.append(float(arr[i])) 78 | 79 | for i in range(len(means)): 80 | means[i] /= count 81 | variance[i] = variance[i] / count - means[i] * means[i] 82 | if variance[i] < 1.0e-20: 83 | variance[i] = 1.0e-20 84 | variance[i] = 1.0 / math.sqrt(variance[i]) 85 | cmvn = np.array([means, variance]) 86 | return cmvn 87 | 88 | 89 | def load_cmvn(cmvn_file, is_json): 90 | if is_json: 91 | cmvn = _load_json_cmvn(cmvn_file) 92 | else: 93 | cmvn = _load_kaldi_cmvn(cmvn_file) 94 | return cmvn[0], cmvn[1] 95 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/common.py: -------------------------------------------------------------------------------- 1 | """Unility functions for Transformer.""" 2 | 3 | import math 4 | from typing import Tuple, List 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | IGNORE_ID = -1 10 | 11 | 12 | def pad_list(xs: List[torch.Tensor], pad_value: int): 13 | """Perform padding for the list of tensors. 14 | 15 | Args: 16 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 17 | pad_value (float): Value for padding. 18 | 19 | Returns: 20 | Tensor: Padded tensor (B, Tmax, `*`). 
21 | 22 | Examples: 23 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 24 | >>> x 25 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 26 | >>> pad_list(x, 0) 27 | tensor([[1., 1., 1., 1.], 28 | [1., 1., 0., 0.], 29 | [1., 0., 0., 0.]]) 30 | 31 | """ 32 | n_batch = len(xs) 33 | max_len = max([x.size(0) for x in xs]) 34 | pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) 35 | pad = pad.fill_(pad_value) 36 | for i in range(n_batch): 37 | pad[i, :xs[i].size(0)] = xs[i] 38 | 39 | return pad 40 | 41 | 42 | def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, 43 | ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: 44 | """Add and labels. 45 | 46 | Args: 47 | ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) 48 | sos (int): index of 49 | eos (int): index of 50 | ignore_id (int): index of padding 51 | 52 | Returns: 53 | ys_in (torch.Tensor) : (B, Lmax + 1) 54 | ys_out (torch.Tensor) : (B, Lmax + 1) 55 | 56 | Examples: 57 | >>> sos_id = 10 58 | >>> eos_id = 11 59 | >>> ignore_id = -1 60 | >>> ys_pad 61 | tensor([[ 1, 2, 3, 4, 5], 62 | [ 4, 5, 6, -1, -1], 63 | [ 7, 8, 9, -1, -1]], dtype=torch.int32) 64 | >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) 65 | >>> ys_in 66 | tensor([[10, 1, 2, 3, 4, 5], 67 | [10, 4, 5, 6, 11, 11], 68 | [10, 7, 8, 9, 11, 11]]) 69 | >>> ys_out 70 | tensor([[ 1, 2, 3, 4, 5, 11], 71 | [ 4, 5, 6, 11, -1, -1], 72 | [ 7, 8, 9, 11, -1, -1]]) 73 | """ 74 | _sos = torch.tensor([sos], 75 | dtype=torch.long, 76 | requires_grad=False, 77 | device=ys_pad.device) 78 | _eos = torch.tensor([eos], 79 | dtype=torch.long, 80 | requires_grad=False, 81 | device=ys_pad.device) 82 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 83 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 84 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 85 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 86 | 87 | 88 | def reverse_pad_list(ys_pad: torch.Tensor, 89 | ys_lens: torch.Tensor, 90 | pad_value: float = -1.0) -> torch.Tensor: 91 | """Reverse padding for the list of tensors. 92 | 93 | Args: 94 | ys_pad (tensor): The padded tensor (B, Tokenmax). 95 | ys_lens (tensor): The lens of token seqs (B) 96 | pad_value (int): Value for padding. 97 | 98 | Returns: 99 | Tensor: Padded tensor (B, Tokenmax). 100 | 101 | Examples: 102 | >>> x 103 | tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) 104 | >>> pad_list(x, 0) 105 | tensor([[4, 3, 2, 1], 106 | [7, 6, 5, 0], 107 | [9, 8, 0, 0]]) 108 | 109 | """ 110 | r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) 111 | for y, i in zip(ys_pad, ys_lens)], True, 112 | pad_value) 113 | return r_ys_pad 114 | 115 | 116 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 117 | ignore_label: int) -> float: 118 | """Calculate accuracy. 119 | 120 | Args: 121 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 122 | pad_targets (LongTensor): Target label tensors (B, Lmax, D). 123 | ignore_label (int): Ignore label id. 124 | 125 | Returns: 126 | float: Accuracy value (0.0 - 1.0). 
127 | 128 | """ 129 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 130 | pad_outputs.size(1)).argmax(2) 131 | mask = pad_targets != ignore_label 132 | numerator = torch.sum( 133 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 134 | denominator = torch.sum(mask) 135 | return float(numerator) / float(denominator) 136 | 137 | 138 | def get_activation(act): 139 | """Return activation function.""" 140 | # Lazy load to avoid unused import 141 | #from wenet.transformer.swish import Swish 142 | from ..transformer.swish import Swish 143 | 144 | activation_funcs = { 145 | "hardtanh": torch.nn.Hardtanh, 146 | "tanh": torch.nn.Tanh, 147 | "relu": torch.nn.ReLU, 148 | "selu": torch.nn.SELU, 149 | "swish": Swish, 150 | "gelu": torch.nn.GELU 151 | } 152 | 153 | return activation_funcs[act]() 154 | 155 | 156 | def get_subsample(config): 157 | input_layer = config["encoder_conf"]["input_layer"] 158 | assert input_layer in ["conv2d", "conv2d6", "conv2d8"] 159 | if input_layer == "conv2d": 160 | return 4 161 | elif input_layer == "conv2d6": 162 | return 6 163 | elif input_layer == "conv2d8": 164 | return 8 165 | 166 | 167 | def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: 168 | new_hyp: List[int] = [] 169 | cur = 0 170 | while cur < len(hyp): 171 | if hyp[cur] != 0: 172 | new_hyp.append(hyp[cur]) 173 | prev = cur 174 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 175 | cur += 1 176 | return new_hyp 177 | 178 | 179 | def log_add(args: List[int]) -> float: 180 | """ 181 | Stable log add 182 | """ 183 | if all(a == -float('inf') for a in args): 184 | return -float('inf') 185 | a_max = max(args) 186 | lsp = math.log(sum(math.exp(a - a_max) for a in args)) 187 | return a_max + lsp 188 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/ctc_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Di Wu) 3 | 4 | import numpy as np 5 | import torch 6 | 7 | def insert_blank(label, blank_id=0): 8 | """Insert blank token between every two label token.""" 9 | label = np.expand_dims(label, 1) 10 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 11 | label = np.concatenate([blanks, label], axis=1) 12 | label = label.reshape(-1) 13 | label = np.append(label, label[0]) 14 | return label 15 | 16 | def forced_align(ctc_probs: torch.Tensor, 17 | y: torch.Tensor, 18 | blank_id=0) -> list: 19 | """ctc forced alignment. 
20 | 21 | Args: 22 | torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) 23 | torch.Tensor y: id sequence tensor 1d tensor (L) 24 | int blank_id: blank symbol index 25 | Returns: 26 | torch.Tensor: alignment result 27 | """ 28 | y_insert_blank = insert_blank(y, blank_id) 29 | 30 | log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) 31 | log_alpha = log_alpha - float('inf') # log of zero 32 | state_path = (torch.zeros( 33 | (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 34 | ) # state path 35 | 36 | # init start state 37 | log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] 38 | log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] 39 | 40 | for t in range(1, ctc_probs.size(0)): 41 | for s in range(len(y_insert_blank)): 42 | if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ 43 | s] == y_insert_blank[s - 2]: 44 | candidates = torch.tensor( 45 | [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) 46 | prev_state = [s, s - 1] 47 | else: 48 | candidates = torch.tensor([ 49 | log_alpha[t - 1, s], 50 | log_alpha[t - 1, s - 1], 51 | log_alpha[t - 1, s - 2], 52 | ]) 53 | prev_state = [s, s - 1, s - 2] 54 | log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] 55 | state_path[t, s] = prev_state[torch.argmax(candidates)] 56 | 57 | state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) 58 | 59 | candidates = torch.tensor([ 60 | log_alpha[-1, len(y_insert_blank) - 1], 61 | log_alpha[-1, len(y_insert_blank) - 2] 62 | ]) 63 | prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] 64 | state_seq[-1] = prev_state[torch.argmax(candidates)] 65 | for t in range(ctc_probs.size(0) - 2, -1, -1): 66 | state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] 67 | 68 | output_alignment = [] 69 | for t in range(0, ctc_probs.size(0)): 70 | output_alignment.append(y_insert_blank[state_seq[t, 0]]) 71 | 72 | return output_alignment 73 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | from contextlib import nullcontext 6 | # if your python version < 3.7 use the below one 7 | # from contextlib import suppress as nullcontext 8 | import torch 9 | from torch.nn.utils import clip_grad_norm_ 10 | 11 | 12 | class Executor: 13 | def __init__(self): 14 | self.step = 0 15 | 16 | def train(self, model, optimizer, scheduler, data_loader, device, writer, 17 | args, scaler): 18 | ''' Train one epoch 19 | ''' 20 | model.train() 21 | clip = args.get('grad_clip', 50.0) 22 | log_interval = args.get('log_interval', 10) 23 | rank = args.get('rank', 0) 24 | accum_grad = args.get('accum_grad', 1) 25 | is_distributed = args.get('is_distributed', True) 26 | use_amp = args.get('use_amp', False) 27 | logging.info('using accumulate grad, new batch size is {} times' 28 | 'larger than before'.format(accum_grad)) 29 | if use_amp: 30 | assert scaler is not None 31 | num_seen_utts = 0 32 | num_total_batch = len(data_loader) 33 | for batch_idx, batch in enumerate(data_loader): 34 | key, feats, target, feats_lengths, target_lengths = batch 35 | feats = feats.to(device) 36 | target = target.to(device) 37 | feats_lengths = feats_lengths.to(device) 38 | target_lengths = target_lengths.to(device) 39 | num_utts = target_lengths.size(0) 40 | if num_utts == 0: 41 | continue 42 | context = None 43 | # Disable gradient synchronizations across DDP processes. 44 | # Within this context, gradients will be accumulated on module 45 | # variables, which will later be synchronized. 46 | if is_distributed and batch_idx % accum_grad != 0: 47 | context = model.no_sync 48 | # Used for single gpu training and DDP gradient synchronization 49 | # processes. 50 | else: 51 | context = nullcontext 52 | with context(): 53 | # autocast context 54 | # The more details about amp can be found in 55 | # https://pytorch.org/docs/stable/notes/amp_examples.html 56 | with torch.cuda.amp.autocast(scaler is not None): 57 | loss, loss_att, loss_ctc = model(feats, feats_lengths, 58 | target, target_lengths) 59 | loss = loss / accum_grad 60 | if use_amp: 61 | scaler.scale(loss).backward() 62 | else: 63 | loss.backward() 64 | 65 | num_seen_utts += num_utts 66 | if batch_idx % accum_grad == 0: 67 | if rank == 0 and writer is not None: 68 | writer.add_scalar('train_loss', loss, self.step) 69 | # Use mixed precision training 70 | if use_amp: 71 | scaler.unscale_(optimizer) 72 | grad_norm = clip_grad_norm_(model.parameters(), clip) 73 | # Must invoke scaler.update() if unscale_() is used in the 74 | # iteration to avoid the following error: 75 | # RuntimeError: unscale_() has already been called 76 | # on this optimizer since the last update(). 77 | # We don't check grad here since that if the gradient has 78 | # inf/nan values, scaler.step will skip optimizer.step(). 
79 | scaler.step(optimizer) 80 | scaler.update() 81 | else: 82 | grad_norm = clip_grad_norm_(model.parameters(), clip) 83 | if torch.isfinite(grad_norm): 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | scheduler.step() 87 | self.step += 1 88 | if batch_idx % log_interval == 0: 89 | lr = optimizer.param_groups[0]['lr'] 90 | log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( 91 | batch_idx, num_total_batch, 92 | loss.item() * accum_grad) 93 | if loss_att is not None: 94 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 95 | if loss_ctc is not None: 96 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 97 | log_str += 'lr {:.8f} rank {}'.format(lr, rank) 98 | logging.debug(log_str) 99 | 100 | def cv(self, model, data_loader, device, args): 101 | ''' Cross validation on 102 | ''' 103 | model.eval() 104 | log_interval = args.get('log_interval', 10) 105 | # in order to avoid division by 0 106 | num_seen_utts = 1 107 | total_loss = 0.0 108 | num_total_batch = len(data_loader) 109 | with torch.no_grad(): 110 | for batch_idx, batch in enumerate(data_loader): 111 | key, feats, target, feats_lengths, target_lengths = batch 112 | feats = feats.to(device) 113 | target = target.to(device) 114 | feats_lengths = feats_lengths.to(device) 115 | target_lengths = target_lengths.to(device) 116 | num_utts = target_lengths.size(0) 117 | if num_utts == 0: 118 | continue 119 | loss, loss_att, loss_ctc = model(feats, feats_lengths, target, 120 | target_lengths) 121 | if torch.isfinite(loss): 122 | num_seen_utts += num_utts 123 | total_loss += loss.item() * num_utts 124 | if batch_idx % log_interval == 0: 125 | log_str = 'CV Batch {}/{} loss {:.6f} '.format( 126 | batch_idx, num_total_batch, loss.item()) 127 | if loss_att is not None: 128 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 129 | if loss_ctc is not None: 130 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 131 | log_str += 'history loss {:.6f}'.format(total_loss / 132 | num_seen_utts) 133 | logging.debug(log_str) 134 | 135 | return total_loss, num_seen_utts 136 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/models/wenet/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | from typeguard import check_argument_types 7 | 8 | 9 | class WarmupLR(_LRScheduler): 10 | """The WarmupLR scheduler 11 | 12 | This scheduler is almost same as NoamLR Scheduler except for following 13 | difference: 14 | 15 | NoamLR: 16 | lr = optimizer.lr * model_size ** -0.5 17 | * min(step ** -0.5, step * warmup_step ** -1.5) 18 | WarmupLR: 19 | lr = optimizer.lr * warmup_step ** 0.5 20 | * min(step ** -0.5, step * warmup_step ** -1.5) 21 | 22 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
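    For example, with the default warmup_steps=25000 the second term of the
    min() is the smaller one while step <= warmup_steps, so the lr ramps up
    linearly as optimizer.lr * step / warmup_steps, reaches optimizer.lr
    exactly at step == warmup_steps, and afterwards decays as
    optimizer.lr * (warmup_steps / step) ** 0.5.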
23 | 24 | """ 25 | 26 | def __init__( 27 | self, 28 | optimizer: torch.optim.Optimizer, 29 | warmup_steps: Union[int, float] = 25000, 30 | last_epoch: int = -1, 31 | ): 32 | assert check_argument_types() 33 | self.warmup_steps = warmup_steps 34 | 35 | # __init__() must be invoked before setting field 36 | # because step() is also invoked in __init__() 37 | super().__init__(optimizer, last_epoch) 38 | 39 | def __repr__(self): 40 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 41 | 42 | def get_lr(self): 43 | step_num = self.last_epoch + 1 44 | return [ 45 | lr 46 | * self.warmup_steps ** 0.5 47 | * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) 48 | for lr in self.base_lrs 49 | ] 50 | 51 | def set_step(self, step: int): 52 | self.last_epoch = step 53 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised Adam optimizer') 9 | return torch.optim.Adam(parameters, lr = lr, weight_decay = weight_decay); 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adamP.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | from adamp import AdamP 6 | 7 | def Optimizer(parameters, lr, weight_decay, **kwargs): 8 | print('Initialised AdamP optimizer') 9 | return AdamP(parameters, lr = lr, betas = (0.9, 0.999), weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/adamW.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised AdamW optimizer') 9 | return torch.optim.AdamW(parameters, lr = lr, weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised SGD optimizer') 9 | 10 | return torch.optim.SGD(parameters, lr = lr, momentum = 0.9, weight_decay=weight_decay); 11 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/process_musan.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # splits musan clips to chunks of 5 seconds at 3 second interval 4 | # the first argument should be the parent directory of musan_v1 5 | import os 6 | import sys 7 | import glob 8 | from scipy.io import wavfile 9 | 10 | files = glob.glob('%s/musan/*/*/*.wav'%sys.argv[1]) 11 | 12 | audlen = 16000*5 13 | audstr = 16000*3 14 | 15 | for idx,file in enumerate(files): 16 | fs,aud = wavfile.read(file) 17 | writedir = os.path.splitext(file.replace('/musan/','/musan_split/'))[0] 18 | os.makedirs(writedir) 19 | for st in range(0,len(aud)-audlen,audstr): 20 | wavfile.write(writedir + '/%05d.wav'%(st/fs), fs, aud[st:st+audlen]) 21 | 22 | print(idx,file) -------------------------------------------------------------------------------- /stage1/VoxCeleb2/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.12.1+cu113 3 | torchaudio==0.12.1+cu113 4 | numpy 5 | scipy 6 | scikit-learn 7 | tqdm 8 | pyyaml 9 | soundfile -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/cosine_annealing_warmup_restarts.py: -------------------------------------------------------------------------------- 1 | # ref: https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup/blob/master/cosine_annealing_warmup/scheduler.py 2 | #! /usr/bin/python 3 | # -*- encoding: utf-8 -*- 4 | import math 5 | import torch 6 | from torch.optim.lr_scheduler import _LRScheduler 7 | 8 | class CosineAnnealingWarmupRestarts(_LRScheduler): 9 | def __init__(self, 10 | optimizer : torch.optim.Optimizer, 11 | first_cycle_steps : int, 12 | cycle_mult : float = 1., 13 | max_lr : float = 0.1, 14 | min_lr : float = 0.001, 15 | warmup_steps : int = 0, 16 | gamma : float = 1., 17 | last_epoch : int = -1 18 | ): 19 | assert warmup_steps < first_cycle_steps 20 | self.first_cycle_steps = first_cycle_steps # first cycle step size 21 | self.cycle_mult = cycle_mult # cycle steps magnification 22 | self.base_max_lr = max_lr # first max learning rate 23 | self.max_lr = max_lr # max learning rate in the current cycle 24 | self.min_lr = min_lr # min learning rate 25 | self.warmup_steps = warmup_steps # warmup step size 26 | self.gamma = gamma # decrease rate of max learning rate by cycle 27 | self.cur_cycle_steps = first_cycle_steps # first cycle step size 28 | self.cycle = 0 # cycle count 29 | self.step_in_cycle = last_epoch # step size of the current cycle 30 | super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch) 31 | self.init_lr() 32 | 33 | def init_lr(self): 34 | self.base_lrs = [] 35 | for param_group in self.optimizer.param_groups: 36 | param_group['lr'] = self.min_lr 37 | self.base_lrs.append(self.min_lr) 38 | 39 | def get_lr(self): 40 | if self.step_in_cycle == -1: 41 | return self.base_lrs 42 | elif self.step_in_cycle < self.warmup_steps: 43 | return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs] 44 | else: 45 | return [base_lr + (self.max_lr - base_lr) \ 46 | * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \ 47 | / (self.cur_cycle_steps - self.warmup_steps))) / 2 48 | for base_lr in self.base_lrs] 49 | 50 | def step(self, epoch=None): 51 | if epoch is None: 52 | epoch = self.last_epoch + 1 53 | self.step_in_cycle = self.step_in_cycle + 1 54 | if self.step_in_cycle >= self.cur_cycle_steps: 55 | self.cycle += 1 56 | 
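                # warm restart: carry the overflow steps into the next cycle,
                # whose non-warmup portion is scaled by cycle_mult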
self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps 57 | self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps 58 | else: 59 | if epoch >= self.first_cycle_steps: 60 | if self.cycle_mult == 1.: 61 | self.step_in_cycle = epoch % self.first_cycle_steps 62 | self.cycle = epoch // self.first_cycle_steps 63 | else: 64 | n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)) 65 | self.cycle = n 66 | self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)) 67 | self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n) 68 | else: 69 | self.cur_cycle_steps = self.first_cycle_steps 70 | self.step_in_cycle = epoch 71 | 72 | self.max_lr = self.base_max_lr * (self.gamma**self.cycle) 73 | self.last_epoch = math.floor(epoch) 74 | for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): 75 | param_group['lr'] = lr 76 | 77 | 78 | def Scheduler(optimizer, lr_t0, lr_tmul, lr_max, lr_min, lr_wstep, lr_gamma, **kwargs): 79 | sche_fn = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=lr_t0, cycle_mult=lr_tmul, max_lr=lr_max, min_lr=lr_min, warmup_steps=lr_wstep, gamma=lr_gamma) 80 | lr_step = 'epoch' 81 | print('Initialised CosineAnnealingWarmupRestarts scheduler') 82 | return sche_fn, lr_step 83 | #return sche_fn 84 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/cycliclr.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, lr_cyclic_min, lr_cyclic_max, lr_up_size, lr_down_size, lr_mode, **kwargs): 7 | 8 | lr_step = 'epoch' 9 | sche_fn = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=lr_cyclic_min, max_lr=lr_cyclic_max, step_size_up=lr_up_size, step_size_down=lr_down_size, mode=lr_mode, cycle_momentum=False) 10 | print('Initialised cyclic LR scheduler') 11 | return sche_fn, lr_step 12 | #return sche_fn 13 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/exponentiallr.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, **kwargs): 7 | 8 | sche_fn = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9) 9 | lr_step = 'epoch' 10 | print('Initialised exponential LR scheduler') 11 | return sche_fn, lr_step 12 | #return sche_fn 13 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/scheduler/steplr.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Scheduler(optimizer, lr_decay_interval, max_epoch, lr_decay, **kwargs): 7 | 8 | sche_fn = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay_interval, gamma=lr_decay) 9 | #lr_step = 'epoch' 10 | lr_step = 'step' 11 | print('Initialised step LR scheduler') 12 | return sche_fn, lr_step 13 | #return sche_fn 14 | -------------------------------------------------------------------------------- /stage1/VoxCeleb2/tuneThreshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | import numpy 4 | from sklearn import metrics 5 | from operator import itemgetter 6 | 7 | def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None): 8 | 9 | fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) 10 | fnr = 1 - tpr 11 | 12 | tunedThreshold = []; 13 | if target_fr: 14 | for tfr in target_fr: 15 | idx = numpy.nanargmin(numpy.absolute((tfr - fnr))) 16 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]); 17 | 18 | for tfa in target_fa: 19 | idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1] 20 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]); 21 | 22 | idxE = numpy.nanargmin(numpy.absolute((fnr - fpr))) 23 | eer = max(fpr[idxE],fnr[idxE])*100 24 | 25 | return (tunedThreshold, eer, fpr, fnr); 26 | 27 | # Creates a list of false-negative rates, a list of false-positive rates 28 | # and a list of decision thresholds that give those error-rates. 29 | def ComputeErrorRates(scores, labels): 30 | 31 | # Sort the scores from smallest to largest, and also get the corresponding 32 | # indexes of the sorted scores. We will treat the sorted scores as the 33 | # thresholds at which the the error-rates are evaluated. 34 | sorted_indexes, thresholds = zip(*sorted( 35 | [(index, threshold) for index, threshold in enumerate(scores)], 36 | key=itemgetter(1))) 37 | sorted_labels = [] 38 | labels = [labels[i] for i in sorted_indexes] 39 | fnrs = [] 40 | fprs = [] 41 | 42 | # At the end of this loop, fnrs[i] is the number of errors made by 43 | # incorrectly rejecting scores less than thresholds[i]. And, fprs[i] 44 | # is the total number of times that we have correctly accepted scores 45 | # greater than thresholds[i]. 46 | for i in range(0, len(labels)): 47 | if i == 0: 48 | fnrs.append(labels[i]) 49 | fprs.append(1 - labels[i]) 50 | else: 51 | fnrs.append(fnrs[i-1] + labels[i]) 52 | fprs.append(fprs[i-1] + 1 - labels[i]) 53 | fnrs_norm = sum(labels) 54 | fprs_norm = len(labels) - fnrs_norm 55 | 56 | # Now divide by the total number of false negative errors to 57 | # obtain the false positive rates across all thresholds 58 | fnrs = [x / float(fnrs_norm) for x in fnrs] 59 | 60 | # Divide by the total number of corret positives to get the 61 | # true positive rate. Subtract these quantities from 1 to 62 | # get the false positive rates. 63 | fprs = [1 - x / float(fprs_norm) for x in fprs] 64 | return fnrs, fprs, thresholds 65 | 66 | # Computes the minimum of the detection cost function. The comments refer to 67 | # equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan. 68 | def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa): 69 | min_c_det = float("inf") 70 | min_c_det_threshold = thresholds[0] 71 | for i in range(0, len(fnrs)): 72 | # See Equation (2). it is a weighted sum of false negative 73 | # and false positive errors. 
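        # i.e. C_det(t) = C_miss * P_miss(t) * P_target + C_fa * P_fa(t) * (1 - P_target),
        # with fnrs[i] = P_miss(t) and fprs[i] = P_fa(t) at threshold t = thresholds[i]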
74 | c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target) 75 | if c_det < min_c_det: 76 | min_c_det = c_det 77 | min_c_det_threshold = thresholds[i] 78 | # See Equations (3) and (4). Now we normalize the cost. 79 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 80 | min_dcf = min_c_det / c_def 81 | return min_dcf, min_c_det_threshold -------------------------------------------------------------------------------- /stage2/README.md: -------------------------------------------------------------------------------- 1 | # Stage 2 2 | 3 | Copy-synthesis training recipe and data generation script will be released soon. 4 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/README.md: -------------------------------------------------------------------------------- 1 | # Stage 3 2 | 3 | This repository is developed based on the [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) and [ska-tdnn](https://github.com/msh9184/ska-tdnn). 4 | 5 | ## Dependencies 6 | If you use the Anaconda virtual environment, 7 | ``` 8 | conda create -n sasv python=3.9 cudatoolkit=11.3 9 | conda activate sasv 10 | ``` 11 | Install all dependency packages, 12 | ``` 13 | pip3 install -r requirements.txt 14 | ``` 15 | 16 | ## Models 17 | Three models are included in this repository. You can select the model by the `--model` option: 18 | ``` 19 | ECAPA_TDNN [1] 20 | MFA_Conformer [2] 21 | SKA_TDNN [3] 22 | ``` 23 | 24 | [1] B. Desplanques, J. Thienpondt, and K. Demuynck, "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification," in *Proc. INTERSPEECH*, 2020, pp. 3707-3711. 25 | 26 | [2] Y. Zhang, Z. Lv, H. Wu, S. Zhang, P. Hu, Z. Wu, H. Lee, and H. Meng., “MFA-Conformer: Multi-scale Feature Aggregation Conformer for Automatic Speaker Verification,” in *Proc. INTERSPEECH*, 2022. 27 | 28 | [3] S. H. Mun, J. Jung, M. H. Han, and N. S. Kim, "Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification," in *Proc. IEEE SLT*, 2022. 
29 | 30 | 31 | ## Training 32 | Training example 1: `SKA_TDNN` from scratch using `ASVspoof2019 LA train+dev`, 33 | 34 | ``` 35 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 36 | --max_frames 500 \ 37 | --num_spk 40 \ 38 | --num_utt 2 \ 39 | --batch_size 160 \ 40 | --trainfunc sasv_e2e_v1 \ 41 | --optimizer adamW \ 42 | --scheduler cosine_annealing_warmup_restarts \ 43 | --lr_t0 8 \ 44 | --lr_tmul 1.0 \ 45 | --lr_max 1e-4 \ 46 | --lr_min 0 \ 47 | --lr_wstep 0 \ 48 | --lr_gamma 0.8 \ 49 | --margin 0.2 \ 50 | --scale 30 \ 51 | --num_class 41 \ 52 | --save_path ./save/sasv_baseline_stage3 \ 53 | --train_list ./protocols/ASVspoof2019.LA.cm.train_dev.trn.txt \ 54 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 55 | --train_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA \ 56 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 57 | --spk_meta_train ./spk_meta/spk_meta_trn.pk 58 | --spk_meta_eval ./spk_meta/spk_meta_eval.pk 59 | --musan_path /path/to/dataset/MUSAN/musan_split \ 60 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 61 | --model SKA_TDNN 62 | ``` 63 | 64 | Training example 2: `MFA_Conformer` with pre-trained weight using `ASVspoof2019 LA train`, 65 | ``` 66 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 67 | --max_frames 500 \ 68 | --num_spk 20 \ 69 | --num_utt 2 \ 70 | --batch_size 80 \ 71 | --trainfunc sasv_e2e_v1 \ 72 | --optimizer adamW \ 73 | --scheduler cosine_annealing_warmup_restarts \ 74 | --lr_t0 8 \ 75 | --lr_tmul 1.0 \ 76 | --lr_max 1e-4 \ 77 | --lr_min 0 \ 78 | --lr_wstep 0 \ 79 | --lr_gamma 0.8 \ 80 | --margin 0.2 \ 81 | --scale 30 \ 82 | --num_class 21 \ 83 | --save_path ./save/sasv_baseline_stage3 \ 84 | --train_list ./protocols/ASVspoof2019.LA.cm.train_dev.trn.txt \ 85 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 86 | --train_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA \ 87 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 88 | --spk_meta_train ./spk_meta/spk_meta_trn.pk 89 | --spk_meta_eval ./spk_meta/spk_meta_eval.pk 90 | --musan_path /path/to/dataset/MUSAN/musan_split \ 91 | --rir_path /path/to/dataset/RIRS_NOISES/simulated_rirs \ 92 | --model MFA_Conformer \ 93 | --initial_model /path/to/your_model/pretrained_weight.model 94 | ``` 95 | [In this repository](https://github.com/sasv-challenge/ASVSpoof5-SASVBaseline), you can download several pre-trained weights used in [this paper](https://arxiv.org/pdf/2305.19051.pdf) and fine-tune them using the above command. 96 | 97 | ## Evaluation 98 | Evaluation example: `SKA_TDNN` using `SASV protocol` on the ASVspoof2019 LA eval, 99 | ``` 100 | CUDA_VISIBLE_DEVICES=0 python trainSASVNet.py \ 101 | --eval \ 102 | --eval_frames 0 \ 103 | --num_eval 1 \ 104 | --eval_list ./protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt \ 105 | --eval_path /path/to/dataset/ASVSpoof/ASVSpoof2019/LA/ASVspoof2019_LA_eval/flac \ 106 | --model SKA_TDNN \ 107 | --initial_model /path/to/your_model/pretrained_weight.model 108 | ``` 109 | 110 | ## Citation 111 | If you utilize this repository, please cite the following paper, 112 | ``` 113 | @inproceedings{chung2020in, 114 | title={In defence of metric learning for speaker recognition}, 115 | author={Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang}, 116 | booktitle={Proc. 
Interspeech}, 117 | year={2020} 118 | } 119 | ``` 120 | 121 | ``` 122 | @inproceedings{jung2022pushing, 123 | title={Pushing the limits of raw waveform speaker recognition}, 124 | author={Jung, Jee-weon and Kim, You Jin and Heo, Hee-Soo and Lee, Bong-Jin and Kwon, Youngki and Chung, Joon Son}, 125 | booktitle={Proc. Interspeech}, 126 | year={2022} 127 | } 128 | ``` 129 | 130 | ``` 131 | @inproceedings{mun2022frequency, 132 | title={Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification}, 133 | author={Mun, Sung Hwan and Jung, Jee-weon and Han, Min Hyun and Kim, Nam Soo}, 134 | booktitle={Proc. IEEE SLT}, 135 | year={2022} 136 | } 137 | ``` 138 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/aamsoftmax.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import time, pdb, numpy, math 9 | from utils import accuracy 10 | 11 | class LossFunction(nn.Module): 12 | def __init__(self, num_out, num_class, margin=0.3, scale=15, easy_margin=False, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | 17 | self.m = margin 18 | self.s = scale 19 | self.in_feats = num_out 20 | self.weight = torch.nn.Parameter(torch.FloatTensor(num_class, num_out), requires_grad=True) 21 | self.ce = nn.CrossEntropyLoss() 22 | nn.init.xavier_normal_(self.weight, gain=1) 23 | 24 | self.easy_margin = easy_margin 25 | self.cos_m = math.cos(self.m) 26 | self.sin_m = math.sin(self.m) 27 | 28 | # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] 29 | self.th = math.cos(math.pi - self.m) 30 | self.mm = math.sin(math.pi - self.m) * self.m 31 | 32 | print('Initialized AAMSoftmax margin %.3f scale %.3f'%(self.m,self.s)) 33 | 34 | def forward(self, x, label=None): 35 | 36 | assert x.size()[0] == label.size()[0] 37 | assert x.size()[1] == self.in_feats 38 | 39 | # cos(theta) 40 | cosine = F.linear(F.normalize(x), F.normalize(self.weight)) 41 | # cos(theta + m) 42 | sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) 43 | phi = cosine * self.cos_m - sine * self.sin_m 44 | 45 | if self.easy_margin: 46 | phi = torch.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) 49 | 50 | #one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') 51 | one_hot = torch.zeros_like(cosine) 52 | one_hot.scatter_(1, label.view(-1, 1), 1) 53 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 54 | output = output * self.s 55 | 56 | loss = self.ce(output, label) 57 | prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0] 58 | return loss, prec1 59 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/angleproto_sasv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import time, pdb, numpy 8 | from utils import accuracy 9 | 10 | class LossFunction(nn.Module): 11 | 12 | def __init__(self, init_w1=10.0, init_b1=-5.0, init_w2=10.0, init_b2=-5.0, **kwargs): 13 | super(LossFunction, self).__init__() 14 | 15 | self.test_normalize = True 16 | self.w1 = nn.Parameter(torch.tensor(init_w1)) 17 | self.b1 = nn.Parameter(torch.tensor(init_b1)) 18 | self.criterion = torch.nn.CrossEntropyLoss() 19 | print('Initialized AngleProto') 20 | 21 | def forward(self, x, label=None, num_bna=0): 22 | assert x.size()[1] >= 2 23 | 24 | out_anchor = x[:, 1, :] 25 | out_positive = x[:, 0, :][ :num_bna] 26 | stepsize = out_positive.size()[0] 27 | 28 | cos_sim_matrix1 = F.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2)) 29 | torch.clamp(self.w1, 1e-6) 30 | cos_sim_matrix1 = cos_sim_matrix1 * self.w1 + self.b1 31 | 32 | out_anchor = x[:, 0, :] 33 | out_positive = x[:, 1, :][ :num_bna] 34 | 35 | label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda() 36 | nloss1 = self.criterion(cos_sim_matrix1, label) 37 | nloss = nloss1 38 | 39 | prec1 = accuracy(cos_sim_matrix1.detach(), label.detach(), topk=(1,))[0] 40 | prec = prec1 41 | 42 | return nloss, prec 43 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/loss/sasv_e2e_v1.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | import loss.aamsoftmax as aamsoftmax 7 | import loss.angleproto_sasv as angleproto_sasv 8 | 9 | class LossFunction(nn.Module): 10 | def __init__(self, **kwargs): 11 | super(LossFunction, self).__init__() 12 | self.test_normalize = True 13 | self.aamsoftmax = aamsoftmax.LossFunction(**kwargs) 14 | self.angleproto_sasv = angleproto_sasv.LossFunction(**kwargs) 15 | self.num_class = kwargs.get('num_class') 16 | print('Initialized SASV End-to-end v1 Loss Function') 17 | 18 | def forward(self, x, label=None): 19 | assert x.size()[1] == 2 20 | nlossS, prec = self.aamsoftmax(x.reshape(-1, x.size()[-1]), label.repeat_interleave(2)) 21 | 22 | idx_bna = torch.where(label != self.num_class-1) 23 | idx_spf = torch.where(label == self.num_class-1) 24 | x1 = x[idx_bna] 25 | x2 = x[idx_spf] 26 | x = torch.cat((x1, x2)) 27 | nlossM, _ = self.angleproto_sasv(x, None, len(idx_bna[0])) 28 | 29 | return nlossS + nlossM, prec 30 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import numpy 4 | import torch 5 | from scipy.interpolate import interp1d 6 | from scipy.optimize import brentq 7 | from sklearn.metrics import roc_curve 8 | 9 | 10 | def get_all_EERs( 11 | preds: Union[torch.Tensor, List, numpy.ndarray], keys: List 12 | ) -> List[float]: 13 | """ 14 | Calculate all three EERs used in the SASV Challenge 2022. 
15 | preds and keys should be pre-calculated using dev or eval protocol in 16 | either 'protocols/ASVspoof2019.LA.asv.dev.gi.trl.txt' or 17 | 'protocols/ASVspoof2019.LA.asv.eval.gi.trl.txt' 18 | 19 | :param preds: list of scores in tensor 20 | :param keys: list of keys where each element should be one of 21 | ['target', 'nontarget', 'spoof'] 22 | """ 23 | sasv_labels, sv_labels, spf_labels = [], [], [] 24 | sv_preds, spf_preds = [], [] 25 | 26 | for pred, key in zip(preds, keys): 27 | if key == "target": 28 | sasv_labels.append(1) 29 | sv_labels.append(1) 30 | spf_labels.append(1) 31 | sv_preds.append(pred) 32 | spf_preds.append(pred) 33 | 34 | elif key == "nontarget": 35 | sasv_labels.append(0) 36 | sv_labels.append(0) 37 | sv_preds.append(pred) 38 | 39 | elif key == "spoof": 40 | sasv_labels.append(0) 41 | spf_labels.append(0) 42 | spf_preds.append(pred) 43 | else: 44 | raise ValueError( 45 | f"should be one of 'target', 'nontarget', 'spoof', got:{key}" 46 | ) 47 | 48 | fpr, tpr, _ = roc_curve(sasv_labels, preds, pos_label=1) 49 | sasv_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 50 | 51 | fpr, tpr, _ = roc_curve(sv_labels, sv_preds, pos_label=1) 52 | sv_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 53 | 54 | fpr, tpr, _ = roc_curve(spf_labels, spf_preds, pos_label=1) 55 | spf_eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) 56 | 57 | return sasv_eer*100, sv_eer*100, spf_eer*100 58 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/ECAPA_TDNN.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | ## Here, log_input forces alternative mfcc implementation with pre-emphasis instead of actual log mfcc 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torchaudio 11 | import pdb 12 | from utils import PreEmphasis 13 | 14 | class SEModule(nn.Module): 15 | def __init__(self, channels, bottleneck=128): 16 | super(SEModule, self).__init__() 17 | self.se = nn.Sequential( 18 | nn.AdaptiveAvgPool1d(1), 19 | nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0), 20 | nn.ReLU(), 21 | #nn.BatchNorm1d(bottleneck), 22 | nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0), 23 | nn.Sigmoid(), 24 | ) 25 | 26 | def forward(self, input): 27 | x = self.se(input) 28 | return input * x 29 | 30 | class Bottle2neck(nn.Module): 31 | 32 | def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8): 33 | super(Bottle2neck, self).__init__() 34 | width = int(math.floor(planes / scale)) 35 | self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1) 36 | self.bn1 = nn.BatchNorm1d(width*scale) 37 | self.nums = scale -1 38 | convs = [] 39 | bns = [] 40 | num_pad = math.floor(kernel_size/2)*dilation 41 | for i in range(self.nums): 42 | convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad)) 43 | bns.append(nn.BatchNorm1d(width)) 44 | self.convs = nn.ModuleList(convs) 45 | self.bns = nn.ModuleList(bns) 46 | self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1) 47 | self.bn3 = nn.BatchNorm1d(planes) 48 | self.relu = nn.ReLU() 49 | self.width = width 50 | self.se = SEModule(planes) 51 | 52 | def forward(self, x): 53 | residual = x 54 | out = self.conv1(x) 55 | out = self.relu(out) 56 | out = self.bn1(out) 57 | 58 | spx = torch.split(out, self.width, 1) 59 | for i in 
range(self.nums): 60 | if i==0: 61 | sp = spx[i] 62 | else: 63 | sp = sp + spx[i] 64 | sp = self.convs[i](sp) 65 | sp = self.relu(sp) 66 | sp = self.bns[i](sp) 67 | if i==0: 68 | out = sp 69 | else: 70 | out = torch.cat((out, sp), 1) 71 | out = torch.cat((out, spx[self.nums]),1) 72 | 73 | out = self.conv3(out) 74 | out = self.relu(out) 75 | out = self.bn3(out) 76 | 77 | out = self.se(out) 78 | out += residual 79 | return out 80 | 81 | class FbankAug(nn.Module): 82 | 83 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 84 | self.time_mask_width = time_mask_width 85 | self.freq_mask_width = freq_mask_width 86 | super().__init__() 87 | 88 | def mask_along_axis(self, x, dim): 89 | original_size = x.shape 90 | batch, fea, time = x.shape 91 | if dim == 1: 92 | D = fea 93 | width_range = self.freq_mask_width 94 | else: 95 | D = time 96 | width_range = self.time_mask_width 97 | 98 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 99 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 100 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 101 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 102 | mask = mask.any(dim=1) 103 | 104 | if dim == 1: 105 | mask = mask.unsqueeze(2) 106 | else: 107 | mask = mask.unsqueeze(1) 108 | 109 | x = x.masked_fill_(mask, 0.0) 110 | return x.view(*original_size) 111 | 112 | def forward(self, x): 113 | x = self.mask_along_axis(x, dim=2) 114 | x = self.mask_along_axis(x, dim=1) 115 | return x 116 | 117 | class ECAPA_TDNN(nn.Module): 118 | def __init__(self, block, C, model_scale, log_input=True, num_mels=80, num_out=192, **kwargs): 119 | self.log_input = log_input 120 | super(ECAPA_TDNN, self).__init__() 121 | self.scale = model_scale 122 | self.conv1 = nn.Conv1d(num_mels, C, kernel_size=5, stride=1, padding=2) 123 | self.relu = nn.ReLU() 124 | self.bn1 = nn.BatchNorm1d(C) 125 | self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale) 126 | self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale) 127 | self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale) 128 | self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1) 129 | self.attention = nn.Sequential( 130 | nn.Conv1d(4608, 256, kernel_size=1), 131 | nn.ReLU(), 132 | nn.BatchNorm1d(256), 133 | nn.Tanh(), 134 | nn.Conv1d(256, 1536, kernel_size=1), 135 | nn.Softmax(dim=2), 136 | ) 137 | self.torchfbank = torch.nn.Sequential( 138 | PreEmphasis(), 139 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 140 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=num_mels), 141 | ) 142 | self.specaug = FbankAug() 143 | self.bn5 = nn.BatchNorm1d(3072) 144 | self.fc6 = nn.Linear(3072, num_out) 145 | self.bn6 = nn.BatchNorm1d(num_out) 146 | 147 | def forward(self, x, aug): 148 | with torch.no_grad(): 149 | with torch.cuda.amp.autocast(enabled=False): 150 | x = self.torchfbank(x)+1e-6 151 | if self.log_input: 152 | x = x.log() 153 | x = x - torch.mean(x, dim=-1, keepdim=True) 154 | if aug == True: 155 | x = self.specaug(x) 156 | x = self.conv1(x) 157 | x = self.relu(x) 158 | x = self.bn1(x) 159 | x1 = self.layer1(x) 160 | x2 = self.layer2(x+x1) 161 | x3 = self.layer3(x+x1+x2) 162 | x = self.layer4(torch.cat((x1,x2,x3),dim=1)) 163 | x = self.relu(x) 164 | t = x.size()[-1] 165 | global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), 
torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1) 166 | w = self.attention(global_x) 167 | mu = torch.sum(x * w, dim=2) 168 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 169 | x = torch.cat((mu,sg),1) 170 | x = self.bn5(x) 171 | x = self.fc6(x) 172 | x = self.bn6(x) 173 | return x 174 | 175 | def MainModel(eca_c=1024, eca_s=8, log_input=True, num_mels=80, num_out=192, **kwargs): 176 | model = ECAPA_TDNN(block=Bottle2neck, C=eca_c, model_scale=eca_s, log_input=log_input, num_mels=num_mels, num_out=num_out, **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/MFA_Conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchaudio 4 | from torch import Tensor 5 | from typing import Tuple 6 | from utils import PreEmphasis 7 | from .specaugment import SpecAugment 8 | from .wenet.transformer.encoder_cat import ConformerEncoder 9 | 10 | class Conformer(nn.Module): 11 | def __init__(self, num_mels=80, num_blocks=6, output_size=256, embedding_dim=192, input_layer="conv2d2", pos_enc_layer_type="rel_pos"): 12 | super(Conformer, self).__init__() 13 | print("input_layer: {}".format(input_layer)) 14 | print("pos_enc_layer_type: {}".format(pos_enc_layer_type)) 15 | self.conformer = ConformerEncoder(input_size=num_mels, num_blocks=num_blocks, output_size=output_size, input_layer=input_layer, pos_enc_layer_type=pos_enc_layer_type, ) 16 | self.bn = nn.BatchNorm1d(output_size*num_blocks*2) 17 | self.fc = nn.Linear(output_size*num_blocks*2, embedding_dim) 18 | 19 | self.specaug = SpecAugment() 20 | self.torchfbank = torch.nn.Sequential( 21 | PreEmphasis(), 22 | torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \ 23 | f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80), 24 | ) 25 | output_dim = output_size*num_blocks 26 | self.attention = nn.Sequential( 27 | nn.Conv1d(output_dim*3, 256, kernel_size=1), 28 | nn.ReLU(), 29 | nn.BatchNorm1d(256), 30 | nn.Tanh(), 31 | nn.Conv1d(256, output_dim, kernel_size=1), 32 | nn.Softmax(dim=2), 33 | ) 34 | 35 | def forward(self, x: Tensor, aug=False) -> Tuple[Tensor, bool]: 36 | 37 | with torch.no_grad(): 38 | with torch.cuda.amp.autocast(enabled=False): 39 | x = self.torchfbank(x)+1e-6 40 | x = x.log() 41 | x = x - torch.mean(x, dim=-1, keepdim=True) 42 | if aug == True: 43 | x = self.specaug(x) 44 | x = x.transpose(1,2) 45 | lens = torch.ones(x.shape[0]).to(x.device) 46 | lens = torch.round(lens*x.shape[1]).int() 47 | x, masks = self.conformer(x, lens) 48 | x = x.transpose(1,2) 49 | 50 | # Context dependent ASP 51 | t = x.size()[-1] 52 | global_x = torch.cat((x,torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1) 53 | w = self.attention(global_x) 54 | mu = torch.sum(x * w, dim=2) 55 | sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) ) 56 | x = torch.cat((mu, sg), dim=1) 57 | 58 | # BN -> FC: embedding 59 | x = self.bn(x) 60 | x = self.fc(x) 61 | 62 | return x 63 | 64 | def MainModel(num_mels=80, num_out=192, **kwargs): 65 | model = Conformer(num_mels=num_mels, embedding_dim=num_out, input_layer="conv2d2") 66 | return model 67 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/specaugment.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class SpecAugment(nn.Module): 5 | 6 | def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)): 7 | self.time_mask_width = time_mask_width 8 | self.freq_mask_width = freq_mask_width 9 | super(SpecAugment, self).__init__() 10 | 11 | def mask_along_axis(self, x, dim): 12 | original_size = x.shape 13 | batch, fea, time = x.shape 14 | if dim == 1: 15 | D = fea 16 | width_range = self.freq_mask_width 17 | else: 18 | D = time 19 | width_range = self.time_mask_width 20 | 21 | mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2) 22 | mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) 23 | arange = torch.arange(D, device=x.device).view(1, 1, -1) 24 | mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) 25 | mask = mask.any(dim=1) 26 | 27 | if dim == 1: 28 | mask = mask.unsqueeze(2) 29 | else: 30 | mask = mask.unsqueeze(1) 31 | 32 | x = x.masked_fill_(mask, 0.0) 33 | return x.view(*original_size) 34 | 35 | def forward(self, x): 36 | x = self.mask_along_axis(x, dim=2) 37 | x = self.mask_along_axis(x, dim=1) 38 | return x 39 | 40 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/.train.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/bin/.train.py.swp -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: di.wu@mobvoi.com (DI WU) 3 | import os 4 | import argparse 5 | import glob 6 | 7 | import yaml 8 | import numpy as np 9 | import torch 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='average model') 13 | parser.add_argument('--dst_model', required=True, help='averaged model') 14 | parser.add_argument('--src_path', 15 | required=True, 16 | help='src model path for average') 17 | parser.add_argument('--val_best', 18 | action="store_true", 19 | help='averaged model') 20 | parser.add_argument('--num', 21 | default=5, 22 | type=int, 23 | help='nums for averaged model') 24 | parser.add_argument('--min_epoch', 25 | default=0, 26 | type=int, 27 | help='min epoch used for averaging model') 28 | parser.add_argument('--max_epoch', 29 | default=65536, # Big enough 30 | type=int, 31 | help='max epoch used for averaging model') 32 | 33 | args = parser.parse_args() 34 | print(args) 35 | checkpoints = [] 36 | val_scores = [] 37 | if args.val_best: 38 | yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) 39 | for y in yamls: 40 | with open(y, 'r') as f: 41 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 42 | loss = dic_yaml['cv_loss'] 43 | epoch = dic_yaml['epoch'] 44 | if epoch >= args.min_epoch and epoch <= args.max_epoch: 45 | val_scores += [[epoch, loss]] 46 | val_scores = np.array(val_scores) 47 | sort_idx = np.argsort(val_scores[:, -1]) 48 | sorted_val_scores = val_scores[sort_idx][::1] 49 | print("best val scores = " + str(sorted_val_scores[:args.num, 1])) 50 | print("selected epochs = " + 51 | str(sorted_val_scores[:args.num, 0].astype(np.int64))) 52 | path_list = [ 53 | args.src_path + '/{}.pt'.format(int(epoch)) 54 | for epoch in sorted_val_scores[:args.num, 0] 55 | ] 56 | else: 57 | path_list = glob.glob('{}/[!avg][!final]*.pt'.format(args.src_path)) 58 | path_list = sorted(path_list, key=os.path.getmtime) 59 | path_list = path_list[-args.num:] 60 | print(path_list) 61 | avg = None 62 | num = args.num 63 | assert num == len(path_list) 64 | for path in path_list: 65 | print('Processing {}'.format(path)) 66 | states = torch.load(path, map_location=torch.device('cpu')) 67 | if avg is None: 68 | avg = states 69 | else: 70 | for k in avg.keys(): 71 | avg[k] += states[k] 72 | # average 73 | for k in avg.keys(): 74 | if avg[k] is not None: 75 | # pytorch 1.6 use true_divide instead of /= 76 | avg[k] = torch.true_divide(avg[k], num) 77 | print('Saving to {}'.format(args.dst_model)) 78 | torch.save(avg, args.dst_model) 79 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import os 19 | 20 | import torch 21 | import yaml 22 | 23 | from wenet.transformer.asr_model import init_asr_model 24 | from wenet.utils.checkpoint import load_checkpoint 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='export your script model') 28 | parser.add_argument('--config', required=True, help='config file') 29 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 30 | parser.add_argument('--output_file', required=True, help='output file') 31 | parser.add_argument('--output_quant_file', 32 | default=None, 33 | help='output quantized model file') 34 | args = parser.parse_args() 35 | # No need gpu for model export 36 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 37 | 38 | with open(args.config, 'r') as fin: 39 | configs = yaml.load(fin, Loader=yaml.FullLoader) 40 | model = init_asr_model(configs) 41 | print(model) 42 | 43 | load_checkpoint(model, args.checkpoint) 44 | # Export jit torch script model 45 | 46 | script_model = torch.jit.script(model) 47 | script_model.save(args.output_file) 48 | print('Export model successfully, see {}'.format(args.output_file)) 49 | 50 | # Export quantized jit torch script model 51 | if args.output_quant_file: 52 | quantized_model = torch.quantization.quantize_dynamic( 53 | model, {torch.nn.Linear}, dtype=torch.qint8 54 | ) 55 | print(quantized_model) 56 | script_quant_model = torch.jit.script(quantized_model) 57 | script_quant_model.save(args.output_quant_file) 58 | print('Export quantized model successfully, ' 59 | 'see {}'.format(args.output_quant_file)) 60 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-36.pyc -------------------------------------------------------------------------------- 
/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_cat.cpython-39.pyc -------------------------------------------------------------------------------- 
/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-37.pyc 
-------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | 19 | class GlobalCMVN(torch.nn.Module): 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """ConvolutionModule definition.""" 7 | 8 | from typing import Optional, Tuple 9 | 10 | import torch 11 | from torch import nn 12 | from typeguard import check_argument_types 13 | 14 | 15 | class ConvolutionModule(nn.Module): 16 | """ConvolutionModule in Conformer model.""" 17 | def __init__(self, 18 | channels: int, 19 | kernel_size: int = 15, 20 | activation: nn.Module = nn.ReLU(), 21 | norm: str = "batch_norm", 22 | causal: bool = False, 23 | bias: bool = True): 24 | """Construct an ConvolutionModule object. 25 | Args: 26 | channels (int): The number of channels of conv layers. 27 | kernel_size (int): Kernel size of conv layers. 28 | causal (int): Whether use causal convolution or not 29 | """ 30 | assert check_argument_types() 31 | super().__init__() 32 | 33 | self.pointwise_conv1 = nn.Conv1d( 34 | channels, 35 | 2 * channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0, 39 | bias=bias, 40 | ) 41 | # self.lorder is used to distinguish if it's a causal convolution, 42 | # if self.lorder > 0: it's a causal convolution, the input will be 43 | # padded with self.lorder frames on the left in forward. 44 | # else: it's a symmetrical convolution 45 | if causal: 46 | padding = 0 47 | self.lorder = kernel_size - 1 48 | else: 49 | # kernel_size should be an odd number for none causal convolution 50 | assert (kernel_size - 1) % 2 == 0 51 | padding = (kernel_size - 1) // 2 52 | self.lorder = 0 53 | self.depthwise_conv = nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | stride=1, 58 | padding=padding, 59 | groups=channels, 60 | bias=bias, 61 | ) 62 | 63 | assert norm in ['batch_norm', 'layer_norm'] 64 | if norm == "batch_norm": 65 | self.use_layer_norm = False 66 | self.norm = nn.BatchNorm1d(channels) 67 | else: 68 | self.use_layer_norm = True 69 | self.norm = nn.LayerNorm(channels) 70 | 71 | self.pointwise_conv2 = nn.Conv1d( 72 | channels, 73 | channels, 74 | kernel_size=1, 75 | stride=1, 76 | padding=0, 77 | bias=bias, 78 | ) 79 | self.activation = activation 80 | 81 | def forward( 82 | self, 83 | x: torch.Tensor, 84 | mask_pad: Optional[torch.Tensor] = None, 85 | cache: Optional[torch.Tensor] = None, 86 | ) -> Tuple[torch.Tensor, torch.Tensor]: 87 | """Compute convolution module. 88 | Args: 89 | x (torch.Tensor): Input tensor (#batch, time, channels). 
90 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time) 91 | cache (torch.Tensor): left context cache, it is only 92 | used in causal convolution 93 | Returns: 94 | torch.Tensor: Output tensor (#batch, time, channels). 95 | """ 96 | # exchange the temporal dimension and the feature dimension 97 | x = x.transpose(1, 2) # (#batch, channels, time) 98 | 99 | # mask batch padding 100 | if mask_pad is not None: 101 | x.masked_fill_(~mask_pad, 0.0) 102 | 103 | if self.lorder > 0: 104 | if cache is None: 105 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 106 | else: 107 | assert cache.size(0) == x.size(0) 108 | assert cache.size(1) == x.size(1) 109 | x = torch.cat((cache, x), dim=2) 110 | assert (x.size(2) > self.lorder) 111 | new_cache = x[:, :, -self.lorder:] 112 | else: 113 | # It's better we just return None if no cache is requried, 114 | # However, for JIT export, here we just fake one tensor instead of 115 | # None. 116 | new_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) 117 | 118 | # GLU mechanism 119 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 120 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 121 | 122 | # 1D Depthwise Conv 123 | x = self.depthwise_conv(x) 124 | if self.use_layer_norm: 125 | x = x.transpose(1, 2) 126 | x = self.activation(self.norm(x)) 127 | if self.use_layer_norm: 128 | x = x.transpose(1, 2) 129 | x = self.pointwise_conv2(x) 130 | # mask batch padding 131 | if mask_pad is not None: 132 | x.masked_fill_(~mask_pad, 0.0) 133 | 134 | return x.transpose(1, 2), new_cache 135 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | 6 | class CTC(torch.nn.Module): 7 | """CTC module""" 8 | def __init__( 9 | self, 10 | odim: int, 11 | encoder_output_size: int, 12 | dropout_rate: float = 0.0, 13 | reduce: bool = True, 14 | ): 15 | """ Construct CTC module 16 | Args: 17 | odim: dimension of outputs 18 | encoder_output_size: number of encoder projection units 19 | dropout_rate: dropout rate (0.0 ~ 1.0) 20 | reduce: reduce the CTC loss into a scalar 21 | """ 22 | assert check_argument_types() 23 | super().__init__() 24 | eprojs = encoder_output_size 25 | self.dropout_rate = dropout_rate 26 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 27 | 28 | reduction_type = "sum" if reduce else "none" 29 | self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) 30 | 31 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 32 | ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: 33 | """Calculate CTC loss. 
34 | 35 | Args: 36 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 37 | hlens: batch of lengths of hidden state sequences (B) 38 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 39 | ys_lens: batch of lengths of character sequence (B) 40 | """ 41 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 42 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 43 | # ys_hat: (B, L, D) -> (L, B, D) 44 | ys_hat = ys_hat.transpose(0, 1) 45 | ys_hat = ys_hat.log_softmax(2) 46 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 47 | # Batch-size average 48 | loss = loss / ys_hat.size(1) 49 | return loss 50 | 51 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 52 | """log_softmax of frame activations 53 | 54 | Args: 55 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 56 | Returns: 57 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 58 | """ 59 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 60 | 61 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 62 | """argmax of frame activations 63 | 64 | Args: 65 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 66 | Returns: 67 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 68 | """ 69 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 70 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Decoder self-attention layer definition.""" 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class DecoderLayer(nn.Module): 14 | """Single decoder layer module. 15 | 16 | Args: 17 | size (int): Input dimension. 18 | self_attn (torch.nn.Module): Self-attention module instance. 19 | `MultiHeadedAttention` instance can be used as the argument. 20 | src_attn (torch.nn.Module): Inter-attention module instance. 21 | `MultiHeadedAttention` instance can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward` instance can be used as the argument. 24 | dropout_rate (float): Dropout rate. 25 | normalize_before (bool): 26 | True: use layer_norm before each sub-block. 27 | False: to use layer_norm after each sub-block. 28 | concat_after (bool): Whether to concat attention layer's inpu 29 | and output. 
30 | True: x -> x + linear(concat(x, att(x))) 31 | False: x -> x + att(x) 32 | """ 33 | def __init__( 34 | self, 35 | size: int, 36 | self_attn: nn.Module, 37 | src_attn: nn.Module, 38 | feed_forward: nn.Module, 39 | dropout_rate: float, 40 | normalize_before: bool = True, 41 | concat_after: bool = False, 42 | ): 43 | """Construct an DecoderLayer object.""" 44 | super().__init__() 45 | self.size = size 46 | self.self_attn = self_attn 47 | self.src_attn = src_attn 48 | self.feed_forward = feed_forward 49 | self.norm1 = nn.LayerNorm(size, eps=1e-12) 50 | self.norm2 = nn.LayerNorm(size, eps=1e-12) 51 | self.norm3 = nn.LayerNorm(size, eps=1e-12) 52 | self.dropout = nn.Dropout(dropout_rate) 53 | self.normalize_before = normalize_before 54 | self.concat_after = concat_after 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward( 59 | self, 60 | tgt: torch.Tensor, 61 | tgt_mask: torch.Tensor, 62 | memory: torch.Tensor, 63 | memory_mask: torch.Tensor, 64 | cache: Optional[torch.Tensor] = None 65 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 66 | """Compute decoded features. 67 | 68 | Args: 69 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 70 | tgt_mask (torch.Tensor): Mask for input tensor 71 | (#batch, maxlen_out). 72 | memory (torch.Tensor): Encoded memory 73 | (#batch, maxlen_in, size). 74 | memory_mask (torch.Tensor): Encoded memory mask 75 | (#batch, maxlen_in). 76 | cache (torch.Tensor): cached tensors. 77 | (#batch, maxlen_out - 1, size). 78 | 79 | Returns: 80 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 81 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 82 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 83 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
84 | 85 | """ 86 | residual = tgt 87 | if self.normalize_before: 88 | tgt = self.norm1(tgt) 89 | 90 | if cache is None: 91 | tgt_q = tgt 92 | tgt_q_mask = tgt_mask 93 | else: 94 | # compute only the last frame query keeping dim: max_time_out -> 1 95 | assert cache.shape == ( 96 | tgt.shape[0], 97 | tgt.shape[1] - 1, 98 | self.size, 99 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 100 | tgt_q = tgt[:, -1:, :] 101 | residual = residual[:, -1:, :] 102 | tgt_q_mask = tgt_mask[:, -1:, :] 103 | 104 | if self.concat_after: 105 | tgt_concat = torch.cat( 106 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) 107 | x = residual + self.concat_linear1(tgt_concat) 108 | else: 109 | x = residual + self.dropout( 110 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 111 | if not self.normalize_before: 112 | x = self.norm1(x) 113 | 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | if self.concat_after: 118 | x_concat = torch.cat( 119 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) 120 | x = residual + self.concat_linear2(x_concat) 121 | else: 122 | x = residual + self.dropout( 123 | self.src_attn(x, memory, memory, memory_mask)) 124 | if not self.normalize_before: 125 | x = self.norm2(x) 126 | 127 | residual = x 128 | if self.normalize_before: 129 | x = self.norm3(x) 130 | x = residual + self.dropout(self.feed_forward(x)) 131 | if not self.normalize_before: 132 | x = self.norm3(x) 133 | 134 | if cache is not None: 135 | x = torch.cat([cache, x], dim=1) 136 | 137 | return x, tgt_mask, memory, memory_mask 138 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 5 | # Author: di.wu@mobvoi.com (DI WU) 6 | """Positonal Encoding Module.""" 7 | 8 | import math 9 | from typing import Tuple 10 | 11 | import torch 12 | 13 | 14 | class PositionalEncoding(torch.nn.Module): 15 | """Positional encoding. 16 | 17 | :param int d_model: embedding dim 18 | :param float dropout_rate: dropout rate 19 | :param int max_len: maximum input length 20 | 21 | PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) 22 | PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) 23 | """ 24 | def __init__(self, 25 | d_model: int, 26 | dropout_rate: float, 27 | max_len: int = 50000, 28 | reverse: bool = False): 29 | """Construct an PositionalEncoding object.""" 30 | super().__init__() 31 | self.d_model = d_model 32 | self.xscale = math.sqrt(self.d_model) 33 | self.dropout = torch.nn.Dropout(p=dropout_rate) 34 | self.max_len = max_len 35 | 36 | self.pe = torch.zeros(self.max_len, self.d_model) 37 | position = torch.arange(0, self.max_len, 38 | dtype=torch.float32).unsqueeze(1) 39 | div_term = torch.exp( 40 | torch.arange(0, self.d_model, 2, dtype=torch.float32) * 41 | -(math.log(10000.0) / self.d_model)) 42 | self.pe[:, 0::2] = torch.sin(position * div_term) 43 | self.pe[:, 1::2] = torch.cos(position * div_term) 44 | self.pe = self.pe.unsqueeze(0) 45 | 46 | def forward(self, 47 | x: torch.Tensor, 48 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 49 | """Add positional encoding. 50 | 51 | Args: 52 | x (torch.Tensor): Input. Its shape is (batch, time, ...) 53 | offset (int): position offset 54 | 55 | Returns: 56 | torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) 
57 | torch.Tensor: for compatibility to RelPositionalEncoding 58 | """ 59 | assert offset + x.size(1) < self.max_len 60 | self.pe = self.pe.to(x.device) 61 | pos_emb = self.pe[:, offset:offset + x.size(1)] 62 | x = x * self.xscale + pos_emb 63 | return self.dropout(x), self.dropout(pos_emb) 64 | 65 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 66 | """ For getting encoding in a streaming fashion 67 | 68 | Attention!!!!! 69 | we apply dropout only once at the whole utterance level in a none 70 | streaming way, but will call this function several times with 71 | increasing input size in a streaming scenario, so the dropout will 72 | be applied several times. 73 | 74 | Args: 75 | offset (int): start offset 76 | size (int): requried size of position encoding 77 | 78 | Returns: 79 | torch.Tensor: Corresponding encoding 80 | """ 81 | assert offset + size < self.max_len 82 | return self.dropout(self.pe[:, offset:offset + size]) 83 | 84 | 85 | class RelPositionalEncoding(PositionalEncoding): 86 | """Relative positional encoding module. 87 | See : Appendix B in https://arxiv.org/abs/1901.02860 88 | Args: 89 | d_model (int): Embedding dimension. 90 | dropout_rate (float): Dropout rate. 91 | max_len (int): Maximum input length. 92 | """ 93 | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 100000): 94 | """Initialize class.""" 95 | super().__init__(d_model, dropout_rate, max_len, reverse=True) 96 | 97 | def forward(self, 98 | x: torch.Tensor, 99 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 100 | """Compute positional encoding. 101 | Args: 102 | x (torch.Tensor): Input tensor (batch, time, `*`). 103 | Returns: 104 | torch.Tensor: Encoded tensor (batch, time, `*`). 105 | torch.Tensor: Positional embedding tensor (1, time, `*`). 106 | """ 107 | assert offset + x.size(1) < self.max_len 108 | self.pe = self.pe.to(x.device) 109 | x = x * self.xscale 110 | pos_emb = self.pe[:, offset:offset + x.size(1)] 111 | return self.dropout(x), self.dropout(pos_emb) 112 | 113 | 114 | class NoPositionalEncoding(torch.nn.Module): 115 | """ No position encoding 116 | """ 117 | def __init__(self, d_model: int, dropout_rate: float): 118 | super().__init__() 119 | self.d_model = d_model 120 | self.dropout = torch.nn.Dropout(p=dropout_rate) 121 | 122 | def forward(self, 123 | x: torch.Tensor, 124 | offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: 125 | """ Just return zero vector for interface compatibility 126 | """ 127 | pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) 128 | return self.dropout(x), pos_emb 129 | 130 | def position_encoding(self, offset: int, size: int) -> torch.Tensor: 131 | return torch.zeros(1, size, self.d_model) 132 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Label smoothing module.""" 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | """Label-smoothing loss. 
14 | 15 | In a standard CE loss, the label's data distribution is: 16 | [0,1,2] -> 17 | [ 18 | [1.0, 0.0, 0.0], 19 | [0.0, 1.0, 0.0], 20 | [0.0, 0.0, 1.0], 21 | ] 22 | 23 | In the smoothed version of the CE loss, some probability mass 24 | is taken from the true label prob (1.0) and is divided 25 | among the other labels. 26 | 27 | e.g. 28 | smoothing=0.1 29 | [0,1,2] -> 30 | [ 31 | [0.9, 0.05, 0.05], 32 | [0.05, 0.9, 0.05], 33 | [0.05, 0.05, 0.9], 34 | ] 35 | 36 | Args: 37 | size (int): the number of classes 38 | padding_idx (int): padding class id which will be ignored for loss 39 | smoothing (float): smoothing rate (0.0 means the conventional CE) 40 | normalize_length (bool): 41 | normalize loss by sequence length if True 42 | normalize loss by batch size if False 43 | """ 44 | def __init__(self, 45 | size: int, 46 | padding_idx: int, 47 | smoothing: float, 48 | normalize_length: bool = False): 49 | """Construct a LabelSmoothingLoss object.""" 50 | super(LabelSmoothingLoss, self).__init__() 51 | self.criterion = nn.KLDivLoss(reduction="none") 52 | self.padding_idx = padding_idx 53 | self.confidence = 1.0 - smoothing 54 | self.smoothing = smoothing 55 | self.size = size 56 | self.normalize_length = normalize_length 57 | 58 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 59 | """Compute loss between x and target. 60 | 61 | The model output and data label tensors are flattened to 62 | (batch*seqlen, class) shape and a mask is applied to the 63 | padding part, which should not be included in the loss. 64 | 65 | Args: 66 | x (torch.Tensor): prediction (batch, seqlen, class) 67 | target (torch.Tensor): 68 | target signal masked with self.padding_idx (batch, seqlen) 69 | Returns: 70 | loss (torch.Tensor) : The KL loss, scalar float value 71 | """ 72 | assert x.size(2) == self.size 73 | batch_size = x.size(0) 74 | x = x.view(-1, self.size) 75 | target = target.view(-1) 76 | # use zeros_like instead of torch.no_grad() for true_dist, 77 | # since no_grad() can not be exported by JIT 78 | true_dist = torch.zeros_like(x) 79 | true_dist.fill_(self.smoothing / (self.size - 1)) 80 | ignore = target == self.padding_idx # (B,) 81 | total = len(target) - ignore.sum().item() 82 | target = target.masked_fill(ignore, 0) # avoid -1 index 83 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 84 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 85 | denom = total if self.normalize_length else batch_size 86 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 87 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | """Positionwise feed forward layer definition.""" 7 | 8 | import torch 9 | 10 | 11 | class PositionwiseFeedForward(torch.nn.Module): 12 | """Positionwise feed forward layer. 13 | 14 | The feed forward layer is applied to each position of the sequence. 15 | The output dim is the same as the input dim. 16 | 17 | Args: 18 | idim (int): Input dimension. 19 | hidden_units (int): The number of hidden units. 20 | dropout_rate (float): Dropout rate.
21 | activation (torch.nn.Module): Activation function 22 | """ 23 | def __init__(self, 24 | idim: int, 25 | hidden_units: int, 26 | dropout_rate: float, 27 | activation: torch.nn.Module = torch.nn.ReLU()): 28 | """Construct a PositionwiseFeedForward object.""" 29 | super(PositionwiseFeedForward, self).__init__() 30 | self.w_1 = torch.nn.Linear(idim, hidden_units) 31 | self.activation = activation 32 | self.dropout = torch.nn.Dropout(dropout_rate) 33 | self.w_2 = torch.nn.Linear(hidden_units, idim) 34 | 35 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 36 | """Forward function. 37 | 38 | Args: 39 | xs: input tensor (B, L, D) 40 | Returns: 41 | output tensor, (B, L, D) 42 | """ 43 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 44 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | """Swish() activation function for Conformer.""" 8 | 9 | import torch 10 | 11 | 12 | class Swish(torch.nn.Module): 13 | """Construct an Swish object.""" 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | """Return Swish activation function.""" 16 | return x * torch.sigmoid(x) 17 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-36.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-37.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/models/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | import os 6 | import re 7 | 8 | import yaml 9 | import torch 10 | 11 | 12 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 13 | if torch.cuda.is_available(): 14 | logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) 15 | checkpoint = torch.load(path) 16 | else: 17 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) 18 | checkpoint = torch.load(path, map_location='cpu') 19 | model.load_state_dict(checkpoint) 20 | info_path = re.sub('.pt$', '.yaml', path) 21 | configs = {} 22 | if os.path.exists(info_path): 23 | with open(info_path, 'r') as fin: 24 | configs = yaml.load(fin, Loader=yaml.FullLoader) 25 | return configs 26 | 27 | 28 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 29 | ''' 30 | Args: 31 | infos (dict or None): any info you want to save. 32 | ''' 33 | logging.info('Checkpoint: save to checkpoint %s' % path) 34 | if isinstance(model, torch.nn.DataParallel): 35 | state_dict = model.module.state_dict() 36 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 37 | state_dict = model.module.state_dict() 38 | else: 39 | state_dict = model.state_dict() 40 | torch.save(state_dict, path) 41 | info_path = re.sub('.pt$', '.yaml', path) 42 | if infos is None: 43 | infos = {} 44 | with open(info_path, 'w') as fout: 45 | data = yaml.dump(infos) 46 | fout.write(data) 47 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
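Before the CMVN utilities continue below, here is a minimal usage sketch for the checkpoint helpers in checkpoint.py above. The model, paths, and info values are hypothetical placeholders chosen for illustration; only save_checkpoint and load_checkpoint come from the file itself.

import torch

# A tiny stand-in model (hypothetical; any torch.nn.Module works).
model = torch.nn.Linear(80, 192)

# save_checkpoint() writes the state_dict to the .pt path and dumps the optional
# info dict to a side-car YAML file (exp/model_5.yaml in this case).
save_checkpoint(model, 'exp/model_5.pt', infos={'epoch': 5, 'cv_loss': 1.23})

# load_checkpoint() restores the weights in place and returns whatever was stored
# in the YAML file, or an empty dict if no such file exists.
configs = load_checkpoint(model, 'exp/model_5.pt')
print(configs)  # -> {'epoch': 5, 'cv_loss': 1.23}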
15 | 16 | import json 17 | import math 18 | 19 | import numpy as np 20 | 21 | 22 | def _load_json_cmvn(json_cmvn_file): 23 | """ Load the json format cmvn stats file and calculate cmvn 24 | 25 | Args: 26 | json_cmvn_file: cmvn stats file in json format 27 | 28 | Returns: 29 | a numpy array of [means, vars] 30 | """ 31 | with open(json_cmvn_file) as f: 32 | cmvn_stats = json.load(f) 33 | 34 | means = cmvn_stats['mean_stat'] 35 | variance = cmvn_stats['var_stat'] 36 | count = cmvn_stats['frame_num'] 37 | for i in range(len(means)): 38 | means[i] /= count 39 | variance[i] = variance[i] / count - means[i] * means[i] 40 | if variance[i] < 1.0e-20: 41 | variance[i] = 1.0e-20 42 | variance[i] = 1.0 / math.sqrt(variance[i]) 43 | cmvn = np.array([means, variance]) 44 | return cmvn 45 | 46 | 47 | def _load_kaldi_cmvn(kaldi_cmvn_file): 48 | """ Load the kaldi format cmvn stats file and calculate cmvn 49 | 50 | Args: 51 | kaldi_cmvn_file: kaldi text style global cmvn file, which 52 | is generated by: 53 | compute-cmvn-stats --binary=false scp:feats.scp global_cmvn 54 | 55 | Returns: 56 | a numpy array of [means, vars] 57 | """ 58 | means = [] 59 | variance = [] 60 | with open(kaldi_cmvn_file, 'r') as fid: 61 | # kaldi binary file start with '\0B' 62 | if fid.read(2) == '\0B': 63 | logging.error('kaldi cmvn binary file is not supported, please ' 64 | 'recompute it by: compute-cmvn-stats --binary=false ' 65 | ' scp:feats.scp global_cmvn') 66 | sys.exit(1) 67 | fid.seek(0) 68 | arr = fid.read().split() 69 | assert (arr[0] == '[') 70 | assert (arr[-2] == '0') 71 | assert (arr[-1] == ']') 72 | feat_dim = int((len(arr) - 2 - 2) / 2) 73 | for i in range(1, feat_dim + 1): 74 | means.append(float(arr[i])) 75 | count = float(arr[feat_dim + 1]) 76 | for i in range(feat_dim + 2, 2 * feat_dim + 2): 77 | variance.append(float(arr[i])) 78 | 79 | for i in range(len(means)): 80 | means[i] /= count 81 | variance[i] = variance[i] / count - means[i] * means[i] 82 | if variance[i] < 1.0e-20: 83 | variance[i] = 1.0e-20 84 | variance[i] = 1.0 / math.sqrt(variance[i]) 85 | cmvn = np.array([means, variance]) 86 | return cmvn 87 | 88 | 89 | def load_cmvn(cmvn_file, is_json): 90 | if is_json: 91 | cmvn = _load_json_cmvn(cmvn_file) 92 | else: 93 | cmvn = _load_kaldi_cmvn(cmvn_file) 94 | return cmvn[0], cmvn[1] 95 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/common.py: -------------------------------------------------------------------------------- 1 | """Unility functions for Transformer.""" 2 | 3 | import math 4 | from typing import Tuple, List 5 | 6 | import torch 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | IGNORE_ID = -1 10 | 11 | 12 | def pad_list(xs: List[torch.Tensor], pad_value: int): 13 | """Perform padding for the list of tensors. 14 | 15 | Args: 16 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 17 | pad_value (float): Value for padding. 18 | 19 | Returns: 20 | Tensor: Padded tensor (B, Tmax, `*`). 
21 | 22 | Examples: 23 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 24 | >>> x 25 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 26 | >>> pad_list(x, 0) 27 | tensor([[1., 1., 1., 1.], 28 | [1., 1., 0., 0.], 29 | [1., 0., 0., 0.]]) 30 | 31 | """ 32 | n_batch = len(xs) 33 | max_len = max([x.size(0) for x in xs]) 34 | pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) 35 | pad = pad.fill_(pad_value) 36 | for i in range(n_batch): 37 | pad[i, :xs[i].size(0)] = xs[i] 38 | 39 | return pad 40 | 41 | 42 | def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, 43 | ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: 44 | """Add <sos> and <eos> labels. 45 | 46 | Args: 47 | ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) 48 | sos (int): index of <sos> 49 | eos (int): index of <eos> 50 | ignore_id (int): index of padding 51 | 52 | Returns: 53 | ys_in (torch.Tensor) : (B, Lmax + 1) 54 | ys_out (torch.Tensor) : (B, Lmax + 1) 55 | 56 | Examples: 57 | >>> sos_id = 10 58 | >>> eos_id = 11 59 | >>> ignore_id = -1 60 | >>> ys_pad 61 | tensor([[ 1, 2, 3, 4, 5], 62 | [ 4, 5, 6, -1, -1], 63 | [ 7, 8, 9, -1, -1]], dtype=torch.int32) 64 | >>> ys_in, ys_out = add_sos_eos(ys_pad, sos_id, eos_id, ignore_id) 65 | >>> ys_in 66 | tensor([[10, 1, 2, 3, 4, 5], 67 | [10, 4, 5, 6, 11, 11], 68 | [10, 7, 8, 9, 11, 11]]) 69 | >>> ys_out 70 | tensor([[ 1, 2, 3, 4, 5, 11], 71 | [ 4, 5, 6, 11, -1, -1], 72 | [ 7, 8, 9, 11, -1, -1]]) 73 | """ 74 | _sos = torch.tensor([sos], 75 | dtype=torch.long, 76 | requires_grad=False, 77 | device=ys_pad.device) 78 | _eos = torch.tensor([eos], 79 | dtype=torch.long, 80 | requires_grad=False, 81 | device=ys_pad.device) 82 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 83 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 84 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 85 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 86 | 87 | 88 | def reverse_pad_list(ys_pad: torch.Tensor, 89 | ys_lens: torch.Tensor, 90 | pad_value: float = -1.0) -> torch.Tensor: 91 | """Reverse padding for the list of tensors. 92 | 93 | Args: 94 | ys_pad (tensor): The padded tensor (B, Tokenmax). 95 | ys_lens (tensor): The lengths of the token sequences (B). 96 | pad_value (float): Value for padding. 97 | 98 | Returns: 99 | Tensor: Padded tensor (B, Tokenmax). 100 | 101 | Examples: 102 | >>> x 103 | tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) 104 | >>> reverse_pad_list(x, torch.tensor([4, 3, 2]), 0.0) 105 | tensor([[4, 3, 2, 1], 106 | [7, 6, 5, 0], 107 | [9, 8, 0, 0]]) 108 | 109 | """ 110 | r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) 111 | for y, i in zip(ys_pad, ys_lens)], True, 112 | pad_value) 113 | return r_ys_pad 114 | 115 | 116 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 117 | ignore_label: int) -> float: 118 | """Calculate accuracy. 119 | 120 | Args: 121 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 122 | pad_targets (LongTensor): Target label tensors (B, Lmax). 123 | ignore_label (int): Ignore label id. 124 | 125 | Returns: 126 | float: Accuracy value (0.0 - 1.0).
127 | 128 | """ 129 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 130 | pad_outputs.size(1)).argmax(2) 131 | mask = pad_targets != ignore_label 132 | numerator = torch.sum( 133 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 134 | denominator = torch.sum(mask) 135 | return float(numerator) / float(denominator) 136 | 137 | 138 | def get_activation(act): 139 | """Return activation function.""" 140 | # Lazy load to avoid unused import 141 | #from wenet.transformer.swish import Swish 142 | from ..transformer.swish import Swish 143 | 144 | activation_funcs = { 145 | "hardtanh": torch.nn.Hardtanh, 146 | "tanh": torch.nn.Tanh, 147 | "relu": torch.nn.ReLU, 148 | "selu": torch.nn.SELU, 149 | "swish": Swish, 150 | "gelu": torch.nn.GELU 151 | } 152 | 153 | return activation_funcs[act]() 154 | 155 | 156 | def get_subsample(config): 157 | input_layer = config["encoder_conf"]["input_layer"] 158 | assert input_layer in ["conv2d", "conv2d6", "conv2d8"] 159 | if input_layer == "conv2d": 160 | return 4 161 | elif input_layer == "conv2d6": 162 | return 6 163 | elif input_layer == "conv2d8": 164 | return 8 165 | 166 | 167 | def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: 168 | new_hyp: List[int] = [] 169 | cur = 0 170 | while cur < len(hyp): 171 | if hyp[cur] != 0: 172 | new_hyp.append(hyp[cur]) 173 | prev = cur 174 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 175 | cur += 1 176 | return new_hyp 177 | 178 | 179 | def log_add(args: List[int]) -> float: 180 | """ 181 | Stable log add 182 | """ 183 | if all(a == -float('inf') for a in args): 184 | return -float('inf') 185 | a_max = max(args) 186 | lsp = math.log(sum(math.exp(a - a_max) for a in args)) 187 | return a_max + lsp 188 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/ctc_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Mobvoi Inc. All Rights Reserved. 2 | # Author: binbinzhang@mobvoi.com (Di Wu) 3 | 4 | import numpy as np 5 | import torch 6 | 7 | def insert_blank(label, blank_id=0): 8 | """Insert blank token between every two label token.""" 9 | label = np.expand_dims(label, 1) 10 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 11 | label = np.concatenate([blanks, label], axis=1) 12 | label = label.reshape(-1) 13 | label = np.append(label, label[0]) 14 | return label 15 | 16 | def forced_align(ctc_probs: torch.Tensor, 17 | y: torch.Tensor, 18 | blank_id=0) -> list: 19 | """ctc forced alignment. 
20 | 21 | Args: 22 | torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) 23 | torch.Tensor y: id sequence tensor 1d tensor (L) 24 | int blank_id: blank symbol index 25 | Returns: 26 | torch.Tensor: alignment result 27 | """ 28 | y_insert_blank = insert_blank(y, blank_id) 29 | 30 | log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) 31 | log_alpha = log_alpha - float('inf') # log of zero 32 | state_path = (torch.zeros( 33 | (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 34 | ) # state path 35 | 36 | # init start state 37 | log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] 38 | log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] 39 | 40 | for t in range(1, ctc_probs.size(0)): 41 | for s in range(len(y_insert_blank)): 42 | if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ 43 | s] == y_insert_blank[s - 2]: 44 | candidates = torch.tensor( 45 | [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) 46 | prev_state = [s, s - 1] 47 | else: 48 | candidates = torch.tensor([ 49 | log_alpha[t - 1, s], 50 | log_alpha[t - 1, s - 1], 51 | log_alpha[t - 1, s - 2], 52 | ]) 53 | prev_state = [s, s - 1, s - 2] 54 | log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] 55 | state_path[t, s] = prev_state[torch.argmax(candidates)] 56 | 57 | state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) 58 | 59 | candidates = torch.tensor([ 60 | log_alpha[-1, len(y_insert_blank) - 1], 61 | log_alpha[-1, len(y_insert_blank) - 2] 62 | ]) 63 | prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] 64 | state_seq[-1] = prev_state[torch.argmax(candidates)] 65 | for t in range(ctc_probs.size(0) - 2, -1, -1): 66 | state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] 67 | 68 | output_alignment = [] 69 | for t in range(0, ctc_probs.size(0)): 70 | output_alignment.append(y_insert_blank[state_seq[t, 0]]) 71 | 72 | return output_alignment 73 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Mobvoi Inc. All Rights Reserved. 
2 | # Author: binbinzhang@mobvoi.com (Binbin Zhang) 3 | 4 | import logging 5 | from contextlib import nullcontext 6 | # if your python version < 3.7 use the below one 7 | # from contextlib import suppress as nullcontext 8 | import torch 9 | from torch.nn.utils import clip_grad_norm_ 10 | 11 | 12 | class Executor: 13 | def __init__(self): 14 | self.step = 0 15 | 16 | def train(self, model, optimizer, scheduler, data_loader, device, writer, 17 | args, scaler): 18 | ''' Train one epoch 19 | ''' 20 | model.train() 21 | clip = args.get('grad_clip', 50.0) 22 | log_interval = args.get('log_interval', 10) 23 | rank = args.get('rank', 0) 24 | accum_grad = args.get('accum_grad', 1) 25 | is_distributed = args.get('is_distributed', True) 26 | use_amp = args.get('use_amp', False) 27 | logging.info('using accumulate grad, new batch size is {} times' 28 | 'larger than before'.format(accum_grad)) 29 | if use_amp: 30 | assert scaler is not None 31 | num_seen_utts = 0 32 | num_total_batch = len(data_loader) 33 | for batch_idx, batch in enumerate(data_loader): 34 | key, feats, target, feats_lengths, target_lengths = batch 35 | feats = feats.to(device) 36 | target = target.to(device) 37 | feats_lengths = feats_lengths.to(device) 38 | target_lengths = target_lengths.to(device) 39 | num_utts = target_lengths.size(0) 40 | if num_utts == 0: 41 | continue 42 | context = None 43 | # Disable gradient synchronizations across DDP processes. 44 | # Within this context, gradients will be accumulated on module 45 | # variables, which will later be synchronized. 46 | if is_distributed and batch_idx % accum_grad != 0: 47 | context = model.no_sync 48 | # Used for single gpu training and DDP gradient synchronization 49 | # processes. 50 | else: 51 | context = nullcontext 52 | with context(): 53 | # autocast context 54 | # The more details about amp can be found in 55 | # https://pytorch.org/docs/stable/notes/amp_examples.html 56 | with torch.cuda.amp.autocast(scaler is not None): 57 | loss, loss_att, loss_ctc = model(feats, feats_lengths, 58 | target, target_lengths) 59 | loss = loss / accum_grad 60 | if use_amp: 61 | scaler.scale(loss).backward() 62 | else: 63 | loss.backward() 64 | 65 | num_seen_utts += num_utts 66 | if batch_idx % accum_grad == 0: 67 | if rank == 0 and writer is not None: 68 | writer.add_scalar('train_loss', loss, self.step) 69 | # Use mixed precision training 70 | if use_amp: 71 | scaler.unscale_(optimizer) 72 | grad_norm = clip_grad_norm_(model.parameters(), clip) 73 | # Must invoke scaler.update() if unscale_() is used in the 74 | # iteration to avoid the following error: 75 | # RuntimeError: unscale_() has already been called 76 | # on this optimizer since the last update(). 77 | # We don't check grad here since that if the gradient has 78 | # inf/nan values, scaler.step will skip optimizer.step(). 
79 | scaler.step(optimizer) 80 | scaler.update() 81 | else: 82 | grad_norm = clip_grad_norm_(model.parameters(), clip) 83 | if torch.isfinite(grad_norm): 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | scheduler.step() 87 | self.step += 1 88 | if batch_idx % log_interval == 0: 89 | lr = optimizer.param_groups[0]['lr'] 90 | log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( 91 | batch_idx, num_total_batch, 92 | loss.item() * accum_grad) 93 | if loss_att is not None: 94 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 95 | if loss_ctc is not None: 96 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 97 | log_str += 'lr {:.8f} rank {}'.format(lr, rank) 98 | logging.debug(log_str) 99 | 100 | def cv(self, model, data_loader, device, args): 101 | ''' Cross validation on 102 | ''' 103 | model.eval() 104 | log_interval = args.get('log_interval', 10) 105 | # in order to avoid division by 0 106 | num_seen_utts = 1 107 | total_loss = 0.0 108 | num_total_batch = len(data_loader) 109 | with torch.no_grad(): 110 | for batch_idx, batch in enumerate(data_loader): 111 | key, feats, target, feats_lengths, target_lengths = batch 112 | feats = feats.to(device) 113 | target = target.to(device) 114 | feats_lengths = feats_lengths.to(device) 115 | target_lengths = target_lengths.to(device) 116 | num_utts = target_lengths.size(0) 117 | if num_utts == 0: 118 | continue 119 | loss, loss_att, loss_ctc = model(feats, feats_lengths, target, 120 | target_lengths) 121 | if torch.isfinite(loss): 122 | num_seen_utts += num_utts 123 | total_loss += loss.item() * num_utts 124 | if batch_idx % log_interval == 0: 125 | log_str = 'CV Batch {}/{} loss {:.6f} '.format( 126 | batch_idx, num_total_batch, loss.item()) 127 | if loss_att is not None: 128 | log_str += 'loss_att {:.6f} '.format(loss_att.item()) 129 | if loss_ctc is not None: 130 | log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item()) 131 | log_str += 'history loss {:.6f}'.format(total_loss / 132 | num_seen_utts) 133 | logging.debug(log_str) 134 | 135 | return total_loss, num_seen_utts 136 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/models/wenet/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | from typeguard import check_argument_types 7 | 8 | 9 | class WarmupLR(_LRScheduler): 10 | """The WarmupLR scheduler 11 | 12 | This scheduler is almost same as NoamLR Scheduler except for following 13 | difference: 14 | 15 | NoamLR: 16 | lr = optimizer.lr * model_size ** -0.5 17 | * min(step ** -0.5, step * warmup_step ** -1.5) 18 | WarmupLR: 19 | lr = optimizer.lr * warmup_step ** 0.5 20 | * min(step ** -0.5, step * warmup_step ** -1.5) 21 | 22 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
23 | 24 | """ 25 | 26 | def __init__( 27 | self, 28 | optimizer: torch.optim.Optimizer, 29 | warmup_steps: Union[int, float] = 25000, 30 | last_epoch: int = -1, 31 | ): 32 | assert check_argument_types() 33 | self.warmup_steps = warmup_steps 34 | 35 | # __init__() must be invoked before setting field 36 | # because step() is also invoked in __init__() 37 | super().__init__(optimizer, last_epoch) 38 | 39 | def __repr__(self): 40 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 41 | 42 | def get_lr(self): 43 | step_num = self.last_epoch + 1 44 | return [ 45 | lr 46 | * self.warmup_steps ** 0.5 47 | * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) 48 | for lr in self.base_lrs 49 | ] 50 | 51 | def set_step(self, step: int): 52 | self.last_epoch = step 53 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised Adam optimizer') 9 | return torch.optim.Adam(parameters, lr = lr, weight_decay = weight_decay); 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adamP.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | from adamp import AdamP 6 | 7 | def Optimizer(parameters, lr, weight_decay, **kwargs): 8 | print('Initialised AdamP optimizer') 9 | return AdamP(parameters, lr = lr, betas = (0.9, 0.999), weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/adamW.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised AdamW optimizer') 9 | return torch.optim.AdamW(parameters, lr = lr, weight_decay = weight_decay) 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import torch 5 | 6 | def Optimizer(parameters, lr, weight_decay, **kwargs): 7 | 8 | print('Initialised SGD optimizer') 9 | 10 | return torch.optim.SGD(parameters, lr = lr, momentum = 0.9, weight_decay=weight_decay); 11 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/protocols/ASVspoof2019.LA.asv.eval.female.trn.txt: -------------------------------------------------------------------------------- 1 | LA_0026 LA_E_A6067886,LA_E_A6399397,LA_E_A7328076,LA_E_A7522472,LA_E_A8382737,LA_E_A8628133,LA_E_A8936529,LA_E_A9162657,LA_E_A9477785,LA_E_A9847831,LA_E_A9970430 2 | LA_0041 LA_E_A2600639,LA_E_A2615005,LA_E_A3009558,LA_E_A3893238,LA_E_A4308598,LA_E_A5901043,LA_E_A6995425,LA_E_A8885354,LA_E_A9864327,LA_E_A9903225,LA_E_A9921939 3 | LA_0043 LA_E_A2159621,LA_E_A2750783,LA_E_A2883190,LA_E_A3950726,LA_E_A6019368,LA_E_A6548293,LA_E_A6887259,LA_E_A7782002,LA_E_A8331933,LA_E_A8639897,LA_E_A9500389 4 | LA_0012 LA_E_A1053965,LA_E_A1795401,LA_E_A2440720,LA_E_A2829678,LA_E_A4042686,LA_E_A4233081,LA_E_A5462934,LA_E_A5982169,LA_E_A6508387,LA_E_A7343806,LA_E_A8704258 5 | LA_0031 LA_E_A1478121,LA_E_A2460512,LA_E_A2926096,LA_E_A3041661,LA_E_A3554530,LA_E_A3598858,LA_E_A4171094,LA_E_A4538545,LA_E_A6798483,LA_E_A7032162,LA_E_A7210101 6 | LA_0037 LA_E_A1051956,LA_E_A1196355,LA_E_A2695639,LA_E_A3555619,LA_E_A3654052,LA_E_A3789634,LA_E_A4791598,LA_E_A5467066,LA_E_A5912013,LA_E_A6211829,LA_E_A9327727 7 | LA_0008 LA_E_A1280994,LA_E_A2012637,LA_E_A2281694,LA_E_A3406491,LA_E_A3583360,LA_E_A3917123,LA_E_A5239949,LA_E_A5939507,LA_E_A6514798,LA_E_A9527561,LA_E_A9776482 8 | LA_0029 LA_E_A1987953,LA_E_A2171329,LA_E_A2217302,LA_E_A2864595,LA_E_A4536568,LA_E_A4720897,LA_E_A6298434,LA_E_A7162720,LA_E_A7935803,LA_E_A8288389,LA_E_A9426249 9 | LA_0004 LA_E_A1350785,LA_E_A2578761,LA_E_A3090180,LA_E_A4176263,LA_E_A4801136,LA_E_A5523997,LA_E_A5740231,LA_E_A6371783,LA_E_A7554553,LA_E_A8796242,LA_E_A9929520 10 | LA_0045 LA_E_A1383315,LA_E_A2359362,LA_E_A3521251,LA_E_A3574364,LA_E_A3821822,LA_E_A6030971,LA_E_A6066016,LA_E_A6756990,LA_E_A7643994,LA_E_A8520532,LA_E_A8820512 11 | LA_0010 LA_E_A3134888,LA_E_A3904853,LA_E_A4006253,LA_E_A4351518,LA_E_A6136654,LA_E_A6138183,LA_E_A6795707,LA_E_A7714540,LA_E_A7831194,LA_E_A8013850,LA_E_A8512745 12 | LA_0034 LA_E_A3254492,LA_E_A3353969,LA_E_A3384384,LA_E_A4969775,LA_E_A5459864,LA_E_A5902299,LA_E_A8288456,LA_E_A8437596,LA_E_A8639383,LA_E_A9210248,LA_E_A9760674 13 | LA_0033 LA_E_A2278657,LA_E_A3618104,LA_E_A3820212,LA_E_A6157632,LA_E_A6455734,LA_E_A6691385,LA_E_A7273228,LA_E_A7511208,LA_E_A7969490,LA_E_A8804831,LA_E_A9024376 14 | LA_0042 LA_E_A1198708,LA_E_A2193411,LA_E_A4244162,LA_E_A5124438,LA_E_A5815457,LA_E_A6081776,LA_E_A6924100,LA_E_A7506556,LA_E_A7888496,LA_E_A9618297,LA_E_A9829952 15 | LA_0035 LA_E_A1585336,LA_E_A2526555,LA_E_A2940472,LA_E_A3836347,LA_E_A4034855,LA_E_A4336680,LA_E_A4435680,LA_E_A6082825,LA_E_A6703766,LA_E_A6711472,LA_E_A7735424 16 | LA_0027 LA_E_A2777383,LA_E_A2992932,LA_E_A3321288,LA_E_A3345148,LA_E_A3778747,LA_E_A6684049,LA_E_A8100239,LA_E_A8239552,LA_E_A9038375,LA_E_A9435429,LA_E_A9515193 17 | LA_0014 LA_E_A1585392,LA_E_A3658404,LA_E_A3770777,LA_E_A4006695,LA_E_A5854979,LA_E_A6602358,LA_E_A7811753,LA_E_A8603666,LA_E_A9184573,LA_E_A9884360,LA_E_A9929223 18 | LA_0024 LA_E_A3614213,LA_E_A4614013,LA_E_A4759484,LA_E_A5641333,LA_E_A6256166,LA_E_A6801379,LA_E_A7238447,LA_E_A7361812,LA_E_A7830058,LA_E_A8972377,LA_E_A9143306 19 | LA_0016 
LA_E_A1249655,LA_E_A3719322,LA_E_A4901704,LA_E_A5038438,LA_E_A5839270,LA_E_A6842353,LA_E_A7666759,LA_E_A7929078,LA_E_A7931622,LA_E_A8034727,LA_E_A9015457 20 | LA_0017 LA_E_A2900556,LA_E_A3225162,LA_E_A3469512,LA_E_A3519492,LA_E_A3822959,LA_E_A4628943,LA_E_A4646149,LA_E_A6677949,LA_E_A7665790,LA_E_A8557348,LA_E_A9898607 21 | LA_0019 LA_E_A1798025,LA_E_A1967771,LA_E_A2242096,LA_E_A5065809,LA_E_A6428663,LA_E_A6744935,LA_E_A7454132,LA_E_A8228964,LA_E_A8882063,LA_E_A9150661,LA_E_A9687807 22 | LA_0006 LA_E_A1871167,LA_E_A2856296,LA_E_A3183649,LA_E_A5037636,LA_E_A5890944,LA_E_A6542679,LA_E_A7145264,LA_E_A7504228,LA_E_A7662019,LA_E_A7961119,LA_E_A8497664 23 | LA_0039 LA_E_A1581105,LA_E_A3423971,LA_E_A3926347,LA_E_A4064171,LA_E_A4542113,LA_E_A4798522,LA_E_A5271858,LA_E_A6236286,LA_E_A7588072,LA_E_A8574000,LA_E_A9406597 24 | LA_0009 LA_E_A1204442,LA_E_A1390066,LA_E_A2232628,LA_E_A2677276,LA_E_A4267005,LA_E_A6283872,LA_E_A6317736,LA_E_A8023214,LA_E_A8460072,LA_E_A8566051,LA_E_A8871185 25 | LA_0047 LA_E_A2673534,LA_E_A3911996,LA_E_A7620458,LA_E_A7781578,LA_E_A7882315,LA_E_A8076326,LA_E_A9217821,LA_E_A9598155,LA_E_A9693501,LA_E_A9831653,LA_E_A9940557 26 | LA_0022 LA_E_A1478713,LA_E_A1598631,LA_E_A1639326,LA_E_A4722070,LA_E_A6663116,LA_E_A7227229,LA_E_A7869414,LA_E_A8042701,LA_E_A9686199,LA_E_A9827503,LA_E_A9975938 27 | LA_0020 LA_E_A1584444,LA_E_A1952273,LA_E_A2683985,LA_E_A2921021,LA_E_A3033742,LA_E_A3374020,LA_E_A4668744,LA_E_A6438857,LA_E_A6716830,LA_E_A7975193,LA_E_A8158713 28 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/protocols/ASVspoof2019.LA.asv.eval.male.trn.txt: -------------------------------------------------------------------------------- 1 | LA_0007 LA_E_A1579796,LA_E_A1669549,LA_E_A2092700,LA_E_A2287839,LA_E_A3734175,LA_E_A4085221,LA_E_A4165051,LA_E_A5069362,LA_E_A5863027,LA_E_A6161186,LA_E_A6315891,LA_E_A6479863,LA_E_A6565965,LA_E_A7384060,LA_E_A8093886,LA_E_A8763189,LA_E_A9002777,LA_E_A9429276,LA_E_A9569588 2 | LA_0003 LA_E_A2038846,LA_E_A3217904,LA_E_A3998756,LA_E_A4032623,LA_E_A4070913,LA_E_A4500897,LA_E_A4850706,LA_E_A5052570,LA_E_A5475189,LA_E_A6131259,LA_E_A6533314,LA_E_A7406609,LA_E_A7570344,LA_E_A7626807,LA_E_A8643969,LA_E_A8963027,LA_E_A9136617,LA_E_A9665991,LA_E_A9967055 3 | LA_0015 LA_E_A1096168,LA_E_A1942801,LA_E_A3022922,LA_E_A3666614,LA_E_A4126859,LA_E_A4704011,LA_E_A4922158,LA_E_A5080077,LA_E_A5555947,LA_E_A5752283,LA_E_A6140229,LA_E_A6479043,LA_E_A6801198,LA_E_A7216041,LA_E_A7344985,LA_E_A7455161,LA_E_A7915419,LA_E_A9103743,LA_E_A9352992 4 | LA_0005 LA_E_A1061661,LA_E_A1151528,LA_E_A1357547,LA_E_A1552302,LA_E_A1805932,LA_E_A1982652,LA_E_A2562108,LA_E_A2758018,LA_E_A4401481,LA_E_A4626256,LA_E_A5518426,LA_E_A5938161,LA_E_A6003322,LA_E_A6467075,LA_E_A8309685,LA_E_A8311618,LA_E_A8599621,LA_E_A8945195,LA_E_A9175975 5 | LA_0048 LA_E_A1286158,LA_E_A2120927,LA_E_A2329958,LA_E_A2636110,LA_E_A2645632,LA_E_A2917108,LA_E_A3109618,LA_E_A3573642,LA_E_A3971551,LA_E_A4727672,LA_E_A5591943,LA_E_A5764619,LA_E_A6442982,LA_E_A6681433,LA_E_A7385173,LA_E_A8136322,LA_E_A8439465,LA_E_A8533861,LA_E_A8968933 6 | LA_0038 LA_E_A1116626,LA_E_A1174652,LA_E_A1378276,LA_E_A1620277,LA_E_A2266714,LA_E_A2452306,LA_E_A2777403,LA_E_A4251249,LA_E_A4278406,LA_E_A4473171,LA_E_A4702188,LA_E_A4994837,LA_E_A5683769,LA_E_A6631075,LA_E_A7036528,LA_E_A7814286,LA_E_A8945198,LA_E_A9414340,LA_E_A9471582 7 | LA_0032 
LA_E_A1323399,LA_E_A1790388,LA_E_A1974946,LA_E_A2004276,LA_E_A2116071,LA_E_A3263506,LA_E_A4712026,LA_E_A5584172,LA_E_A5835629,LA_E_A6845132,LA_E_A7589780,LA_E_A7785728,LA_E_A7992084,LA_E_A8182193,LA_E_A8650561,LA_E_A8942083,LA_E_A9290365,LA_E_A9407859,LA_E_A9538728 8 | LA_0046 LA_E_A2466976,LA_E_A2708278,LA_E_A3324898,LA_E_A3424299,LA_E_A3727759,LA_E_A4717497,LA_E_A4877678,LA_E_A5090657,LA_E_A6084667,LA_E_A6278681,LA_E_A6963337,LA_E_A7299292,LA_E_A7344026,LA_E_A7444716,LA_E_A7965023,LA_E_A8189181,LA_E_A8746471,LA_E_A9211497,LA_E_A9244386 9 | LA_0018 LA_E_A1222015,LA_E_A1230488,LA_E_A1315575,LA_E_A1643822,LA_E_A1707724,LA_E_A1737837,LA_E_A2840122,LA_E_A2952996,LA_E_A3868494,LA_E_A4039519,LA_E_A5209228,LA_E_A6894935,LA_E_A8525661,LA_E_A8817156,LA_E_A8961129,LA_E_A9262197,LA_E_A9386932,LA_E_A9585735,LA_E_A9592918 10 | LA_0013 LA_E_A1254130,LA_E_A1554007,LA_E_A1781636,LA_E_A2608252,LA_E_A3276842,LA_E_A3750761,LA_E_A4070304,LA_E_A4707964,LA_E_A5038290,LA_E_A5464563,LA_E_A6151518,LA_E_A6168904,LA_E_A8428454,LA_E_A8489144,LA_E_A9058515,LA_E_A9369469,LA_E_A9529647,LA_E_A9712403,LA_E_A9965149 11 | LA_0036 LA_E_A1478180,LA_E_A2389854,LA_E_A3071586,LA_E_A3429891,LA_E_A3527473,LA_E_A3599121,LA_E_A3618518,LA_E_A4409611,LA_E_A4484976,LA_E_A4545756,LA_E_A5861351,LA_E_A6082924,LA_E_A7146309,LA_E_A8502724,LA_E_A8557047,LA_E_A8731251,LA_E_A9137383,LA_E_A9679505,LA_E_A9936942 12 | LA_0023 LA_E_A1071592,LA_E_A1251439,LA_E_A3097813,LA_E_A3106218,LA_E_A3226899,LA_E_A4341221,LA_E_A4391250,LA_E_A4493336,LA_E_A5208875,LA_E_A5227434,LA_E_A5820641,LA_E_A6536184,LA_E_A6588986,LA_E_A7069053,LA_E_A7353069,LA_E_A7695856,LA_E_A8192912,LA_E_A8848010,LA_E_A9509852 13 | LA_0030 LA_E_A1225426,LA_E_A1723876,LA_E_A2079871,LA_E_A2542905,LA_E_A3024244,LA_E_A3707417,LA_E_A4060012,LA_E_A4091866,LA_E_A4748816,LA_E_A5206867,LA_E_A5646760,LA_E_A5705432,LA_E_A6299358,LA_E_A6419571,LA_E_A7089762,LA_E_A7208817,LA_E_A8161790,LA_E_A8341026,LA_E_A8940970 14 | LA_0002 LA_E_A1235554,LA_E_A1469990,LA_E_A1831517,LA_E_A1853447,LA_E_A1935359,LA_E_A2186276,LA_E_A3024508,LA_E_A3451001,LA_E_A4621792,LA_E_A4969008,LA_E_A5304363,LA_E_A6514235,LA_E_A6819813,LA_E_A6842395,LA_E_A8179954,LA_E_A8256043,LA_E_A8972701,LA_E_A9233994,LA_E_A9236736 15 | LA_0040 LA_E_A1709351,LA_E_A3754389,LA_E_A3976416,LA_E_A4523735,LA_E_A4571870,LA_E_A4713889,LA_E_A5779272,LA_E_A5952468,LA_E_A5956752,LA_E_A6986555,LA_E_A7814135,LA_E_A8009345,LA_E_A8011520,LA_E_A8283071,LA_E_A8897121,LA_E_A9015000,LA_E_A9046210,LA_E_A9555184,LA_E_A9618678 16 | LA_0028 LA_E_A2083414,LA_E_A2201003,LA_E_A2288128,LA_E_A2321693,LA_E_A2453228,LA_E_A2530101,LA_E_A2591871,LA_E_A2681646,LA_E_A3004056,LA_E_A3583344,LA_E_A4717284,LA_E_A5122263,LA_E_A5247656,LA_E_A5742757,LA_E_A5998686,LA_E_A7065441,LA_E_A7350428,LA_E_A9276815,LA_E_A9917416 17 | LA_0011 LA_E_A1163470,LA_E_A1376388,LA_E_A1452392,LA_E_A1659783,LA_E_A1923413,LA_E_A2291547,LA_E_A3362381,LA_E_A3892117,LA_E_A4237020,LA_E_A4698529,LA_E_A4919423,LA_E_A8390905,LA_E_A8691184,LA_E_A8832569,LA_E_A8910908,LA_E_A9344861,LA_E_A9425880,LA_E_A9646908,LA_E_A9997819 18 | LA_0001 LA_E_A1160049,LA_E_A2302299,LA_E_A2559248,LA_E_A2611901,LA_E_A3168085,LA_E_A3558732,LA_E_A3587142,LA_E_A3990835,LA_E_A4301313,LA_E_A4969429,LA_E_A5117382,LA_E_A5384841,LA_E_A5746825,LA_E_A7509220,LA_E_A7658339,LA_E_A7870154,LA_E_A8299065,LA_E_A8612771,LA_E_A8960118 19 | LA_0044 
LA_E_A1806024,LA_E_A2090824,LA_E_A2291355,LA_E_A2455422,LA_E_A3157518,LA_E_A3810036,LA_E_A4030688,LA_E_A4368613,LA_E_A5939534,LA_E_A6156462,LA_E_A6308549,LA_E_A6728526,LA_E_A6957322,LA_E_A7103478,LA_E_A7501718,LA_E_A7865122,LA_E_A8444324,LA_E_A9236430,LA_E_A9472289 20 | LA_0021 LA_E_A1257083,LA_E_A1365554,LA_E_A1582740,LA_E_A2130868,LA_E_A2199656,LA_E_A2834548,LA_E_A3176274,LA_E_A3291031,LA_E_A5097897,LA_E_A5759463,LA_E_A6646147,LA_E_A6854846,LA_E_A7005142,LA_E_A7542238,LA_E_A8132158,LA_E_A8533788,LA_E_A9316550,LA_E_A9431669,LA_E_A9803663 21 | LA_0025 LA_E_A1598214,LA_E_A1661584,LA_E_A1895385,LA_E_A2078766,LA_E_A3024856,LA_E_A3865801,LA_E_A4030769,LA_E_A4269240,LA_E_A4707130,LA_E_A5390276,LA_E_A6262999,LA_E_A6517068,LA_E_A6562971,LA_E_A6650474,LA_E_A6741138,LA_E_A6976406,LA_E_A7054590,LA_E_A7559134,LA_E_A8796096 22 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | torch==1.12.1+cu113 3 | torchaudio==0.12.1+cu113 4 | numpy==1.24.4 5 | scipy 6 | scikit-learn 7 | tqdm 8 | pyyaml 9 | soundfile 10 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/scheduler/cosine_annealing_warmup_restarts.py: -------------------------------------------------------------------------------- 1 | # ref: https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup/blob/master/cosine_annealing_warmup/scheduler.py 2 | #! /usr/bin/python 3 | # -*- encoding: utf-8 -*- 4 | import math 5 | import torch 6 | from torch.optim.lr_scheduler import _LRScheduler 7 | 8 | class CosineAnnealingWarmupRestarts(_LRScheduler): 9 | def __init__(self, 10 | optimizer : torch.optim.Optimizer, 11 | first_cycle_steps : int, 12 | cycle_mult : float = 1., 13 | max_lr : float = 0.1, 14 | min_lr : float = 0.001, 15 | warmup_steps : int = 0, 16 | gamma : float = 1., 17 | last_epoch : int = -1 18 | ): 19 | assert warmup_steps < first_cycle_steps 20 | self.first_cycle_steps = first_cycle_steps # first cycle step size 21 | self.cycle_mult = cycle_mult # cycle steps magnification 22 | self.base_max_lr = max_lr # first max learning rate 23 | self.max_lr = max_lr # max learning rate in the current cycle 24 | self.min_lr = min_lr # min learning rate 25 | self.warmup_steps = warmup_steps # warmup step size 26 | self.gamma = gamma # decrease rate of max learning rate by cycle 27 | self.cur_cycle_steps = first_cycle_steps # first cycle step size 28 | self.cycle = 0 # cycle count 29 | self.step_in_cycle = last_epoch # step size of the current cycle 30 | super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch) 31 | self.init_lr() 32 | 33 | def init_lr(self): 34 | self.base_lrs = [] 35 | for param_group in self.optimizer.param_groups: 36 | param_group['lr'] = self.min_lr 37 | self.base_lrs.append(self.min_lr) 38 | 39 | def get_lr(self): 40 | if self.step_in_cycle == -1: 41 | return self.base_lrs 42 | elif self.step_in_cycle < self.warmup_steps: 43 | return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs] 44 | else: 45 | return [base_lr + (self.max_lr - base_lr) \ 46 | * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \ 47 | / (self.cur_cycle_steps - self.warmup_steps))) / 2 48 | for base_lr in self.base_lrs] 49 | 50 | def step(self, epoch=None): 51 | if epoch is None: 52 | epoch = 
self.last_epoch + 1 53 | self.step_in_cycle = self.step_in_cycle + 1 54 | if self.step_in_cycle >= self.cur_cycle_steps: 55 | self.cycle += 1 56 | self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps 57 | self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps 58 | else: 59 | if epoch >= self.first_cycle_steps: 60 | if self.cycle_mult == 1.: 61 | self.step_in_cycle = epoch % self.first_cycle_steps 62 | self.cycle = epoch // self.first_cycle_steps 63 | else: 64 | n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)) 65 | self.cycle = n 66 | self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)) 67 | self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n) 68 | else: 69 | self.cur_cycle_steps = self.first_cycle_steps 70 | self.step_in_cycle = epoch 71 | 72 | self.max_lr = self.base_max_lr * (self.gamma**self.cycle) 73 | self.last_epoch = math.floor(epoch) 74 | for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): 75 | param_group['lr'] = lr 76 | 77 | 78 | def Scheduler(optimizer, lr_t0, lr_tmul, lr_max, lr_min, lr_wstep, lr_gamma, **kwargs): 79 | sche_fn = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=lr_t0, cycle_mult=lr_tmul, max_lr=lr_max, min_lr=lr_min, warmup_steps=lr_wstep, gamma=lr_gamma) 80 | lr_step = 'epoch' 81 | print('Initialised CosineAnnealingWarmupRestarts scheduler') 82 | return sche_fn, lr_step 83 | #return sche_fn 84 | -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_dev.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_dev.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_eval.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_eval.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/spk_meta/spk_meta_trn.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sasv-challenge/SASV2_Baseline/032f2e82b2de84a97cedcfc06122cb862a98f3ff/stage3/ASVspoof2019/spk_meta/spk_meta_trn.pk -------------------------------------------------------------------------------- /stage3/ASVspoof2019/tuneThreshold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | import numpy 4 | from sklearn import metrics 5 | from operator import itemgetter 6 | 7 | def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None): 8 | 9 | fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) 10 | fnr = 1 - tpr 11 | 12 | tunedThreshold = [] 13 | if target_fr: 14 | for tfr in target_fr: 15 | idx = numpy.nanargmin(numpy.absolute((tfr - fnr))) 16 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]) 17 | 18 | for tfa in target_fa: 19 | idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1] 20 | tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]) 21 | 22 | idxE = 
numpy.nanargmin(numpy.absolute((fnr - fpr))) 23 | eer = max(fpr[idxE],fnr[idxE])*100 24 | 25 | return (tunedThreshold, eer, fpr, fnr) 26 | 27 | # Creates a list of false-negative rates, a list of false-positive rates 28 | # and a list of decision thresholds that give those error-rates. 29 | def ComputeErrorRates(scores, labels): 30 | 31 | # Sort the scores from smallest to largest, and also get the corresponding 32 | # indexes of the sorted scores. We will treat the sorted scores as the 33 | # thresholds at which the the error-rates are evaluated. 34 | sorted_indexes, thresholds = zip(*sorted( 35 | [(index, threshold) for index, threshold in enumerate(scores)], 36 | key=itemgetter(1))) 37 | sorted_labels = [] 38 | labels = [labels[i] for i in sorted_indexes] 39 | fnrs = [] 40 | fprs = [] 41 | 42 | # At the end of this loop, fnrs[i] is the number of errors made by 43 | # incorrectly rejecting scores less than thresholds[i]. And, fprs[i] 44 | # is the total number of times that we have correctly accepted scores 45 | # greater than thresholds[i]. 46 | for i in range(0, len(labels)): 47 | if i == 0: 48 | fnrs.append(labels[i]) 49 | fprs.append(1 - labels[i]) 50 | else: 51 | fnrs.append(fnrs[i-1] + labels[i]) 52 | fprs.append(fprs[i-1] + 1 - labels[i]) 53 | fnrs_norm = sum(labels) 54 | fprs_norm = len(labels) - fnrs_norm 55 | 56 | # Now divide by the total number of false negative errors to 57 | # obtain the false positive rates across all thresholds 58 | fnrs = [x / float(fnrs_norm) for x in fnrs] 59 | 60 | # Divide by the total number of corret positives to get the 61 | # true positive rate. Subtract these quantities from 1 to 62 | # get the false positive rates. 63 | fprs = [1 - x / float(fprs_norm) for x in fprs] 64 | return fnrs, fprs, thresholds 65 | 66 | # Computes the minimum of the detection cost function. The comments refer to 67 | # equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan. 68 | def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa): 69 | min_c_det = float("inf") 70 | min_c_det_threshold = thresholds[0] 71 | for i in range(0, len(fnrs)): 72 | # See Equation (2). it is a weighted sum of false negative 73 | # and false positive errors. 74 | c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target) 75 | if c_det < min_c_det: 76 | min_c_det = c_det 77 | min_c_det_threshold = thresholds[i] 78 | # See Equations (3) and (4). Now we normalize the cost. 79 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 80 | min_dcf = min_c_det / c_def 81 | return min_dcf, min_c_det_threshold --------------------------------------------------------------------------------
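To make the threshold-tuning utilities above concrete, the following is a minimal sketch of driving tuneThreshold.py end to end. The scores, labels, and operating-point parameters (a 1% false-acceptance target, p_target=0.05, unit costs) are illustrative assumptions, not values prescribed by the repository; in practice the scores would come from evaluating a trained model on a trial list.

# Run from stage3/ASVspoof2019/ so tuneThreshold.py is importable.
from tuneThreshold import tuneThresholdfromScore, ComputeErrorRates, ComputeMinDcf

# Toy trial scores with ground-truth labels (1 = target, 0 = non-target).
scores = [0.92, 0.81, 0.75, 0.63, 0.42, 0.31, 0.22, 0.10]
labels = [1, 1, 0, 1, 0, 1, 0, 0]

# EER (in percent) plus a threshold tuned for a 1% false-acceptance rate.
tuned, eer, fpr, fnr = tuneThresholdfromScore(scores, labels, target_fa=[0.01])
print('EER = {:.2f}%'.format(eer))

# minDCF at an assumed operating point (p_target, c_miss, c_fa).
fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
min_dcf, dcf_threshold = ComputeMinDcf(fnrs, fprs, thresholds,
                                       p_target=0.05, c_miss=1, c_fa=1)
print('minDCF = {:.4f} at threshold {:.3f}'.format(min_dcf, dcf_threshold))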