├── log └── nothing ├── data ├── dev │ └── empty ├── eval │ └── empty ├── train │ └── empty └── eval2021 │ └── empty ├── models └── nothing ├── scores └── nothing ├── .gitignore ├── env.yml ├── README.md ├── model.py ├── tools ├── dataset_loader.py └── audio_utils.py ├── test.py ├── evaluate_tDCF_asvspoof19.py ├── loss.py ├── feature_layers.py ├── resnet.py ├── main.py └── evaluation_metrics.py /log/nothing: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/dev/empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/eval/empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/train/empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/nothing: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scores/nothing: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/eval2021/empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ./data/train/*.flac 2 | ./data/eval/*.flac 3 | ./data/eval2021/*.flac 4 | ./oc-rep 5 | ./cqcc 6 | ./.idea -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: pytorch-1.6 2 | channels: 3 | - defaults 4 | - pytorch 5 | - conda-forge 6 | - anaconda 7 | dependencies: 8 | - python=3.8 9 | - pytorch::pytorch=1.6 10 | - cudatoolkit=9.2 11 | - pytorch::torchvision=0.7.0 12 | - pytorch::torchaudio=0.6.0 13 | - scipy=1.4.1 14 | - numpy=1.18.1 15 | - conda-forge::libsndfile=1.0.31 16 | - conda-forge::pysoundfile 17 | - numba=0.48.0 18 | - librosa=0.8.0 19 | - mir_eval=0.6 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # voice-spoof-detection-system 2 | This is the implementation of our work titled "A Countermeasure Based on CQT Spectrogram for Deepfake Speech Detection" was presented in the 7th International Conference on Signal Processing and Intelligent Systems (ICSPIS), Dec 2021. 3 | 4 | We are using CQT spectrogram as input and a ResNet-18 with self-attention for feature extraction. 5 | For a better discrimination of genuine samples from fake ones we use One Class Softmax. 6 | 7 | Some part of the codes are borrowed from https://github.com/yzyouzhang/AIR-ASVspoof and https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts. 8 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from resnet import * 2 | from loss import * 3 | import nnAudio.Spectrogram as torch_spec 4 | from torchaudio import transforms 5 | 6 | 7 | class Model(nn.Module): 8 | def __init__(self, input_channels, num_classes, device): 9 | super(Model, self).__init__() 10 | 11 | self.device = device 12 | self.cqt = torch_spec.CQT(output_format='Complex', sr=16000).to(device) 13 | self.amp_to_db = transforms.AmplitudeToDB() 14 | self.resnet = ResNet(3, 256, resnet_type='18', nclasses=256).to(device) 15 | 16 | self.mlp_layer1 = nn.Linear(num_classes, 256).to(device) 17 | self.mlp_layer2 = nn.Linear(256, 256).to(device) 18 | self.mlp_layer3 = nn.Linear(256, 256).to(device) 19 | self.drop_out = nn.Dropout(0.5) 20 | 21 | self.oc_softmax = OCSoftmax(256).to(device) 22 | 23 | def forward(self, x, labels, is_train=True): 24 | x = x.to(self.device) 25 | 26 | x = self.cqt(x) 27 | x = torch.pow(x[:, :, :, 0], 2) + torch.pow(x[:, :, :, 1], 2) 28 | 29 | x = self.amp_to_db(x) 30 | 31 | feat, mu = self.resnet(x.unsqueeze(1).float().to(self.device)) 32 | 33 | # x = F.relu(self.mlp_layer1(x)) 34 | # self.drop_out(x) 35 | # x = F.relu(self.mlp_layer2(x)) 36 | # self.drop_out(x) 37 | # x = F.relu(self.mlp_layer3(x)) 38 | # feat = x 39 | 40 | 41 | return self.oc_softmax(feat, labels, is_train) 42 | -------------------------------------------------------------------------------- /tools/dataset_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import collections 3 | import os 4 | import soundfile as sf 5 | import librosa 6 | from torch.utils.data import DataLoader, Dataset 7 | import numpy as np 8 | from joblib import Parallel, delayed 9 | 10 | LOGICAL_DATA_ROOT = './data/' 11 | 12 | ASVFile = collections.namedtuple('ASVFile', 13 | ['speaker_id', 'file_name', 'path', 'sys_id', 'key']) 14 | 15 | 16 | class ASVDataset(Dataset): 17 | """ Utility class to load train/dev datatsets """ 18 | 19 | def __init__(self, 20 | transform=None, 21 | is_train=True, 22 | is_eval=False, 23 | is_eval2021=False 24 | ): 25 | super(ASVDataset, self).__init__() 26 | 27 | data_root = LOGICAL_DATA_ROOT 28 | 29 | self.is_eval = is_eval 30 | self.is_eval_2021 = is_eval2021 31 | 32 | self.data_root = data_root 33 | 34 | self.dset_name = 'eval2021' if is_eval2021 else 'eval' if is_eval else 'train' if is_train else 'dev' 35 | 36 | self.protocols_fname = os.path.join(self.data_root, self.dset_name + '.protocol.txt') 37 | 38 | self.files_dir = os.path.join(self.data_root, '{}'.format(self.dset_name)) 39 | self.transform = transform 40 | 41 | self.files_meta = self.parse_protocols_file(self.protocols_fname) 42 | self.data_files = os.listdir(self.files_dir) 43 | 44 | def __len__(self): 45 | return len(self.data_files) 46 | 47 | def __getitem__(self, idx): 48 | idx = idx % len(self.files_meta) 49 | meta = self.files_meta[idx] 50 | data_x, data_y, _ = self.read_file(meta) 51 | data_x = self.transform(data_x) 52 | x = data_x 53 | y = data_y 54 | return x, y, self.files_meta[idx] 55 | 56 | def read_file(self, meta): 57 | data_x, sample_rate = sf.read(meta.path) 58 | data_y = meta.key 59 | return data_x, float(data_y), meta.sys_id 60 | 61 | def _parse_line(self, line): 62 | tokens = line.strip().split(' ') 63 | if self.is_eval_2021: 64 | return ASVFile(speaker_id='', 65 | file_name=tokens[0], 66 | path=os.path.join(self.files_dir, tokens[0] + '.flac'), 67 | sys_id=0, 68 | key=0) 69 | elif self.is_eval: 70 | return ASVFile(speaker_id='', 71 | file_name=tokens[1], 72 | path=os.path.join(self.files_dir, tokens[1] + '.flac'), 73 | sys_id=0, 74 | key=int(tokens[4] == 'bonafide')) 75 | return ASVFile(speaker_id=tokens[0], 76 | file_name=tokens[1], 77 | path=os.path.join(self.files_dir, tokens[1] + '.flac'), 78 | sys_id=0, 79 | key=int(tokens[4] == 'bonafide')) 80 | 81 | def parse_protocols_file(self, protocols_fname): 82 | lines = open(protocols_fname).readlines() 83 | files_meta = map(self._parse_line, lines) 84 | return list(files_meta) 85 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | from torch.utils.data import DataLoader 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from evaluate_tDCF_asvspoof19 import evaluate_tDCF_asvspoof19 8 | import evaluation_metrics as em 9 | import numpy as np 10 | from model import Model 11 | from loss import * 12 | import librosa 13 | import torchvision 14 | from torch import Tensor 15 | 16 | from tools.dataset_loader import ASVDataset 17 | 18 | 19 | def pad(x, max_len=64000): 20 | x_len = x.shape[0] 21 | if x_len >= max_len: 22 | return x[:max_len] 23 | # need to pad 24 | num_repeats = (max_len / x_len) + 1 25 | x_repeat = np.repeat(x, num_repeats) 26 | padded_x = x_repeat[:max_len] 27 | return padded_x 28 | 29 | 30 | def test_model(model_path, device, batch_size, eval_2021): 31 | transforms = torchvision.transforms.Compose([ 32 | lambda x: pad(x), 33 | lambda x: librosa.util.normalize(x), 34 | lambda x: Tensor(x) 35 | ]) 36 | 37 | model = Model(input_channels=1, num_classes=256, device=device).to(device) 38 | 39 | model.load_state_dict(torch.load(model_path, map_location="cuda")) 40 | 41 | test_set = ASVDataset(is_train=False, is_eval=True, is_eval2021=False, transform=transforms) 42 | test_set_2021 = ASVDataset(is_train=False, is_eval=True, is_eval2021=True, transform=transforms) 43 | test_data_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0) 44 | test_data_loader_2021 = DataLoader(test_set_2021, batch_size=batch_size, shuffle=False, num_workers=0) 45 | 46 | model.eval() 47 | if not eval_2021: 48 | with open('./scores/cm_score.txt', 'w') as cm_score_file: 49 | for batch_x, batch_y, batch_meta in test_data_loader: 50 | batch_x = batch_x.to(device) 51 | labels = batch_y.to(device) 52 | loss, score = model(batch_x, labels) 53 | 54 | for j in range(labels.size(0)): 55 | cm_score_file.write( 56 | '%s %s %s\n' % (batch_meta.file_name[j], 57 | 'bonafide' if labels[j] == float(1) else 'spoof', 58 | score[j].item())) 59 | 60 | evaluate_tDCF_asvspoof19(os.path.join('', './scores/cm_score.txt'), 61 | './scores/ASVspoof2019.LA.asv.eval.scores.txt', None) 62 | else: 63 | with open('./scores/cm_score_2021.txt', 'w') as cm_score_file_2021: 64 | for batch_x, batch_y, batch_meta in test_data_loader_2021: 65 | print('processing..', end="\r") 66 | batch_x = batch_x.to(device) 67 | 68 | labels = batch_y.to(device) 69 | 70 | loss, score = model(batch_x, labels) 71 | 72 | for j in range(labels.size(0)): 73 | cm_score_file_2021.write('%s %s\n' % (batch_meta.file_name[j], score[j].item())) 74 | 75 | 76 | return 77 | 78 | 79 | def test(model_path, device, batch_size, eval_2021): 80 | model_path = os.path.join(model_path) 81 | print(test_model(model_path, device, batch_size, eval_2021)) 82 | 83 | 84 | if __name__ == "__main__": 85 | parser = argparse.ArgumentParser(description=__doc__) 86 | parser.add_argument('-m', '--model-path', type=str, help="path to the trained model", default="./models/") 87 | parser.add_argument('-b', '--batch-size', type=int, help="batch size for test process", default=32) 88 | parser.add_argument('-e', '--eval-2021', type=bool, help="evaluate model over ASVspoof2021 data", default=False) 89 | parser.add_argument("--gpu", type=str, help="GPU index", default="0") 90 | args = parser.parse_args() 91 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 92 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 93 | test(args.model_path, device, args.batch_size, args.eval_2021) 94 | -------------------------------------------------------------------------------- /tools/audio_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import numpy as np 5 | import torch.nn.functional as F 6 | 7 | 8 | def trimf(x, params): 9 | """ 10 | trimf: similar to Matlab definition 11 | https://www.mathworks.com/help/fuzzy/trimf.html?s_tid=srchtitle 12 | 13 | """ 14 | if len(params) != 3: 15 | print("trimp requires params to be a list of 3 elements") 16 | sys.exit(1) 17 | a = params[0] 18 | b = params[1] 19 | c = params[2] 20 | if a > b or b > c: 21 | print("trimp(x, [a, b, c]) requires a<=b<=c") 22 | sys.exit(1) 23 | y = torch.zeros_like(x, dtype=torch.float32) 24 | if a < b: 25 | index = torch.logical_and(a < x, x < b) 26 | y[index] = (x[index] - a) / (b - a) 27 | if b < c: 28 | index = torch.logical_and(b < x, x < c) 29 | y[index] = (c - x[index]) / (c - b) 30 | y[x == b] = 1 31 | return y 32 | 33 | 34 | 35 | def dct1(x): 36 | """ 37 | Discrete Cosine Transform, Type I 38 | :param x: the input signal 39 | :return: the DCT-I of the signal over the last dimension 40 | """ 41 | x_shape = x.shape 42 | x = x.view(-1, x_shape[-1]) 43 | 44 | return torch.rfft( 45 | torch.cat([x, x.flip([1])[:, 1:-1]], dim=1), 1)[:, :, 0].view(*x_shape) 46 | 47 | 48 | def idct1(X): 49 | """ 50 | The inverse of DCT-I, which is just a scaled DCT-I 51 | Our definition if idct1 is such that idct1(dct1(x)) == x 52 | :param X: the input signal 53 | :return: the inverse DCT-I of the signal over the last dimension 54 | """ 55 | n = X.shape[-1] 56 | return dct1(X) / (2 * (n - 1)) 57 | 58 | 59 | def dct(x, norm=None): 60 | """ 61 | Discrete Cosine Transform, Type II (a.k.a. the DCT) 62 | For the meaning of the parameter `norm`, see: 63 | https://docs.scipy.org/doc/ scipy.fftpack.dct.html 64 | :param x: the input signal 65 | :param norm: the normalization, None or 'ortho' 66 | :return: the DCT-II of the signal over the last dimension 67 | """ 68 | x_shape = x.shape 69 | N = x_shape[-1] 70 | x = x.contiguous().view(-1, N) 71 | 72 | v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1) 73 | 74 | Vc = torch.rfft(v, 1, onesided=False) 75 | 76 | k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi/(2*N) 77 | W_r = torch.cos(k) 78 | W_i = torch.sin(k) 79 | 80 | V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i 81 | 82 | if norm == 'ortho': 83 | V[:, 0] /= np.sqrt(N) * 2 84 | V[:, 1:] /= np.sqrt(N / 2) * 2 85 | 86 | V = 2 * V.view(*x_shape) 87 | 88 | return V 89 | 90 | 91 | def idct(X, norm=None): 92 | """ 93 | The inverse to DCT-II, which is a scaled Discrete Cosine Transform, Type III 94 | Our definition of idct is that idct(dct(x)) == x 95 | For the meaning of the parameter `norm`, see: 96 | https://docs.scipy.org/doc/ scipy.fftpack.dct.html 97 | :param X: the input signal 98 | :param norm: the normalization, None or 'ortho' 99 | :return: the inverse DCT-II of the signal over the last dimension 100 | """ 101 | 102 | x_shape = X.shape 103 | N = x_shape[-1] 104 | 105 | X_v = X.contiguous().view(-1, x_shape[-1]) / 2 106 | 107 | if norm == 'ortho': 108 | X_v[:, 0] *= np.sqrt(N) * 2 109 | X_v[:, 1:] *= np.sqrt(N / 2) * 2 110 | 111 | k = torch.arange(x_shape[-1], dtype=X.dtype, 112 | device=X.device)[None, :]*np.pi/(2*N) 113 | W_r = torch.cos(k) 114 | W_i = torch.sin(k) 115 | 116 | V_t_r = X_v 117 | V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1) 118 | 119 | V_r = V_t_r * W_r - V_t_i * W_i 120 | V_i = V_t_r * W_i + V_t_i * W_r 121 | 122 | V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2) 123 | 124 | v = torch.irfft(V, 1, onesided=False) 125 | x = v.new_zeros(v.shape) 126 | x[:, ::2] += v[:, :N - (N // 2)] 127 | x[:, 1::2] += v.flip([1])[:, :N // 2] 128 | 129 | return x.view(*x_shape) 130 | 131 | 132 | def delta(x): 133 | """ By default 134 | input 135 | ----- 136 | x (batch, Length, dim) 137 | 138 | output 139 | ------ 140 | output (batch, Length, dim) 141 | 142 | Delta is calculated along Length dimension 143 | """ 144 | length = x.shape[1] 145 | output = torch.zeros_like(x) 146 | x_temp = F.pad(x.unsqueeze(1), (0, 0, 1, 1), 'replicate').squeeze(1) 147 | output = -1 * x_temp[:, 0:length] + x_temp[:, 2:] 148 | return output 149 | -------------------------------------------------------------------------------- /evaluate_tDCF_asvspoof19.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import evaluation_metrics as em 4 | import matplotlib.pyplot as plt 5 | 6 | def evaluate_tDCF_asvspoof19(cm_score_file, asv_score_file, legacy): 7 | 8 | # Fix tandem detection cost function (t-DCF) parameters 9 | if legacy: 10 | Pspoof = 0.05 11 | cost_model = { 12 | 'Pspoof': Pspoof, # Prior probability of a spoofing attack 13 | 'Ptar': (1 - Pspoof) * 0.99, # Prior probability of target speaker 14 | 'Pnon': (1 - Pspoof) * 0.01, # Prior probability of nontarget speaker 15 | 'Cmiss_asv': 1, # Cost of ASV system falsely rejecting target speaker 16 | 'Cfa_asv': 10, # Cost of ASV system falsely accepting nontarget speaker 17 | 'Cmiss_cm': 1, # Cost of CM system falsely rejecting target speaker 18 | 'Cfa_cm': 10, # Cost of CM system falsely accepting spoof 19 | } 20 | else: 21 | Pspoof = 0.05 22 | cost_model = { 23 | 'Pspoof': Pspoof, # Prior probability of a spoofing attack 24 | 'Ptar': (1 - Pspoof) * 0.99, # Prior probability of target speaker 25 | 'Pnon': (1 - Pspoof) * 0.01, # Prior probability of nontarget speaker 26 | 'Cmiss': 1, # Cost of tandem system falsely rejecting target speaker 27 | 'Cfa': 10, # Cost of tandem system falsely accepting nontarget speaker 28 | 'Cfa_spoof': 10, # Cost of tandem system falsely accepting spoof 29 | } 30 | 31 | # Load organizers' ASV scores 32 | asv_data = np.genfromtxt(asv_score_file, dtype=str) 33 | asv_keys = asv_data[:, 1] 34 | asv_scores = asv_data[:, 2].astype(np.float) 35 | 36 | # Load CM scores 37 | cm_data = np.genfromtxt(cm_score_file, dtype=str) 38 | cm_keys = cm_data[:, 1] 39 | cm_scores = cm_data[:, 2].astype(np.float) 40 | 41 | # Extract target, nontarget, and spoof scores from the ASV scores 42 | tar_asv = asv_scores[asv_keys == 'target'] 43 | non_asv = asv_scores[asv_keys == 'nontarget'] 44 | spoof_asv = asv_scores[asv_keys == 'spoof'] 45 | 46 | # Extract bona fide (real human) and spoof scores from the CM scores 47 | bona_cm = cm_scores[cm_keys == 'bonafide'] 48 | spoof_cm = cm_scores[cm_keys == 'spoof'] 49 | 50 | # EERs of the standalone systems and fix ASV operating point to EER threshold 51 | eer_asv, asv_threshold = em.compute_eer(tar_asv, non_asv) 52 | eer_cm = em.compute_eer(bona_cm, spoof_cm)[0] 53 | 54 | 55 | [Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, Pfa_spoof_asv] = em.obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold) 56 | 57 | 58 | # Compute t-DCF 59 | if legacy: 60 | tDCF_curve, CM_thresholds = em.compute_tDCF_legacy(bona_cm, spoof_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, True) 61 | else: 62 | tDCF_curve, CM_thresholds = em.compute_tDCF(bona_cm, spoof_cm, Pfa_asv, Pmiss_asv, Pfa_spoof_asv, cost_model, True) 63 | 64 | # Minimum t-DCF 65 | min_tDCF_index = np.argmin(tDCF_curve) 66 | min_tDCF = tDCF_curve[min_tDCF_index] 67 | min_tDCF_threshold = CM_thresholds[min_tDCF_index]; 68 | 69 | # compute DET of CM and get Pmiss and Pfa for the selected threshold t_CM 70 | Pmiss_cm, Pfa_cm, CM_thresholds = em.compute_det_curve(bona_cm, spoof_cm) 71 | Pmiss_t_CM = Pmiss_cm[CM_thresholds == min_tDCF_threshold] 72 | Pfa_t_CM = Pfa_cm[CM_thresholds == min_tDCF_threshold] 73 | 74 | 75 | print('ASV SYSTEM') 76 | print(' EER = {:8.5f} % (Equal error rate (target vs. nontarget discrimination)'.format(eer_asv * 100)) 77 | print(' Pfa = {:8.5f} % (False acceptance rate of nontargets)'.format(Pfa_asv * 100)) 78 | print(' Pmiss = {:8.5f} % (False rejection rate of targets)'.format(Pmiss_asv * 100)) 79 | if legacy: 80 | print(' 1-Pmiss,spoof = {:8.5f} % (Spoof false acceptance rate)'.format((1 - Pmiss_spoof_asv) * 100)) 81 | else: 82 | print(' Pfa,spoof = {:8.5f} % (Spoof false acceptance rate)'.format((1 - Pmiss_spoof_asv) * 100)) 83 | 84 | print('\nCM SYSTEM') 85 | print(' EER = {:8.5f} % (Equal error rate for countermeasure)'.format(eer_cm * 100)) 86 | print(' Pfa(t_CM_min_tDCF) = {:8.5f} % (False acceptance rate of spoofs)'.format(Pfa_t_CM[0] * 100)) 87 | print(' Pmiss(t_CM_min_tDCF) = {:8.5f} % (Miss (false rejection) rate of bonafide)'.format(Pmiss_t_CM[0] * 100)) 88 | 89 | print('\nTANDEM') 90 | print(' min-tDCF = {:8.5f}'.format(min_tDCF)) 91 | 92 | 93 | # Visualize ASV scores and CM scores 94 | plt.figure() 95 | ax = plt.subplot(121) 96 | plt.hist(tar_asv, histtype='step', density=True, bins=50, label='Target') 97 | plt.hist(non_asv, histtype='step', density=True, bins=50, label='Nontarget') 98 | plt.hist(spoof_asv, histtype='step', density=True, bins=50, label='Spoof') 99 | plt.plot(asv_threshold, 0, 'o', markersize=10, mfc='none', mew=2, clip_on=False, label='EER threshold') 100 | plt.legend() 101 | plt.xlabel('ASV score') 102 | plt.ylabel('Density') 103 | plt.title('ASV score histogram') 104 | 105 | ax = plt.subplot(122) 106 | plt.hist(bona_cm, histtype='step', density=True, bins=50, label='Bona fide') 107 | plt.hist(spoof_cm, histtype='step', density=True, bins=50, label='Spoof') 108 | plt.legend() 109 | plt.xlabel('CM score') 110 | #plt.ylabel('Density') 111 | plt.title('CM score histogram') 112 | 113 | 114 | # Plot t-DCF as function of the CM threshold. 115 | plt.figure() 116 | plt.plot(CM_thresholds, tDCF_curve) 117 | plt.plot(CM_thresholds[min_tDCF_index], min_tDCF, 'o', markersize=10, mfc='none', mew=2) 118 | plt.xlabel('CM threshold index (operating point)') 119 | plt.ylabel('Norm t-DCF'); 120 | plt.title('Normalized tandem t-DCF') 121 | plt.plot([np.min(CM_thresholds), np.max(CM_thresholds)], [1, 1], '--', color='black') 122 | plt.legend(('t-DCF', 'min t-DCF ({:.5f})'.format(min_tDCF), 'Arbitrarily bad CM (Norm t-DCF=1)')) 123 | plt.xlim([np.min(CM_thresholds), np.max(CM_thresholds)]) 124 | plt.ylim([0, 1.5]) 125 | 126 | plt.show() 127 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd.function import Function 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.nn import Parameter 7 | 8 | 9 | class OCAngleLayer(nn.Module): 10 | """ Output layer to produce activation for one-class softmax 11 | 12 | Usage example: 13 | batchsize = 64 14 | input_dim = 10 15 | class_num = 2 16 | 17 | l_layer = OCAngleLayer(input_dim) 18 | l_loss = OCSoftmaxWithLoss() 19 | 20 | data = torch.rand(batchsize, input_dim, requires_grad=True) 21 | target = (torch.rand(batchsize) * class_num).clamp(0, class_num-1) 22 | target = target.to(torch.long) 23 | 24 | scores = l_layer(data) 25 | loss = l_loss(scores, target) 26 | 27 | loss.backward() 28 | """ 29 | 30 | def __init__(self, in_planes, w_posi=0.9, w_nega=0.2, alpha=20.0): 31 | super(OCAngleLayer, self).__init__() 32 | self.in_planes = in_planes 33 | self.w_posi = w_posi 34 | self.w_nega = w_nega 35 | self.out_planes = 1 36 | 37 | self.weight = Parameter(torch.Tensor(in_planes, self.out_planes)) 38 | # self.weight.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) 39 | nn.init.kaiming_uniform_(self.weight, 0.25) 40 | self.weight.data.renorm_(2, 1, 1e-5).mul_(1e5) 41 | 42 | self.alpha = alpha 43 | 44 | def forward(self, input, flag_angle_only=False): 45 | """ 46 | Compute oc-softmax activations 47 | 48 | input: 49 | ------ 50 | input tensor (batchsize, input_dim) 51 | 52 | output: 53 | ------- 54 | tuple of tensor ((batchsize, output_dim), (batchsize, output_dim)) 55 | """ 56 | # w (feature_dim, output_dim) 57 | w = self.weight.renorm(2, 1, 1e-5).mul(1e5) 58 | # x_modulus (batchsize) 59 | # sum input -> x_modules in shape (batchsize) 60 | x_modulus = input.pow(2).sum(1).pow(0.5) 61 | # w_modules (output_dim) 62 | # w_moduls should be 1, since w has been normalized 63 | # w_modulus = w.pow(2).sum(0).pow(0.5) 64 | 65 | # W * x = ||W|| * ||x|| * cos()))))))) 66 | # inner_wx (batchsize, 1) 67 | inner_wx = input.mm(w) 68 | # cos_theta (batchsize, output_dim) 69 | cos_theta = inner_wx / x_modulus.view(-1, 1) 70 | cos_theta = cos_theta.clamp(-1, 1) 71 | 72 | if flag_angle_only: 73 | pos_score = cos_theta 74 | neg_score = cos_theta 75 | else: 76 | pos_score = self.alpha * (self.w_posi - cos_theta) 77 | neg_score = -1 * self.alpha * (self.w_nega - cos_theta) 78 | 79 | # 80 | return pos_score, neg_score 81 | 82 | 83 | class OCSoftmaxWithLoss(nn.Module): 84 | """ 85 | OCSoftmaxWithLoss() 86 | 87 | """ 88 | 89 | def __init__(self): 90 | super(OCSoftmaxWithLoss, self).__init__() 91 | self.m_loss = nn.Softplus() 92 | 93 | def forward(self, inputs, target): 94 | """ 95 | input: 96 | ------ 97 | input: tuple of tensors ((batchsie, out_dim), (batchsie, out_dim)) 98 | output from OCAngle 99 | inputs[0]: positive class score 100 | inputs[1]: negative class score 101 | target: tensor (batchsize) 102 | tensor of target index 103 | output: 104 | ------ 105 | loss: scalar 106 | """ 107 | # Assume target is binary, positive = 1, negaitve = 0 108 | # 109 | # Equivalent to select the scores using if-elese 110 | # if target = 1, use inputs[0] 111 | # else, use inputs[1] 112 | output = inputs[0] * target.view(-1, 1) + \ 113 | inputs[1] * (1 - target.view(-1, 1)) 114 | loss = self.m_loss(output).mean() 115 | 116 | return loss 117 | 118 | 119 | class OCSoftmax(nn.Module): 120 | def __init__(self, feat_dim=2, r_real=0.9, r_fake=0.5, alpha=20.0): 121 | super(OCSoftmax, self).__init__() 122 | self.feat_dim = feat_dim 123 | self.r_real = r_real 124 | self.r_fake = r_fake 125 | self.alpha = alpha 126 | self.center = nn.Parameter(torch.randn(1, self.feat_dim)) 127 | nn.init.kaiming_uniform_(self.center, 0.25) 128 | self.softplus = nn.Softplus() 129 | 130 | def forward(self, x, labels, is_train=True): 131 | """ 132 | Args: 133 | x: feature matrix with shape (batch_size, feat_dim). 134 | labels: ground truth labels with shape (batch_size). 135 | is_train: check if we are in in train mode. 136 | """ 137 | w = F.normalize(self.center, p=2, dim=1) 138 | x = F.normalize(x, p=2, dim=1) 139 | 140 | 141 | scores = x @ w.transpose(0,1) 142 | output_scores = scores.clone() 143 | 144 | if is_train: 145 | scores[labels == 0] = self.r_real - scores[labels == 0] 146 | scores[labels == 1] = scores[labels == 1] - self.r_fake 147 | 148 | loss = self.softplus(self.alpha * scores).mean() 149 | 150 | return loss, -output_scores.squeeze(1) 151 | 152 | 153 | class AMSoftmax(nn.Module): 154 | def __init__(self, num_classes, enc_dim, s=20, m=0.9): 155 | super(AMSoftmax, self).__init__() 156 | self.enc_dim = enc_dim 157 | self.num_classes = num_classes 158 | self.s = s 159 | self.m = m 160 | self.centers = nn.Parameter(torch.randn(num_classes, enc_dim)) 161 | 162 | def forward(self, feat, label): 163 | batch_size = feat.shape[0] 164 | norms = torch.norm(feat, p=2, dim=-1, keepdim=True) 165 | nfeat = torch.div(feat, norms) 166 | 167 | norms_c = torch.norm(self.centers, p=2, dim=-1, keepdim=True) 168 | ncenters = torch.div(self.centers, norms_c) 169 | logits = torch.matmul(nfeat, torch.transpose(ncenters, 0, 1)) 170 | 171 | y_onehot = torch.FloatTensor(batch_size, self.num_classes) 172 | y_onehot.zero_() 173 | y_onehot = Variable(y_onehot).cuda() 174 | y_onehot.scatter_(1, torch.unsqueeze(label, dim=-1), self.m) 175 | margin_logits = self.s * (logits - y_onehot) 176 | 177 | return logits, margin_logits 178 | -------------------------------------------------------------------------------- /feature_layers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import tools.audio_utils 4 | import librosa 5 | import numpy as np 6 | 7 | 8 | class CQT(nn.Module): 9 | def __init__(self, sampling_rate): 10 | super(CQT, self).__init__() 11 | self.sampling_rate = sampling_rate 12 | 13 | def forward(self, x, device): 14 | batch_size = x.shape[0] 15 | batch_output = torch.zeros(batch_size, 84, 126) 16 | batch_count = 0 17 | for item in x: 18 | numpy_item = item.numpy() 19 | item_cqt = librosa.cqt(numpy_item, sr=self.sampling_rate) 20 | item_cqt = librosa.amplitude_to_db(np.abs(item_cqt), ref=np.max) 21 | item_torch_cqt = torch.from_numpy(item_cqt).to(device) 22 | batch_output[batch_count] = item_torch_cqt 23 | batch_count += 1 24 | 25 | return batch_output.to(device) 26 | 27 | 28 | class Spectrogram(nn.Module): 29 | def __init__(self, n_fft): 30 | super(Spectrogram, self).__init__() 31 | self.n_fft = n_fft 32 | 33 | def forward(self, x, device): 34 | batch_size = x.shape[0] 35 | batch_output = torch.zeros(batch_size, 1025, 126) 36 | batch_count = 0 37 | for item in x: 38 | numpy_item = item.numpy() 39 | item_stft = librosa.stft(numpy_item, n_fft=self.n_fft) 40 | item_stft = librosa.amplitude_to_db(np.abs(item_stft), ref=np.max) 41 | item_torch_stft = torch.from_numpy(item_stft).to(device) 42 | batch_output[batch_count] = item_torch_stft 43 | batch_count += 1 44 | 45 | return batch_output.to(device) 46 | 47 | 48 | class LinearDCT(nn.Linear): 49 | """Implement any DCT as a linear layer; in practice this executes around 50 | 50x faster on GPU. Unfortunately, the DCT matrix is stored, which will 51 | increase memory usage. 52 | :param in_features: size of expected input 53 | :param type: which dct function in this file to use""" 54 | def __init__(self, in_features, type, norm=None, bias=False): 55 | self.type = type 56 | self.N = in_features 57 | self.norm = norm 58 | super(LinearDCT, self).__init__(in_features, in_features, bias=bias) 59 | 60 | def reset_parameters(self): 61 | # initialise using dct function 62 | I = torch.eye(self.N) 63 | if self.type == 'dct1': 64 | self.weight.data = tools.audio_utils.dct1(I).data.t() 65 | elif self.type == 'idct1': 66 | self.weight.data = tools.audio_utils.idct1(I).data.t() 67 | elif self.type == 'dct': 68 | self.weight.data = tools.audio_utils.dct(I, norm=self.norm).data.t() 69 | elif self.type == 'idct': 70 | self.weight.data = tools.audio_utils.idct(I, norm=self.norm).data.t() 71 | self.weight.requires_grad = False # don't learn this! 72 | 73 | 74 | class LFCC(nn.Module): 75 | """ Based on asvspoof.org baseline Matlab code. 76 | Difference: with_energy is added to set the first dimension as energy 77 | """ 78 | 79 | def __init__(self, fl, fs, fn, sr, filter_num, 80 | with_energy=False, with_emphasis=True, 81 | with_delta=True, flag_for_LFB=False): 82 | """ Initialize LFCC 83 | 84 | Para: 85 | ----- 86 | fl: int, frame length, (number of waveform points) 87 | fs: int, frame shift, (number of waveform points) 88 | fn: int, FFT points 89 | sr: int, sampling rate (Hz) 90 | filter_num: int, number of filters in filter-bank 91 | with_energy: bool, (default False), whether replace 1st dim to energy 92 | with_emphasis: bool, (default True), whether pre-emphaze input wav 93 | with_delta: bool, (default True), whether use delta and delta-delta 94 | 95 | for_LFB: bool (default False), reserved for LFB feature 96 | """ 97 | super(LFCC, self).__init__() 98 | self.fl = fl 99 | self.fs = fs 100 | self.fn = fn 101 | self.sr = sr 102 | self.filter_num = filter_num 103 | 104 | # build the triangle filter bank 105 | f = (sr / 2) * torch.linspace(0, 1, fn // 2 + 1) 106 | filter_bands = torch.linspace(min(f), max(f), filter_num + 2) 107 | 108 | filter_bank = torch.zeros([fn // 2 + 1, filter_num]) 109 | for idx in range(filter_num): 110 | filter_bank[:, idx] = tools.audio_utils.trimf( 111 | f, [filter_bands[idx], 112 | filter_bands[idx + 1], 113 | filter_bands[idx + 2]]) 114 | self.lfcc_fb = nn.Parameter(filter_bank, requires_grad=False) 115 | 116 | # DCT as a linear transformation layer 117 | self.l_dct = LinearDCT(filter_num, 'dct', norm='ortho') 118 | 119 | # opts 120 | self.with_energy = with_energy 121 | self.with_emphasis = with_emphasis 122 | self.with_delta = with_delta 123 | self.flag_for_LFB = flag_for_LFB 124 | return 125 | 126 | def forward(self, x): 127 | """ 128 | 129 | input: 130 | ------ 131 | x: tensor(batch, length), where length is waveform length 132 | 133 | output: 134 | ------- 135 | lfcc_output: tensor(batch, frame_num, dim_num) 136 | """ 137 | # pre-emphsis 138 | if self.with_emphasis: 139 | x[:, 1:] = x[:, 1:] - 0.97 * x[:, 0:-1] 140 | 141 | # STFT 142 | x_stft = torch.stft(x, self.fn, self.fs, self.fl, 143 | window=torch.hamming_window(self.fl).to(x.device), 144 | onesided=True, pad_mode="constant") 145 | # amplitude 146 | sp_amp = torch.norm(x_stft, 2, -1).pow(2).permute(0, 2, 1).contiguous() 147 | 148 | # filter bank 149 | fb_feature = torch.log10(torch.matmul(sp_amp, self.lfcc_fb) + 150 | torch.finfo(torch.float32).eps) 151 | 152 | # DCT (if necessary, remove DCT) 153 | lfcc = self.l_dct(fb_feature) if not self.flag_for_LFB else fb_feature 154 | 155 | # Add energy 156 | if self.with_energy: 157 | power_spec = sp_amp / self.fn 158 | energy = torch.log10(power_spec.sum(axis=2) + 159 | torch.finfo(torch.float32).eps) 160 | lfcc[:, :, 0] = energy 161 | 162 | # Add delta coefficients 163 | if self.with_delta: 164 | lfcc_delta = tools.audio_utils.delta(lfcc) 165 | lfcc_delta_delta = tools.audio_utils.delta(lfcc_delta) 166 | lfcc_output = torch.cat((lfcc, lfcc_delta, lfcc_delta_delta), 2) 167 | else: 168 | lfcc_output = lfcc 169 | 170 | # done 171 | return lfcc_output 172 | -------------------------------------------------------------------------------- /resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | import os 6 | import random 7 | import numpy as np 8 | 9 | 10 | # Adapted from https://github.com/joaomonteirof/e2e_antispoofing 11 | 12 | class SelfAttention(nn.Module): 13 | def __init__(self, hidden_size, mean_only=False): 14 | super(SelfAttention, self).__init__() 15 | 16 | self.hidden_size = hidden_size 17 | self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size),requires_grad=True) 18 | 19 | self.mean_only = mean_only 20 | 21 | init.kaiming_uniform_(self.att_weights) 22 | 23 | def forward(self, inputs): 24 | 25 | batch_size = inputs.size(0) 26 | weights = torch.bmm(inputs, self.att_weights.permute(1, 0).unsqueeze(0).repeat(batch_size, 1, 1)) 27 | 28 | if inputs.size(0)==1: 29 | attentions = F.softmax(torch.tanh(weights),dim=1) 30 | weighted = torch.mul(inputs, attentions.expand_as(inputs)) 31 | else: 32 | attentions = F.softmax(torch.tanh(weights.squeeze()),dim=1) 33 | weighted = torch.mul(inputs, attentions.unsqueeze(2).expand_as(inputs)) 34 | 35 | if self.mean_only: 36 | return weighted.sum(1) 37 | else: 38 | noise = 1e-5*torch.randn(weighted.size()) 39 | 40 | if inputs.is_cuda: 41 | noise = noise.to(inputs.device) 42 | avg_repr, std_repr = weighted.sum(1), (weighted+noise).std(1) 43 | 44 | representations = torch.cat((avg_repr,std_repr),1) 45 | 46 | return representations 47 | 48 | 49 | class PreActBlock(nn.Module): 50 | '''Pre-activation version of the BasicBlock.''' 51 | expansion = 1 52 | 53 | def __init__(self, in_planes, planes, stride, *args, **kwargs): 54 | super(PreActBlock, self).__init__() 55 | self.bn1 = nn.BatchNorm2d(in_planes) 56 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 57 | self.bn2 = nn.BatchNorm2d(planes) 58 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 59 | 60 | if stride != 1 or in_planes != self.expansion*planes: 61 | self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)) 62 | 63 | def forward(self, x): 64 | out = F.relu(self.bn1(x)) 65 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 66 | out = self.conv1(out) 67 | out = self.conv2(F.relu(self.bn2(out))) 68 | out += shortcut 69 | return out 70 | 71 | 72 | class PreActBottleneck(nn.Module): 73 | '''Pre-activation version of the original Bottleneck module.''' 74 | expansion = 4 75 | 76 | def __init__(self, in_planes, planes, stride, *args, **kwargs): 77 | super(PreActBottleneck, self).__init__() 78 | self.bn1 = nn.BatchNorm2d(in_planes) 79 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 80 | self.bn2 = nn.BatchNorm2d(planes) 81 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 82 | self.bn3 = nn.BatchNorm2d(planes) 83 | self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) 84 | 85 | if stride != 1 or in_planes != self.expansion*planes: 86 | self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)) 87 | 88 | def forward(self, x): 89 | out = F.relu(self.bn1(x)) 90 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 91 | out = self.conv1(out) 92 | out = self.conv2(F.relu(self.bn2(out))) 93 | out = self.conv3(F.relu(self.bn3(out))) 94 | out += shortcut 95 | return out 96 | 97 | 98 | def conv3x3(in_planes, out_planes, stride=1): 99 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 100 | 101 | 102 | def conv1x1(in_planes, out_planes, stride=1): 103 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 104 | 105 | 106 | RESNET_CONFIGS = {'18': [[2, 2, 2, 2], PreActBlock], 107 | '28': [[3, 4, 6, 3], PreActBlock], 108 | '34': [[3, 4, 6, 3], PreActBlock], 109 | '50': [[3, 4, 6, 3], PreActBottleneck], 110 | '101': [[3, 4, 23, 3], PreActBottleneck] 111 | } 112 | 113 | 114 | def setup_seed(random_seed, cudnn_deterministic=True): 115 | # initialization 116 | torch.manual_seed(random_seed) 117 | random.seed(random_seed) 118 | np.random.seed(random_seed) 119 | os.environ['PYTHONHASHSEED'] = str(random_seed) 120 | 121 | if torch.cuda.is_available(): 122 | torch.cuda.manual_seed_all(random_seed) 123 | torch.backends.cudnn.deterministic = cudnn_deterministic 124 | torch.backends.cudnn.benchmark = False 125 | 126 | 127 | class ResNet(nn.Module): 128 | def __init__(self, num_nodes, enc_dim, resnet_type='18', nclasses=2): 129 | self.in_planes = 16 130 | super(ResNet, self).__init__() 131 | 132 | layers, block = RESNET_CONFIGS[resnet_type] 133 | 134 | self._norm_layer = nn.BatchNorm2d 135 | 136 | self.conv1 = nn.Conv2d(1, 16, kernel_size=(9, 3), stride=(3, 1), padding=(1, 1), bias=False) 137 | self.bn1 = nn.BatchNorm2d(16) 138 | self.activation = nn.ReLU() 139 | 140 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1) 141 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 142 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 143 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 144 | 145 | self.conv5 = nn.Conv2d(512 * block.expansion, 256, kernel_size=(num_nodes, 3), stride=(1, 1), padding=(0, 1), 146 | bias=False) 147 | self.bn5 = nn.BatchNorm2d(256) 148 | self.fc = nn.Linear(256 * 2, enc_dim) 149 | self.fc_mu = nn.Linear(enc_dim, nclasses) if nclasses >= 2 else nn.Linear(enc_dim, 1) 150 | 151 | self.initialize_params() 152 | self.attention = SelfAttention(256) 153 | 154 | def initialize_params(self): 155 | for layer in self.modules(): 156 | if isinstance(layer, torch.nn.Conv2d): 157 | init.kaiming_normal_(layer.weight, a=0, mode='fan_out') 158 | elif isinstance(layer, torch.nn.Linear): 159 | init.kaiming_uniform_(layer.weight) 160 | elif isinstance(layer, torch.nn.BatchNorm2d) or isinstance(layer, torch.nn.BatchNorm1d): 161 | layer.weight.data.fill_(1) 162 | layer.bias.data.zero_() 163 | 164 | def _make_layer(self, block, planes, num_blocks, stride=1): 165 | norm_layer = self._norm_layer 166 | downsample = None 167 | if stride != 1 or self.in_planes != planes * block.expansion: 168 | downsample = nn.Sequential(conv1x1(self.in_planes, planes * block.expansion, stride), 169 | norm_layer(planes * block.expansion)) 170 | layers = [] 171 | layers.append(block(self.in_planes, planes, stride, downsample, 1, 64, 1, norm_layer)) 172 | self.in_planes = planes * block.expansion 173 | for _ in range(1, num_blocks): 174 | layers.append( 175 | block(self.in_planes, planes, 1, groups=1, base_width=64, dilation=False, norm_layer=norm_layer)) 176 | 177 | return nn.Sequential(*layers) 178 | 179 | def forward(self, x): 180 | x = self.conv1(x) 181 | x = self.activation(self.bn1(x)) 182 | x = self.layer1(x) 183 | x = self.layer2(x) 184 | x = self.layer3(x) 185 | x = self.layer4(x) 186 | x = self.conv5(x) 187 | x = self.activation(self.bn5(x)).reshape(x.shape[0], x.shape[1], -1) 188 | stats = self.attention(x.permute(0, 2, 1).contiguous()) 189 | 190 | feat = self.fc(stats) 191 | 192 | mu = self.fc_mu(feat) 193 | 194 | return feat, mu 195 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import os 4 | import time 5 | import numpy as np 6 | import json 7 | 8 | from torch import Tensor 9 | import torchvision 10 | import librosa 11 | from model import Model 12 | from torch.utils.data import DataLoader 13 | from tensorboardX import SummaryWriter 14 | import tools.dataset_loader as dataset_loader 15 | from sklearn.model_selection import KFold 16 | 17 | from resnet import setup_seed 18 | from collections import defaultdict 19 | from loss import * 20 | import evaluation_metrics as em 21 | 22 | 23 | class Color: 24 | HEADER = '\033[95m' 25 | OKBLUE = '\033[94m' 26 | OKCYAN = '\033[96m' 27 | OKGREEN = '\033[92m' 28 | WARNING = '\033[93m' 29 | FAIL = '\033[91m' 30 | ENDC = '\033[0m' 31 | BOLD = '\033[1m' 32 | UNDERLINE = '\033[4m' 33 | 34 | 35 | def add_parser(parser): 36 | parser.add_argument("--feat_len", type=int, help="features length", default=750) 37 | parser.add_argument("--enc_dim", type=int, help="encoding dimension", default=256) 38 | 39 | 40 | parser.add_argument('--num-epochs', type=int, default=100, help="Number of epochs for training") 41 | parser.add_argument('--batch-size', type=int, default=4, help="Mini batch size for training") 42 | parser.add_argument('--epoch', type=int, default=0, help="current epoch number") 43 | parser.add_argument('--lr', type=float, default=0.0003, help="learning rate") 44 | parser.add_argument('--lr-decay', type=float, default=0.5, help="decay learning rate") 45 | parser.add_argument('--interval', type=int, default=10, help="interval to decay lr") 46 | 47 | parser.add_argument('--beta-1', type=float, default=0.9, help="bata_1 for Adam") 48 | parser.add_argument('--beta-2', type=float, default=0.999, help="beta_2 for Adam") 49 | parser.add_argument('--eps', type=float, default=1e-8, help="epsilon for Adam") 50 | parser.add_argument("--gpu", type=str, help="GPU index", default="1") 51 | parser.add_argument('--num-workers', type=int, default=0, help="number of workers") 52 | parser.add_argument('--seed', type=int, help="random number seed", default=598) 53 | 54 | parser.add_argument('--r-real', type=float, default=0.9, help="r_real for ocsoftmax") 55 | parser.add_argument('--r-fake', type=float, default=0.2, help="r_fake for ocsoftmax") 56 | parser.add_argument('--alpha', type=float, default=20, help="scale factor for ocsoftmax") 57 | 58 | parser.add_argument('--model-path', type=str, help="saved model path") 59 | args = parser.parse_args() 60 | 61 | # Change this to specify GPU 62 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 63 | 64 | # Set seeds 65 | setup_seed(args.seed) 66 | 67 | # assign device 68 | args.cuda = torch.cuda.is_available() 69 | print('Cuda device available: ', args.cuda) 70 | args.device = torch.device("cuda" if args.cuda else "cpu") 71 | 72 | return args 73 | 74 | 75 | def adjust_learning_rate(args, optimizer, epoch_num): 76 | lr = args.lr * (args.lr_decay ** (epoch_num // args.interval)) 77 | for param_group in optimizer.param_groups: 78 | param_group['lr'] = lr 79 | 80 | 81 | def pad(x, max_len=64000): 82 | x_len = x.shape[0] 83 | if x_len >= max_len: 84 | return x[:max_len] 85 | # need to pad 86 | num_repeats = (max_len / x_len) + 1 87 | x_repeat = np.repeat(x, num_repeats) 88 | padded_x = x_repeat[:max_len] 89 | return padded_x 90 | 91 | 92 | def split_dataset_to_train_and_val(k_fold, train_set, batch_size): 93 | for fold, (train_ids, test_ids) in enumerate(k_fold.split(train_set)): 94 | # Sample elements randomly from a given list of ids, no replacement. 95 | train_sub_sampler = torch.utils.data.SubsetRandomSampler(train_ids) 96 | test_sub_sampler = torch.utils.data.SubsetRandomSampler(test_ids) 97 | 98 | # Define data loaders for training and testing data in this fold 99 | train_loader_part = torch.utils.data.DataLoader( 100 | train_set, 101 | batch_size=batch_size, sampler=train_sub_sampler) 102 | validation_loader_part = torch.utils.data.DataLoader( 103 | train_set, 104 | batch_size=batch_size, sampler=test_sub_sampler) 105 | break 106 | return train_loader_part, validation_loader_part 107 | 108 | 109 | def train(parser, device): 110 | print(f'{Color.OKGREEN}Loading train dataset...{Color.ENDC}') 111 | args = parser.parse_args() 112 | model = Model(input_channels=1, num_classes=256, device=device) 113 | 114 | transforms = torchvision.transforms.Compose([ 115 | lambda x: pad(x), 116 | lambda x: librosa.util.normalize(x), 117 | lambda x: Tensor(x), 118 | ]) 119 | 120 | k_fold = KFold(n_splits=5, shuffle=True) 121 | 122 | if args.model_path: 123 | model.load_state_dict(torch.load(args.model_path)) 124 | print('Model loaded : {}'.format(args.model_path)) 125 | 126 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, 127 | betas=(args.beta_1, args.beta_2), eps=args.eps, weight_decay=0.0005) 128 | 129 | 130 | train_set = dataset_loader.ASVDataset(is_train=True, transform=transforms) 131 | # dev_set = dataset_loader.ASVDataset(is_train=False, transform=transforms) 132 | 133 | monitor_loss = 'loss' 134 | 135 | print(f'{Color.ENDC}Train Start...') 136 | 137 | train_loader, validation_loader = split_dataset_to_train_and_val(k_fold, train_set, batch_size=args.batch_size) 138 | model.train() 139 | 140 | # for epoch in range(checkpoint_epoch, number_of_epochs): 141 | for epoch in range(args.epoch, args.num_epochs): 142 | start = time.time() 143 | 144 | print(f'{Color.OKBLUE}Epoch:{epoch}{Color.ENDC}') 145 | model.train() 146 | train_loss_dict = defaultdict(list) 147 | dev_loss_dict = defaultdict(list) 148 | 149 | adjust_learning_rate(args, optimizer, epoch) 150 | 151 | for batch_x, batch_y, batch_meta in train_loader: 152 | batch_x = batch_x.to(device) 153 | batch_y = batch_y.view(-1).type(torch.int64).to(device) 154 | 155 | labels = batch_y.to(device) 156 | loss, score = model(batch_x, labels) 157 | train_loss_dict[monitor_loss].append(loss.item()) 158 | 159 | optimizer.zero_grad() 160 | loss.backward() 161 | optimizer.step() 162 | 163 | with open(os.path.join('./log/', 'train_loss.log'), 'a') as log: 164 | # log.write(str(fold) + "\t" + str(epoch) + "\t" + 165 | log.write(str(epoch) + "\t" + 166 | str(np.nanmean(train_loss_dict[monitor_loss])) + "\n") 167 | 168 | end = time.time() 169 | hours, rem = divmod(end - start, 3600) 170 | minutes, seconds = divmod(rem, 60) 171 | print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) 172 | print('start validation phase...') 173 | 174 | # Val the model 175 | model.eval() 176 | with torch.no_grad(): 177 | idx_loader, score_loader = [], [] 178 | for i, (batch_x, batch_y, batch_meta) in enumerate(validation_loader): 179 | labels = batch_y.to(device) 180 | loss, score = model(batch_x, labels, False) 181 | 182 | dev_loss_dict['loss'].append(loss.item()) 183 | idx_loader.append(labels) 184 | score_loader.append(score) 185 | 186 | scores = torch.cat(score_loader, 0).data.cpu().numpy() 187 | labels = torch.cat(idx_loader, 0).data.cpu().numpy() 188 | val_eer = em.compute_eer(scores[labels == 0], scores[labels == 1])[0] 189 | other_val_eer = em.compute_eer(-scores[labels == 0], -scores[labels == 1])[0] 190 | val_eer = min(val_eer, other_val_eer) 191 | 192 | with open(os.path.join('./log/', "dev_loss.log"), "a") as log: 193 | log.write(str(epoch) + "\t" + str( 194 | np.nanmean(dev_loss_dict[monitor_loss])) + "\t" + str( 195 | val_eer) + "\n") 196 | print("Val EER: {}".format(val_eer)) 197 | 198 | torch.save(model.state_dict(), os.path.join('./models/', 'model_%d.pt' % (epoch + 1))) 199 | end = time.time() 200 | hours, rem = divmod(end - start, 3600) 201 | minutes, seconds = divmod(rem, 60) 202 | print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) 203 | 204 | 205 | def main(): 206 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 207 | parser = argparse.ArgumentParser('ASVSpoof2021') 208 | add_parser(parser) 209 | train(parser, device) 210 | 211 | 212 | if __name__ == '__main__': 213 | main() 214 | -------------------------------------------------------------------------------- /evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | def obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold): 5 | 6 | # False alarm and miss rates for ASV 7 | Pfa_asv = sum(non_asv >= asv_threshold) / non_asv.size 8 | Pmiss_asv = sum(tar_asv < asv_threshold) / tar_asv.size 9 | 10 | # Rate of rejecting spoofs in ASV 11 | if spoof_asv.size == 0: 12 | Pmiss_spoof_asv = None 13 | Pfa_spoof_asv = None 14 | else: 15 | Pmiss_spoof_asv = np.sum(spoof_asv < asv_threshold) / spoof_asv.size 16 | Pfa_spoof_asv = np.sum(spoof_asv >= asv_threshold) / spoof_asv.size 17 | 18 | return Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, Pfa_spoof_asv 19 | 20 | 21 | def compute_det_curve(target_scores, nontarget_scores): 22 | 23 | n_scores = target_scores.size + nontarget_scores.size 24 | all_scores = np.concatenate((target_scores, nontarget_scores)) 25 | labels = np.concatenate((np.ones(target_scores.size), np.zeros(nontarget_scores.size))) 26 | 27 | # Sort labels based on scores 28 | indices = np.argsort(all_scores, kind='mergesort') 29 | labels = labels[indices] 30 | 31 | # Compute false rejection and false acceptance rates 32 | tar_trial_sums = np.cumsum(labels) 33 | nontarget_trial_sums = nontarget_scores.size - (np.arange(1, n_scores + 1) - tar_trial_sums) 34 | 35 | frr = np.concatenate((np.atleast_1d(0), tar_trial_sums / target_scores.size)) # false rejection rates 36 | far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size)) # false acceptance rates 37 | thresholds = np.concatenate((np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices])) # Thresholds are the sorted scores 38 | 39 | return frr, far, thresholds 40 | 41 | 42 | def compute_eer(target_scores, nontarget_scores): 43 | """ Returns equal error rate (EER) and the corresponding threshold. """ 44 | frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores) 45 | abs_diffs = np.abs(frr - far) 46 | min_index = np.argmin(abs_diffs) 47 | eer = np.mean((frr[min_index], far[min_index])) 48 | return eer, thresholds[min_index] 49 | 50 | 51 | def compute_tDCF(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pfa_spoof_asv, cost_model, print_cost): 52 | """ 53 | Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system. 54 | In brief, t-DCF returns a detection cost of a cascaded system of this form, 55 | 56 | Speech waveform -> [CM] -> [ASV] -> decision 57 | 58 | where CM stands for countermeasure and ASV for automatic speaker 59 | verification. The CM is therefore used as a 'gate' to decided whether or 60 | not the input speech sample should be passed onwards to the ASV system. 61 | Generally, both CM and ASV can do detection errors. Not all those errors 62 | are necessarily equally cost, and not all types of users are necessarily 63 | equally likely. The tandem t-DCF gives a principled with to compare 64 | different spoofing countermeasures under a detection cost function 65 | framework that takes that information into account. 66 | 67 | INPUTS: 68 | 69 | bonafide_score_cm A vector of POSITIVE CLASS (bona fide or human) 70 | detection scores obtained by executing a spoofing 71 | countermeasure (CM) on some positive evaluation trials. 72 | trial represents a bona fide case. 73 | spoof_score_cm A vector of NEGATIVE CLASS (spoofing attack) 74 | detection scores obtained by executing a spoofing 75 | CM on some negative evaluation trials. 76 | Pfa_asv False alarm (false acceptance) rate of the ASV 77 | system that is evaluated in tandem with the CM. 78 | Assumed to be in fractions, not percentages. 79 | Pmiss_asv Miss (false rejection) rate of the ASV system that 80 | is evaluated in tandem with the spoofing CM. 81 | Assumed to be in fractions, not percentages. 82 | Pmiss_spoof_asv Miss rate of spoof samples of the ASV system that 83 | is evaluated in tandem with the spoofing CM. That 84 | is, the fraction of spoof samples that were 85 | rejected by the ASV system. 86 | cost_model A struct that contains the parameters of t-DCF, 87 | with the following fields. 88 | 89 | Ptar Prior probability of target speaker. 90 | Pnon Prior probability of nontarget speaker (zero-effort impostor) 91 | Psoof Prior probability of spoofing attack. 92 | Cmiss Cost of tandem system falsely rejecting target speaker. 93 | Cfa Cost of tandem system falsely accepting nontarget speaker. 94 | Cfa_spoof Cost of tandem system falsely accepting spoof. 95 | 96 | print_cost Print a summary of the cost parameters and the 97 | implied t-DCF cost function? 98 | 99 | OUTPUTS: 100 | 101 | tDCF_norm Normalized t-DCF curve across the different CM 102 | system operating points; see [2] for more details. 103 | Normalized t-DCF > 1 indicates a useless 104 | countermeasure (as the tandem system would do 105 | better without it). min(tDCF_norm) will be the 106 | minimum t-DCF used in ASVspoof 2019 [2]. 107 | CM_thresholds Vector of same size as tDCF_norm corresponding to 108 | the CM threshold (operating point). 109 | 110 | NOTE: 111 | o In relative terms, higher detection scores values are assumed to 112 | indicate stronger support for the bona fide hypothesis. 113 | o You should provide real-valued soft scores, NOT hard decisions. The 114 | recommendation is that the scores are log-likelihood ratios (LLRs) 115 | from a bonafide-vs-spoof hypothesis based on some statistical model. 116 | This, however, is NOT required. The scores can have arbitrary range 117 | and scaling. 118 | o Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages. 119 | 120 | References: 121 | 122 | [1] T. Kinnunen, H. Delgado, N. Evans,K.-A. Lee, V. Vestman, 123 | A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi, 124 | and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures 125 | and Automatic Speaker Verification: Fundamentals," IEEE/ACM Transaction on 126 | Audio, Speech and Language Processing (TASLP). 127 | 128 | [2] ASVspoof 2019 challenge evaluation plan 129 | https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf 130 | """ 131 | 132 | 133 | # Sanity check of cost parameters 134 | if cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0 or \ 135 | cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0: 136 | print('WARNING: Usually the cost values should be positive!') 137 | 138 | if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \ 139 | np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10: 140 | sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.') 141 | 142 | # Unless we evaluate worst-case model, we need to have some spoof tests against asv 143 | if Pfa_spoof_asv is None: 144 | sys.exit('ERROR: you should provide false alarm rate of spoof tests against your ASV system.') 145 | 146 | # Sanity check of scores 147 | combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm)) 148 | if np.isnan(combined_scores).any() or np.isinf(combined_scores).any(): 149 | sys.exit('ERROR: Your scores contain nan or inf.') 150 | 151 | # Sanity check that inputs are scores and not decisions 152 | n_uniq = np.unique(combined_scores).size 153 | if n_uniq < 3: 154 | sys.exit('ERROR: You should provide soft CM scores - not binary decisions') 155 | 156 | # Obtain miss and false alarm rates of CM 157 | Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm) 158 | 159 | # Constants - see ASVspoof 2019 evaluation plan 160 | 161 | C0 = cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon']*cost_model['Cfa']*Pfa_asv 162 | C1 = cost_model['Ptar'] * cost_model['Cmiss'] - (cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon'] * cost_model['Cfa'] * Pfa_asv) 163 | C2 = cost_model['Pspoof'] * cost_model['Cfa_spoof'] * Pfa_spoof_asv; 164 | 165 | 166 | # Sanity check of the weights 167 | if C0 < 0 or C1 < 0 or C2 < 0: 168 | sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?') 169 | 170 | # Obtain t-DCF curve for all thresholds 171 | tDCF = C0 + C1 * Pmiss_cm + C2 * Pfa_cm 172 | 173 | # Obtain default t-DCF 174 | tDCF_default = C0 + np.minimum(C1, C2) 175 | 176 | # Normalized t-DCF 177 | tDCF_norm = tDCF / tDCF_default 178 | 179 | # Everything should be fine if reaching here. 180 | if print_cost: 181 | 182 | print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size)) 183 | print('t-DCF MODEL') 184 | print(' Ptar = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar'])) 185 | print(' Pnon = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon'])) 186 | print(' Pspoof = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof'])) 187 | print(' Cfa = {:8.5f} (Cost of tandem system falsely accepting a nontarget)'.format(cost_model['Cfa'])) 188 | print(' Cmiss = {:8.5f} (Cost of tandem system falsely rejecting target speaker)'.format(cost_model['Cmiss'])) 189 | print(' Cfa_spoof = {:8.5f} (Cost of tandem sysmte falsely accepting spoof)'.format(cost_model['Cfa_spoof'])) 190 | print('\n Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), t_CM=CM threshold)') 191 | print(' tDCF_norm(t_CM) = {:8.5f} + {:8.5f} x Pmiss_cm(t_CM) + {:8.5f} x Pfa_cm(t_CM)\n'.format(C0/tDCF_default, C1/tDCF_default, C2/tDCF_default)) 192 | print(' * The optimum value is given by the first term (0.06273). This is the normalized t-DCF obtained with an error-free CM system.') 193 | print(' * The minimum normalized cost (minimum over all possible thresholds) is always <= 1.00.') 194 | print('') 195 | 196 | return tDCF_norm, CM_thresholds 197 | 198 | def compute_tDCF_legacy(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, print_cost): 199 | """ 200 | Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system. 201 | In brief, t-DCF returns a detection cost of a cascaded system of this form, 202 | 203 | Speech waveform -> [CM] -> [ASV] -> decision 204 | 205 | where CM stands for countermeasure and ASV for automatic speaker 206 | verification. The CM is therefore used as a 'gate' to decided whether or 207 | not the input speech sample should be passed onwards to the ASV system. 208 | Generally, both CM and ASV can do detection errors. Not all those errors 209 | are necessarily equally cost, and not all types of users are necessarily 210 | equally likely. The tandem t-DCF gives a principled with to compare 211 | different spoofing countermeasures under a detection cost function 212 | framework that takes that information into account. 213 | 214 | INPUTS: 215 | 216 | bonafide_score_cm A vector of POSITIVE CLASS (bona fide or human) 217 | detection scores obtained by executing a spoofing 218 | countermeasure (CM) on some positive evaluation trials. 219 | trial represents a bona fide case. 220 | spoof_score_cm A vector of NEGATIVE CLASS (spoofing attack) 221 | detection scores obtained by executing a spoofing 222 | CM on some negative evaluation trials. 223 | Pfa_asv False alarm (false acceptance) rate of the ASV 224 | system that is evaluated in tandem with the CM. 225 | Assumed to be in fractions, not percentages. 226 | Pmiss_asv Miss (false rejection) rate of the ASV system that 227 | is evaluated in tandem with the spoofing CM. 228 | Assumed to be in fractions, not percentages. 229 | Pmiss_spoof_asv Miss rate of spoof samples of the ASV system that 230 | is evaluated in tandem with the spoofing CM. That 231 | is, the fraction of spoof samples that were 232 | rejected by the ASV system. 233 | cost_model A struct that contains the parameters of t-DCF, 234 | with the following fields. 235 | 236 | Ptar Prior probability of target speaker. 237 | Pnon Prior probability of nontarget speaker (zero-effort impostor) 238 | Psoof Prior probability of spoofing attack. 239 | Cmiss_asv Cost of ASV falsely rejecting target. 240 | Cfa_asv Cost of ASV falsely accepting nontarget. 241 | Cmiss_cm Cost of CM falsely rejecting target. 242 | Cfa_cm Cost of CM falsely accepting spoof. 243 | 244 | print_cost Print a summary of the cost parameters and the 245 | implied t-DCF cost function? 246 | 247 | OUTPUTS: 248 | 249 | tDCF_norm Normalized t-DCF curve across the different CM 250 | system operating points; see [2] for more details. 251 | Normalized t-DCF > 1 indicates a useless 252 | countermeasure (as the tandem system would do 253 | better without it). min(tDCF_norm) will be the 254 | minimum t-DCF used in ASVspoof 2019 [2]. 255 | CM_thresholds Vector of same size as tDCF_norm corresponding to 256 | the CM threshold (operating point). 257 | 258 | NOTE: 259 | o In relative terms, higher detection scores values are assumed to 260 | indicate stronger support for the bona fide hypothesis. 261 | o You should provide real-valued soft scores, NOT hard decisions. The 262 | recommendation is that the scores are log-likelihood ratios (LLRs) 263 | from a bonafide-vs-spoof hypothesis based on some statistical model. 264 | This, however, is NOT required. The scores can have arbitrary range 265 | and scaling. 266 | o Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages. 267 | 268 | References: 269 | 270 | [1] T. Kinnunen, K.-A. Lee, H. Delgado, N. Evans, M. Todisco, 271 | M. Sahidullah, J. Yamagishi, D.A. Reynolds: "t-DCF: a Detection 272 | Cost Function for the Tandem Assessment of Spoofing Countermeasures 273 | and Automatic Speaker Verification", Proc. Odyssey 2018: the 274 | Speaker and Language Recognition Workshop, pp. 312--319, Les Sables d'Olonne, 275 | France, June 2018 (https://www.isca-speech.org/archive/Odyssey_2018/pdfs/68.pdf) 276 | 277 | [2] ASVspoof 2019 challenge evaluation plan 278 | https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf 279 | """ 280 | 281 | 282 | # Sanity check of cost parameters 283 | if cost_model['Cfa_asv'] < 0 or cost_model['Cmiss_asv'] < 0 or \ 284 | cost_model['Cfa_cm'] < 0 or cost_model['Cmiss_cm'] < 0: 285 | print('WARNING: Usually the cost values should be positive!') 286 | 287 | if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \ 288 | np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10: 289 | sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.') 290 | 291 | # Unless we evaluate worst-case model, we need to have some spoof tests against asv 292 | if Pmiss_spoof_asv is None: 293 | sys.exit('ERROR: you should provide miss rate of spoof tests against your ASV system.') 294 | 295 | # Sanity check of scores 296 | combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm)) 297 | if np.isnan(combined_scores).any() or np.isinf(combined_scores).any(): 298 | sys.exit('ERROR: Your scores contain nan or inf.') 299 | 300 | # Sanity check that inputs are scores and not decisions 301 | n_uniq = np.unique(combined_scores).size 302 | if n_uniq < 3: 303 | sys.exit('ERROR: You should provide soft CM scores - not binary decisions') 304 | 305 | # Obtain miss and false alarm rates of CM 306 | Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm) 307 | 308 | # Constants - see ASVspoof 2019 evaluation plan 309 | C1 = cost_model['Ptar'] * (cost_model['Cmiss_cm'] - cost_model['Cmiss_asv'] * Pmiss_asv) - \ 310 | cost_model['Pnon'] * cost_model['Cfa_asv'] * Pfa_asv 311 | C2 = cost_model['Cfa_cm'] * cost_model['Pspoof'] * (1 - Pmiss_spoof_asv) 312 | 313 | # Sanity check of the weights 314 | if C1 < 0 or C2 < 0: 315 | sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?') 316 | 317 | # Obtain t-DCF curve for all thresholds 318 | tDCF = C1 * Pmiss_cm + C2 * Pfa_cm 319 | 320 | # Normalized t-DCF 321 | tDCF_norm = tDCF / np.minimum(C1, C2) 322 | 323 | # Everything should be fine if reaching here. 324 | if print_cost: 325 | 326 | print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size)) 327 | print('t-DCF MODEL') 328 | print(' Ptar = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar'])) 329 | print(' Pnon = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon'])) 330 | print(' Pspoof = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof'])) 331 | print(' Cfa_asv = {:8.5f} (Cost of ASV falsely accepting a nontarget)'.format(cost_model['Cfa_asv'])) 332 | print(' Cmiss_asv = {:8.5f} (Cost of ASV falsely rejecting target speaker)'.format(cost_model['Cmiss_asv'])) 333 | print(' Cfa_cm = {:8.5f} (Cost of CM falsely passing a spoof to ASV system)'.format(cost_model['Cfa_cm'])) 334 | print(' Cmiss_cm = {:8.5f} (Cost of CM falsely blocking target utterance which never reaches ASV)'.format(cost_model['Cmiss_cm'])) 335 | print('\n Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), s=CM threshold)') 336 | 337 | if C2 == np.minimum(C1, C2): 338 | print(' tDCF_norm(s) = {:8.5f} x Pmiss_cm(s) + Pfa_cm(s)\n'.format(C1 / C2)) 339 | else: 340 | print(' tDCF_norm(s) = Pmiss_cm(s) + {:8.5f} x Pfa_cm(s)\n'.format(C2 / C1)) 341 | 342 | return tDCF_norm, CM_thresholds 343 | --------------------------------------------------------------------------------