├── log
    └── nothing
├── data
    ├── dev
    │   └── empty
    ├── eval
    │   └── empty
    ├── train
    │   └── empty
    └── eval2021
    │   └── empty
├── models
    └── nothing
├── scores
    └── nothing
├── .gitignore
├── env.yml
├── README.md
├── model.py
├── tools
    ├── dataset_loader.py
    └── audio_utils.py
├── test.py
├── evaluate_tDCF_asvspoof19.py
├── loss.py
├── feature_layers.py
├── resnet.py
├── main.py
└── evaluation_metrics.py


/log/nothing:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/dev/empty:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/eval/empty:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/train/empty:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/nothing:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scores/nothing:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/eval2021/empty:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ./data/train/*.flac
2 | ./data/eval/*.flac
3 | ./data/eval2021/*.flac
4 | ./oc-rep
5 | ./cqcc
6 | ./.idea


--------------------------------------------------------------------------------
/env.yml:
--------------------------------------------------------------------------------
 1 | name: pytorch-1.6
 2 | channels:
 3 |   - defaults
 4 |   - pytorch
 5 |   - conda-forge
 6 |   - anaconda
 7 | dependencies:
 8 |   - python=3.8
 9 |   - pytorch::pytorch=1.6
10 |   - cudatoolkit=9.2
11 |   - pytorch::torchvision=0.7.0
12 |   - pytorch::torchaudio=0.6.0
13 |   - scipy=1.4.1
14 |   - numpy=1.18.1
15 |   - conda-forge::libsndfile=1.0.31
16 |   - conda-forge::pysoundfile
17 |   - numba=0.48.0
18 |   - librosa=0.8.0
19 |   - mir_eval=0.6


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # voice-spoof-detection-system
2 | This is the implementation of our work titled "A Countermeasure Based on CQT Spectrogram for Deepfake Speech Detection", which was presented at the 7th International Conference on Signal Processing and Intelligent Systems (ICSPIS), Dec 2021. 
3 | 
4 | We are using CQT spectrogram as input and a ResNet-18 with self-attention for feature extraction.
5 | For a better discrimination of genuine samples from fake ones we use One Class Softmax.
6 | 
7 | Some parts of the code are borrowed from https://github.com/yzyouzhang/AIR-ASVspoof and https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts.
8 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
 1 | from resnet import *
 2 | from loss import *
 3 | import nnAudio.Spectrogram as torch_spec
 4 | from torchaudio import transforms
 5 | 
 6 | 
 7 | class Model(nn.Module):
 8 |     def __init__(self, input_channels, num_classes, device):
 9 |         super(Model, self).__init__()
10 | 
11 |         self.device = device
12 |         self.cqt = torch_spec.CQT(output_format='Complex', sr=16000).to(device)
13 |         self.amp_to_db = transforms.AmplitudeToDB()
14 |         self.resnet = ResNet(3, 256, resnet_type='18', nclasses=256).to(device)
15 | 
16 |         self.mlp_layer1 = nn.Linear(num_classes, 256).to(device)
17 |         self.mlp_layer2 = nn.Linear(256, 256).to(device)
18 |         self.mlp_layer3 = nn.Linear(256, 256).to(device)
19 |         self.drop_out = nn.Dropout(0.5)
20 | 
21 |         self.oc_softmax = OCSoftmax(256).to(device)
22 | 
23 |     def forward(self, x, labels, is_train=True):
24 |         x = x.to(self.device)
25 | 
26 |         x = self.cqt(x)
27 |         x = torch.pow(x[:, :, :, 0], 2) + torch.pow(x[:, :, :, 1], 2)
28 | 
29 |         x = self.amp_to_db(x)
30 | 
31 |         feat, mu = self.resnet(x.unsqueeze(1).float().to(self.device))
32 | 
33 |         # x = F.relu(self.mlp_layer1(x))
34 |         # self.drop_out(x)
35 |         # x = F.relu(self.mlp_layer2(x))
36 |         # self.drop_out(x)
37 |         # x = F.relu(self.mlp_layer3(x))
38 |         # feat = x
39 | 
40 | 
41 |         return self.oc_softmax(feat, labels, is_train)
42 | 


--------------------------------------------------------------------------------
/tools/dataset_loader.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import collections
 3 | import os
 4 | import soundfile as sf
 5 | import librosa
 6 | from torch.utils.data import DataLoader, Dataset
 7 | import numpy as np
 8 | from joblib import Parallel, delayed
 9 | 
# Root directory holding the per-split audio folders and protocol files.
LOGICAL_DATA_ROOT = './data/'

# Metadata describing one utterance listed in a protocol file.
ASVFile = collections.namedtuple('ASVFile',
                                 ['speaker_id', 'file_name', 'path', 'sys_id', 'key'])


class ASVDataset(Dataset):
    """Dataset over one split (train / dev / eval / eval2021).

    The split name is chosen from the constructor flags, in this order of
    precedence: ``is_eval2021``, ``is_eval``, ``is_train``, otherwise ``dev``.
    Utterances are listed in ``<data_root>/<split>.protocol.txt`` and the audio
    is read from ``<data_root>/<split>/<file_name>.flac``.

    :param transform: optional callable applied to the raw samples of each
        utterance; when ``None`` the samples are returned untouched
    :param is_train: select the ``train`` split (otherwise ``dev``)
    :param is_eval: select the ``eval`` split
    :param is_eval2021: select the ``eval2021`` split
    """

    def __init__(self,
                 transform=None,
                 is_train=True,
                 is_eval=False,
                 is_eval2021=False
                 ):
        super(ASVDataset, self).__init__()

        data_root = LOGICAL_DATA_ROOT

        self.is_eval = is_eval
        self.is_eval_2021 = is_eval2021

        self.data_root = data_root

        self.dset_name = 'eval2021' if is_eval2021 else 'eval' if is_eval else 'train' if is_train else 'dev'

        self.protocols_fname = os.path.join(self.data_root, self.dset_name + '.protocol.txt')

        self.files_dir = os.path.join(self.data_root, '{}'.format(self.dset_name))
        self.transform = transform

        self.files_meta = self.parse_protocols_file(self.protocols_fname)
        # Kept for callers that inspect the directory listing; it is no longer
        # used to size the dataset (see __len__).
        self.data_files = os.listdir(self.files_dir)

    def __len__(self):
        """Return the number of utterances listed in the protocol file.

        The protocol list is what ``__getitem__`` indexes, so it is also what
        defines the length. Counting directory entries instead would include
        non-audio files and make an index wrap around to a duplicate sample.
        """
        return len(self.files_meta)

    def __getitem__(self, idx):
        """Return ``(samples, label, meta)`` for the utterance at ``idx``."""
        idx = idx % len(self.files_meta)
        meta = self.files_meta[idx]
        data_x, data_y, _ = self.read_file(meta)
        if self.transform is not None:
            data_x = self.transform(data_x)
        return data_x, data_y, meta

    def read_file(self, meta):
        """Read the audio of ``meta`` and return ``(samples, label, sys_id)``.

        The label is ``meta.key`` converted to ``float``.
        """
        data_x, sample_rate = sf.read(meta.path)
        data_y = meta.key
        return data_x, float(data_y), meta.sys_id

    def _parse_line(self, line):
        """Turn one protocol line into an ``ASVFile``.

        For the 2021 evaluation split only the first token (the file name) is
        read and the key is fixed to 0. For all other splits the second token
        is the file name and the key is 1 when the fifth token is
        ``'bonafide'``, otherwise 0.
        """
        # split() without an argument tolerates repeated spaces and tabs.
        tokens = line.split()
        if self.is_eval_2021:
            return ASVFile(speaker_id='',
                           file_name=tokens[0],
                           path=os.path.join(self.files_dir, tokens[0] + '.flac'),
                           sys_id=0,
                           key=0)
        # The eval split does not record a speaker id; train/dev do.
        speaker_id = '' if self.is_eval else tokens[0]
        return ASVFile(speaker_id=speaker_id,
                       file_name=tokens[1],
                       path=os.path.join(self.files_dir, tokens[1] + '.flac'),
                       sys_id=0,
                       key=int(tokens[4] == 'bonafide'))

    def parse_protocols_file(self, protocols_fname):
        """Parse every non-blank line of ``protocols_fname`` into ``ASVFile``s."""
        with open(protocols_fname) as protocol_file:
            return [self._parse_line(line)
                    for line in protocol_file
                    if line.strip()]
85 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import torch
 4 | from torch.utils.data import DataLoader
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | from evaluate_tDCF_asvspoof19 import evaluate_tDCF_asvspoof19
 8 | import evaluation_metrics as em
 9 | import numpy as np
10 | from model import Model
11 | from loss import *
12 | import librosa
13 | import torchvision
14 | from torch import Tensor
15 | 
16 | from tools.dataset_loader import ASVDataset
17 | 
18 | 
def pad(x, max_len=64000):
    """Crop or loop a signal so that it has exactly ``max_len`` samples.

    Signals that are long enough are truncated. Shorter signals are repeated
    end-to-end (the whole signal is looped, not each individual sample) and
    then truncated.

    :param x: numpy array whose first axis is time
    :param max_len: number of samples in the result
    :return: array with ``max_len`` entries along the first axis
    :raises ValueError: if ``x`` is empty and therefore cannot be looped
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    if x_len == 0:
        raise ValueError("cannot pad an empty signal")
    # Integer repeat count: np.tile / np.repeat reject float counts.
    num_repeats = max_len // x_len + 1
    # Tile along the time axis only, leaving any trailing axes untouched.
    reps = (num_repeats,) + (1,) * (x.ndim - 1)
    return np.tile(x, reps)[:max_len]
28 | 
29 | 
def test_model(model_path, device, batch_size, eval_2021):
    """Score an evaluation split with a trained model and write the scores.

    When ``eval_2021`` is false the ``eval`` split is scored, one
    ``<file> <bonafide|spoof> <score>`` line per utterance is written to
    ``./scores/cm_score.txt`` and the t-DCF evaluation is run on that file.
    When ``eval_2021`` is true the ``eval2021`` split is scored and
    ``<file> <score>`` lines are written to ``./scores/cm_score_2021.txt``.

    :param model_path: path of the saved ``state_dict``
    :param device: torch device used for the model and the batches
    :param batch_size: batch size of the data loader
    :param eval_2021: select the 2021 evaluation split
    :return: None
    """
    transforms = torchvision.transforms.Compose([
        lambda x: pad(x),
        lambda x: librosa.util.normalize(x),
        lambda x: Tensor(x)
    ])

    model = Model(input_channels=1, num_classes=256, device=device).to(device)

    # Map the checkpoint onto the requested device so CPU-only hosts work too.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Build only the split that is going to be scored, so the protocol file
    # of the other split does not have to exist.
    test_set = ASVDataset(is_train=False, is_eval=True, is_eval2021=bool(eval_2021), transform=transforms)
    test_data_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        if not eval_2021:
            with open('./scores/cm_score.txt', 'w') as cm_score_file:
                for batch_x, batch_y, batch_meta in test_data_loader:
                    batch_x = batch_x.to(device)
                    labels = batch_y.to(device)
                    # The loss is not used here, so skip the training margins.
                    _loss, score = model(batch_x, labels, is_train=False)

                    for j in range(labels.size(0)):
                        cm_score_file.write(
                            '%s %s %s\n' % (batch_meta.file_name[j],
                                            'bonafide' if labels[j] == float(1) else 'spoof',
                                            score[j].item()))
        else:
            with open('./scores/cm_score_2021.txt', 'w') as cm_score_file_2021:
                for batch_x, batch_y, batch_meta in test_data_loader:
                    print('processing..', end="\r")
                    batch_x = batch_x.to(device)
                    labels = batch_y.to(device)
                    _loss, score = model(batch_x, labels, is_train=False)

                    for j in range(labels.size(0)):
                        cm_score_file_2021.write('%s %s\n' % (batch_meta.file_name[j], score[j].item()))

    if not eval_2021:
        # Run after the score file has been closed (and therefore flushed).
        evaluate_tDCF_asvspoof19('./scores/cm_score.txt',
                                 './scores/ASVspoof2019.LA.asv.eval.scores.txt', None)

    return
77 | 
78 | 
def test(model_path, device, batch_size, eval_2021):
    """Entry point used by the command line: score the selected split.

    Thin wrapper around ``test_model``. That function returns nothing, so its
    result is not printed.

    :param model_path: path of the saved ``state_dict``
    :param device: torch device to run on
    :param batch_size: batch size of the data loader
    :param eval_2021: select the 2021 evaluation split
    """
    test_model(model_path, device, batch_size, eval_2021)
82 | 
83 | 
if __name__ == "__main__":
    def _str2bool(value):
        """Parse a command-line boolean such as ``True``/``false``/``1``/``no``."""
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-m', '--model-path', type=str, help="path to the trained model", default="./models/")
    parser.add_argument('-b', '--batch-size', type=int, help="batch size for test process", default=32)
    # type=bool would turn every non-empty string (including "False") into
    # True, so parse the text explicitly. A bare "-e" also enables the flag.
    parser.add_argument('-e', '--eval-2021', type=_str2bool, nargs='?', const=True,
                        help="evaluate model over ASVspoof2021 data", default=False)
    parser.add_argument("--gpu", type=str, help="GPU index", default="0")
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test(args.model_path, device, args.batch_size, args.eval_2021)
94 | 


--------------------------------------------------------------------------------
/tools/audio_utils.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | import torch.nn.functional as F
  6 | 
  7 | 
  8 | def trimf(x, params):
  9 |     """
 10 |     trimf: similar to Matlab definition
 11 |     https://www.mathworks.com/help/fuzzy/trimf.html?s_tid=srchtitle
 12 | 
 13 |     """
 14 |     if len(params) != 3:
 15 |         print("trimp requires params to be a list of 3 elements")
 16 |         sys.exit(1)
 17 |     a = params[0]
 18 |     b = params[1]
 19 |     c = params[2]
 20 |     if a > b or b > c:
 21 |         print("trimp(x, [a, b, c]) requires a<=b<=c")
 22 |         sys.exit(1)
 23 |     y = torch.zeros_like(x, dtype=torch.float32)
 24 |     if a < b:
 25 |         index = torch.logical_and(a < x, x < b)
 26 |         y[index] = (x[index] - a) / (b - a)
 27 |     if b < c:
 28 |         index = torch.logical_and(b < x, x < c)
 29 |         y[index] = (c - x[index]) / (c - b)
 30 |     y[x == b] = 1
 31 |     return y
 32 | 
 33 | 
 34 | 
 35 | def dct1(x):
 36 |     """
 37 |     Discrete Cosine Transform, Type I
 38 |     :param x: the input signal
 39 |     :return: the DCT-I of the signal over the last dimension
 40 |     """
 41 |     x_shape = x.shape
 42 |     x = x.view(-1, x_shape[-1])
 43 | 
 44 |     return torch.rfft(
 45 |         torch.cat([x, x.flip([1])[:, 1:-1]], dim=1), 1)[:, :, 0].view(*x_shape)
 46 | 
 47 | 
 48 | def idct1(X):
 49 |     """
 50 |     The inverse of DCT-I, which is just a scaled DCT-I
 51 |     Our definition if idct1 is such that idct1(dct1(x)) == x
 52 |     :param X: the input signal
 53 |     :return: the inverse DCT-I of the signal over the last dimension
 54 |     """
 55 |     n = X.shape[-1]
 56 |     return dct1(X) / (2 * (n - 1))
 57 | 
 58 | 
 59 | def dct(x, norm=None):
 60 |     """
 61 |     Discrete Cosine Transform, Type II (a.k.a. the DCT)
 62 |     For the meaning of the parameter `norm`, see:
 63 |     https://docs.scipy.org/doc/ scipy.fftpack.dct.html
 64 |     :param x: the input signal
 65 |     :param norm: the normalization, None or 'ortho'
 66 |     :return: the DCT-II of the signal over the last dimension
 67 |     """
 68 |     x_shape = x.shape
 69 |     N = x_shape[-1]
 70 |     x = x.contiguous().view(-1, N)
 71 | 
 72 |     v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1)
 73 | 
 74 |     Vc = torch.rfft(v, 1, onesided=False)
 75 | 
 76 |     k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi/(2*N)
 77 |     W_r = torch.cos(k)
 78 |     W_i = torch.sin(k)
 79 | 
 80 |     V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i
 81 | 
 82 |     if norm == 'ortho':
 83 |         V[:, 0] /= np.sqrt(N) * 2
 84 |         V[:, 1:] /= np.sqrt(N / 2) * 2
 85 | 
 86 |     V = 2 * V.view(*x_shape)
 87 | 
 88 |     return V
 89 | 
 90 | 
 91 | def idct(X, norm=None):
 92 |     """
 93 |     The inverse to DCT-II, which is a scaled Discrete Cosine Transform, Type III
 94 |     Our definition of idct is that idct(dct(x)) == x
 95 |     For the meaning of the parameter `norm`, see:
 96 |     https://docs.scipy.org/doc/ scipy.fftpack.dct.html
 97 |     :param X: the input signal
 98 |     :param norm: the normalization, None or 'ortho'
 99 |     :return: the inverse DCT-II of the signal over the last dimension
100 |     """
101 | 
102 |     x_shape = X.shape
103 |     N = x_shape[-1]
104 | 
105 |     X_v = X.contiguous().view(-1, x_shape[-1]) / 2
106 | 
107 |     if norm == 'ortho':
108 |         X_v[:, 0] *= np.sqrt(N) * 2
109 |         X_v[:, 1:] *= np.sqrt(N / 2) * 2
110 | 
111 |     k = torch.arange(x_shape[-1], dtype=X.dtype,
112 |                      device=X.device)[None, :]*np.pi/(2*N)
113 |     W_r = torch.cos(k)
114 |     W_i = torch.sin(k)
115 | 
116 |     V_t_r = X_v
117 |     V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1)
118 | 
119 |     V_r = V_t_r * W_r - V_t_i * W_i
120 |     V_i = V_t_r * W_i + V_t_i * W_r
121 | 
122 |     V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2)
123 | 
124 |     v = torch.irfft(V, 1, onesided=False)
125 |     x = v.new_zeros(v.shape)
126 |     x[:, ::2] += v[:, :N - (N // 2)]
127 |     x[:, 1::2] += v.flip([1])[:, :N // 2]
128 | 
129 |     return x.view(*x_shape)
130 | 
131 | 
def delta(x):
    """ By default
    input
    -----
    x (batch, Length, dim)

    output
    ------
    output (batch, Length, dim)

    Delta is calculated along Length dimension: each frame becomes the
    difference between its following and preceding frame, with the first and
    last frames replicated at the boundaries.
    """
    length = x.shape[1]
    # Replicate-pad one frame on each side of the Length axis; the temporary
    # extra axis gives F.pad the 4-D layout it pads here.
    x_temp = F.pad(x.unsqueeze(1), (0, 0, 1, 1), 'replicate').squeeze(1)
    return x_temp[:, 2:] - x_temp[:, 0:length]
149 | 


--------------------------------------------------------------------------------
/evaluate_tDCF_asvspoof19.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import numpy as np
  3 | import evaluation_metrics as em
  4 | import matplotlib.pyplot as plt
  5 | 
  6 | def evaluate_tDCF_asvspoof19(cm_score_file, asv_score_file, legacy):
  7 | 
  8 |     # Fix tandem detection cost function (t-DCF) parameters
  9 |     if legacy:
 10 |         Pspoof = 0.05
 11 |         cost_model = {
 12 |             'Pspoof': Pspoof,  # Prior probability of a spoofing attack
 13 |             'Ptar': (1 - Pspoof) * 0.99,  # Prior probability of target speaker
 14 |             'Pnon': (1 - Pspoof) * 0.01,  # Prior probability of nontarget speaker
 15 |             'Cmiss_asv': 1,  # Cost of ASV system falsely rejecting target speaker
 16 |             'Cfa_asv': 10,  # Cost of ASV system falsely accepting nontarget speaker
 17 |             'Cmiss_cm': 1,  # Cost of CM system falsely rejecting target speaker
 18 |             'Cfa_cm': 10,  # Cost of CM system falsely accepting spoof
 19 |         }
 20 |     else:
 21 |         Pspoof = 0.05
 22 |         cost_model = {
 23 |             'Pspoof': Pspoof,  # Prior probability of a spoofing attack
 24 |             'Ptar': (1 - Pspoof) * 0.99,  # Prior probability of target speaker
 25 |             'Pnon': (1 - Pspoof) * 0.01,  # Prior probability of nontarget speaker
 26 |             'Cmiss': 1,  # Cost of tandem system falsely rejecting target speaker
 27 |             'Cfa': 10,  # Cost of tandem system falsely accepting nontarget speaker
 28 |             'Cfa_spoof': 10,  # Cost of tandem system falsely accepting spoof
 29 |         }
 30 | 
 31 |     # Load organizers' ASV scores
 32 |     asv_data = np.genfromtxt(asv_score_file, dtype=str)
 33 |     asv_keys = asv_data[:, 1]
 34 |     asv_scores = asv_data[:, 2].astype(np.float)
 35 | 
 36 |     # Load CM scores
 37 |     cm_data = np.genfromtxt(cm_score_file, dtype=str)
 38 |     cm_keys = cm_data[:, 1]
 39 |     cm_scores = cm_data[:, 2].astype(np.float)
 40 | 
 41 |     # Extract target, nontarget, and spoof scores from the ASV scores
 42 |     tar_asv = asv_scores[asv_keys == 'target']
 43 |     non_asv = asv_scores[asv_keys == 'nontarget']
 44 |     spoof_asv = asv_scores[asv_keys == 'spoof']
 45 | 
 46 |     # Extract bona fide (real human) and spoof scores from the CM scores
 47 |     bona_cm = cm_scores[cm_keys == 'bonafide']
 48 |     spoof_cm = cm_scores[cm_keys == 'spoof']
 49 | 
 50 |     # EERs of the standalone systems and fix ASV operating point to EER threshold
 51 |     eer_asv, asv_threshold = em.compute_eer(tar_asv, non_asv)
 52 |     eer_cm = em.compute_eer(bona_cm, spoof_cm)[0]
 53 | 
 54 | 
 55 |     [Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, Pfa_spoof_asv] = em.obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold)
 56 | 
 57 | 
 58 |     # Compute t-DCF
 59 |     if legacy:
 60 |         tDCF_curve, CM_thresholds = em.compute_tDCF_legacy(bona_cm, spoof_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, True)
 61 |     else:
 62 |         tDCF_curve, CM_thresholds = em.compute_tDCF(bona_cm, spoof_cm, Pfa_asv, Pmiss_asv, Pfa_spoof_asv, cost_model, True)
 63 | 
 64 |     # Minimum t-DCF
 65 |     min_tDCF_index = np.argmin(tDCF_curve)
 66 |     min_tDCF = tDCF_curve[min_tDCF_index]
 67 |     min_tDCF_threshold = CM_thresholds[min_tDCF_index];
 68 | 
 69 |     # compute DET of CM and get Pmiss and Pfa for the selected threshold t_CM
 70 |     Pmiss_cm, Pfa_cm, CM_thresholds = em.compute_det_curve(bona_cm, spoof_cm)
 71 |     Pmiss_t_CM = Pmiss_cm[CM_thresholds == min_tDCF_threshold]
 72 |     Pfa_t_CM = Pfa_cm[CM_thresholds == min_tDCF_threshold]
 73 | 
 74 | 
 75 |     print('ASV SYSTEM')
 76 |     print('   EER            = {:8.5f} % (Equal error rate (target vs. nontarget discrimination)'.format(eer_asv * 100))
 77 |     print('   Pfa            = {:8.5f} % (False acceptance rate of nontargets)'.format(Pfa_asv * 100))
 78 |     print('   Pmiss          = {:8.5f} % (False rejection rate of targets)'.format(Pmiss_asv * 100))
 79 |     if legacy:
 80 |         print('   1-Pmiss,spoof  = {:8.5f} % (Spoof false acceptance rate)'.format((1 - Pmiss_spoof_asv) * 100))
 81 |     else:
 82 |         print('   Pfa,spoof  = {:8.5f} % (Spoof false acceptance rate)'.format((1 - Pmiss_spoof_asv) * 100))
 83 | 
 84 |     print('\nCM SYSTEM')
 85 |     print('   EER                  = {:8.5f} % (Equal error rate for countermeasure)'.format(eer_cm * 100))
 86 |     print('   Pfa(t_CM_min_tDCF)   = {:8.5f} % (False acceptance rate of spoofs)'.format(Pfa_t_CM[0] * 100))
 87 |     print('   Pmiss(t_CM_min_tDCF) = {:8.5f} % (Miss (false rejection) rate of bonafide)'.format(Pmiss_t_CM[0] * 100))
 88 | 
 89 |     print('\nTANDEM')
 90 |     print('   min-tDCF       = {:8.5f}'.format(min_tDCF))
 91 | 
 92 | 
 93 |     # Visualize ASV scores and CM scores
 94 |     plt.figure()
 95 |     ax = plt.subplot(121)
 96 |     plt.hist(tar_asv, histtype='step', density=True, bins=50, label='Target')
 97 |     plt.hist(non_asv, histtype='step', density=True, bins=50, label='Nontarget')
 98 |     plt.hist(spoof_asv, histtype='step', density=True, bins=50, label='Spoof')
 99 |     plt.plot(asv_threshold, 0, 'o', markersize=10, mfc='none', mew=2, clip_on=False, label='EER threshold')
100 |     plt.legend()
101 |     plt.xlabel('ASV score')
102 |     plt.ylabel('Density')
103 |     plt.title('ASV score histogram')
104 | 
105 |     ax = plt.subplot(122)
106 |     plt.hist(bona_cm, histtype='step', density=True, bins=50, label='Bona fide')
107 |     plt.hist(spoof_cm, histtype='step', density=True, bins=50, label='Spoof')
108 |     plt.legend()
109 |     plt.xlabel('CM score')
110 |     #plt.ylabel('Density')
111 |     plt.title('CM score histogram')
112 | 
113 | 
114 |     # Plot t-DCF as function of the CM threshold.
115 |     plt.figure()
116 |     plt.plot(CM_thresholds, tDCF_curve)
117 |     plt.plot(CM_thresholds[min_tDCF_index], min_tDCF, 'o', markersize=10, mfc='none', mew=2)
118 |     plt.xlabel('CM threshold index (operating point)')
119 |     plt.ylabel('Norm t-DCF');
120 |     plt.title('Normalized tandem t-DCF')
121 |     plt.plot([np.min(CM_thresholds), np.max(CM_thresholds)], [1, 1], '--', color='black')
122 |     plt.legend(('t-DCF', 'min t-DCF ({:.5f})'.format(min_tDCF), 'Arbitrarily bad CM (Norm t-DCF=1)'))
123 |     plt.xlim([np.min(CM_thresholds), np.max(CM_thresholds)])
124 |     plt.ylim([0, 1.5])
125 | 
126 |     plt.show()
127 | 


--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.autograd.function import Function
  4 | import torch.nn.functional as F
  5 | from torch.autograd import Variable
  6 | from torch.nn import Parameter
  7 | 
  8 | 
  9 | class OCAngleLayer(nn.Module):
 10 |     """ Output layer to produce activation for one-class softmax
 11 | 
 12 |     Usage example:
 13 |      batchsize = 64
 14 |      input_dim = 10
 15 |      class_num = 2
 16 | 
 17 |      l_layer = OCAngleLayer(input_dim)
 18 |      l_loss = OCSoftmaxWithLoss()
 19 | 
 20 |      data = torch.rand(batchsize, input_dim, requires_grad=True)
 21 |      target = (torch.rand(batchsize) * class_num).clamp(0, class_num-1)
 22 |      target = target.to(torch.long)
 23 | 
 24 |      scores = l_layer(data)
 25 |      loss = l_loss(scores, target)
 26 | 
 27 |      loss.backward()
 28 |     """
 29 | 
 30 |     def __init__(self, in_planes, w_posi=0.9, w_nega=0.2, alpha=20.0):
 31 |         super(OCAngleLayer, self).__init__()
 32 |         self.in_planes = in_planes
 33 |         self.w_posi = w_posi
 34 |         self.w_nega = w_nega
 35 |         self.out_planes = 1
 36 | 
 37 |         self.weight = Parameter(torch.Tensor(in_planes, self.out_planes))
 38 |         # self.weight.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5)
 39 |         nn.init.kaiming_uniform_(self.weight, 0.25)
 40 |         self.weight.data.renorm_(2, 1, 1e-5).mul_(1e5)
 41 | 
 42 |         self.alpha = alpha
 43 | 
 44 |     def forward(self, input, flag_angle_only=False):
 45 |         """
 46 |         Compute oc-softmax activations
 47 | 
 48 |         input:
 49 |         ------
 50 |           input tensor (batchsize, input_dim)
 51 | 
 52 |         output:
 53 |         -------
 54 |           tuple of tensor ((batchsize, output_dim), (batchsize, output_dim))
 55 |         """
 56 |         # w (feature_dim, output_dim)
 57 |         w = self.weight.renorm(2, 1, 1e-5).mul(1e5)
 58 |         # x_modulus (batchsize)
 59 |         # sum input -> x_modules in shape (batchsize)
 60 |         x_modulus = input.pow(2).sum(1).pow(0.5)
 61 |         # w_modules (output_dim)
 62 |         # w_moduls should be 1, since w has been normalized
 63 |         # w_modulus = w.pow(2).sum(0).pow(0.5)
 64 | 
 65 |         # W * x = ||W|| * ||x|| * cos())))))))
 66 |         # inner_wx (batchsize, 1)
 67 |         inner_wx = input.mm(w)
 68 |         # cos_theta (batchsize, output_dim)
 69 |         cos_theta = inner_wx / x_modulus.view(-1, 1)
 70 |         cos_theta = cos_theta.clamp(-1, 1)
 71 | 
 72 |         if flag_angle_only:
 73 |             pos_score = cos_theta
 74 |             neg_score = cos_theta
 75 |         else:
 76 |             pos_score = self.alpha * (self.w_posi - cos_theta)
 77 |             neg_score = -1 * self.alpha * (self.w_nega - cos_theta)
 78 | 
 79 |         #
 80 |         return pos_score, neg_score
 81 | 
 82 | 
 83 | class OCSoftmaxWithLoss(nn.Module):
 84 |     """
 85 |     OCSoftmaxWithLoss()
 86 | 
 87 |     """
 88 | 
 89 |     def __init__(self):
 90 |         super(OCSoftmaxWithLoss, self).__init__()
 91 |         self.m_loss = nn.Softplus()
 92 | 
 93 |     def forward(self, inputs, target):
 94 |         """
 95 |         input:
 96 |         ------
 97 |           input: tuple of tensors ((batchsie, out_dim), (batchsie, out_dim))
 98 |                  output from OCAngle
 99 |                  inputs[0]: positive class score
100 |                  inputs[1]: negative class score
101 |           target: tensor (batchsize)
102 |                  tensor of target index
103 |         output:
104 |         ------
105 |           loss: scalar
106 |         """
107 |         # Assume target is binary, positive = 1, negaitve = 0
108 |         #
109 |         # Equivalent to select the scores using if-elese
110 |         # if target = 1, use inputs[0]
111 |         # else, use inputs[1]
112 |         output = inputs[0] * target.view(-1, 1) + \
113 |                  inputs[1] * (1 - target.view(-1, 1))
114 |         loss = self.m_loss(output).mean()
115 | 
116 |         return loss
117 | 
118 | 
class OCSoftmax(nn.Module):
    """One-class softmax head with a single learnable centre direction.

    Scores are the cosine between each feature and the centre. During
    training, samples labelled 0 use the ``r_real`` margin and samples
    labelled 1 use the ``r_fake`` margin before the softplus loss.
    """

    def __init__(self, feat_dim=2, r_real=0.9, r_fake=0.5, alpha=20.0):
        super().__init__()
        self.feat_dim = feat_dim
        self.r_real = r_real
        self.r_fake = r_fake
        self.alpha = alpha
        self.center = nn.Parameter(torch.randn(1, self.feat_dim))
        nn.init.kaiming_uniform_(self.center, 0.25)
        self.softplus = nn.Softplus()

    def forward(self, x, labels, is_train=True):
        """
        Args:
            x: feature matrix with shape (batch_size, feat_dim).
            labels: ground truth labels with shape (batch_size).
            is_train: apply the margins to the scores before the loss.

        Returns:
            (loss, scores): the scalar softplus loss and the negated cosine
            scores with shape (batch_size).
        """
        center_dir = F.normalize(self.center, p=2, dim=1)
        feats = F.normalize(x, p=2, dim=1)

        # Cosine similarity of every feature with the centre, (batch_size, 1).
        scores = feats @ center_dir.transpose(0, 1)
        # Keep the raw cosines: the margins below modify `scores` in place.
        output_scores = scores.clone()

        if is_train:
            zero_mask = labels == 0
            one_mask = labels == 1
            scores[zero_mask] = self.r_real - scores[zero_mask]
            scores[one_mask] = scores[one_mask] - self.r_fake

        loss = self.softplus(self.alpha * scores).mean()

        return loss, -output_scores.squeeze(1)
151 | 
152 | 
class AMSoftmax(nn.Module):
    """Additive-margin softmax head.

    :param num_classes: number of class centres
    :param enc_dim: feature dimension
    :param s: scale applied to the margin-adjusted logits
    :param m: margin subtracted from the logit of the true class
    """

    def __init__(self, num_classes, enc_dim, s=20, m=0.9):
        super(AMSoftmax, self).__init__()
        self.enc_dim = enc_dim
        self.num_classes = num_classes
        self.s = s
        self.m = m
        self.centers = nn.Parameter(torch.randn(num_classes, enc_dim))

    def forward(self, feat, label):
        """Return ``(logits, margin_logits)``.

        :param feat: features of shape (batch_size, enc_dim)
        :param label: integer class indices of shape (batch_size)
        :return: cosine logits of shape (batch_size, num_classes) and the
            same logits with the margin subtracted at the true class and
            scaled by ``s``
        """
        batch_size = feat.shape[0]
        norms = torch.norm(feat, p=2, dim=-1, keepdim=True)
        nfeat = torch.div(feat, norms)

        norms_c = torch.norm(self.centers, p=2, dim=-1, keepdim=True)
        ncenters = torch.div(self.centers, norms_c)
        logits = torch.matmul(nfeat, torch.transpose(ncenters, 0, 1))

        # Margin mask built on the same device/dtype as the logits, so the
        # layer works on CPU as well as on GPU.
        y_onehot = torch.zeros(batch_size, self.num_classes,
                               dtype=logits.dtype, device=logits.device)
        y_onehot.scatter_(1, torch.unsqueeze(label, dim=-1), self.m)
        margin_logits = self.s * (logits - y_onehot)

        return logits, margin_logits
178 | 


--------------------------------------------------------------------------------
/feature_layers.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch
  3 | import tools.audio_utils
  4 | import librosa
  5 | import numpy as np
  6 | 
  7 | 
  8 | class CQT(nn.Module):
  9 |     def __init__(self, sampling_rate):
 10 |         super(CQT, self).__init__()
 11 |         self.sampling_rate = sampling_rate
 12 | 
 13 |     def forward(self, x, device):
 14 |         batch_size = x.shape[0]
 15 |         batch_output = torch.zeros(batch_size, 84, 126)
 16 |         batch_count = 0
 17 |         for item in x:
 18 |             numpy_item = item.numpy()
 19 |             item_cqt = librosa.cqt(numpy_item, sr=self.sampling_rate)
 20 |             item_cqt = librosa.amplitude_to_db(np.abs(item_cqt), ref=np.max)
 21 |             item_torch_cqt = torch.from_numpy(item_cqt).to(device)
 22 |             batch_output[batch_count] = item_torch_cqt
 23 |             batch_count += 1
 24 | 
 25 |         return batch_output.to(device)
 26 | 
 27 | 
 28 | class Spectrogram(nn.Module):
 29 |     def __init__(self, n_fft):
 30 |         super(Spectrogram, self).__init__()
 31 |         self.n_fft = n_fft
 32 | 
 33 |     def forward(self, x, device):
 34 |         batch_size = x.shape[0]
 35 |         batch_output = torch.zeros(batch_size, 1025, 126)
 36 |         batch_count = 0
 37 |         for item in x:
 38 |             numpy_item = item.numpy()
 39 |             item_stft = librosa.stft(numpy_item, n_fft=self.n_fft)
 40 |             item_stft = librosa.amplitude_to_db(np.abs(item_stft), ref=np.max)
 41 |             item_torch_stft = torch.from_numpy(item_stft).to(device)
 42 |             batch_output[batch_count] = item_torch_stft
 43 |             batch_count += 1
 44 | 
 45 |         return batch_output.to(device)
 46 | 
 47 | 
 48 | class LinearDCT(nn.Linear):
 49 |     """Implement any DCT as a linear layer; in practice this executes around
 50 |     50x faster on GPU. Unfortunately, the DCT matrix is stored, which will
 51 |     increase memory usage.
 52 |     :param in_features: size of expected input
 53 |     :param type: which dct function in this file to use"""
 54 |     def __init__(self, in_features, type, norm=None, bias=False):
 55 |         self.type = type
 56 |         self.N = in_features
 57 |         self.norm = norm
 58 |         super(LinearDCT, self).__init__(in_features, in_features, bias=bias)
 59 | 
 60 |     def reset_parameters(self):
 61 |         # initialise using dct function
 62 |         I = torch.eye(self.N)
 63 |         if self.type == 'dct1':
 64 |             self.weight.data = tools.audio_utils.dct1(I).data.t()
 65 |         elif self.type == 'idct1':
 66 |             self.weight.data = tools.audio_utils.idct1(I).data.t()
 67 |         elif self.type == 'dct':
 68 |             self.weight.data = tools.audio_utils.dct(I, norm=self.norm).data.t()
 69 |         elif self.type == 'idct':
 70 |             self.weight.data = tools.audio_utils.idct(I, norm=self.norm).data.t()
 71 |         self.weight.requires_grad = False # don't learn this!
 72 | 
 73 | 
 74 | class LFCC(nn.Module):
 75 |     """ Based on asvspoof.org baseline Matlab code.
 76 |     Difference: with_energy is added to set the first dimension as energy
 77 |     """
 78 | 
 79 |     def __init__(self, fl, fs, fn, sr, filter_num,
 80 |                  with_energy=False, with_emphasis=True,
 81 |                  with_delta=True, flag_for_LFB=False):
 82 |         """ Initialize LFCC
 83 | 
 84 |         Para:
 85 |         -----
 86 |           fl: int, frame length, (number of waveform points)
 87 |           fs: int, frame shift, (number of waveform points)
 88 |           fn: int, FFT points
 89 |           sr: int, sampling rate (Hz)
 90 |           filter_num: int, number of filters in filter-bank
 91 |           with_energy: bool, (default False), whether replace 1st dim to energy
 92 |           with_emphasis: bool, (default True), whether pre-emphaze input wav
 93 |           with_delta: bool, (default True), whether use delta and delta-delta
 94 | 
 95 |           for_LFB: bool (default False), reserved for LFB feature
 96 |         """
 97 |         super(LFCC, self).__init__()
 98 |         self.fl = fl
 99 |         self.fs = fs
100 |         self.fn = fn
101 |         self.sr = sr
102 |         self.filter_num = filter_num
103 | 
104 |         # build the triangle filter bank
105 |         f = (sr / 2) * torch.linspace(0, 1, fn // 2 + 1)
106 |         filter_bands = torch.linspace(min(f), max(f), filter_num + 2)
107 | 
108 |         filter_bank = torch.zeros([fn // 2 + 1, filter_num])
109 |         for idx in range(filter_num):
110 |             filter_bank[:, idx] = tools.audio_utils.trimf(
111 |                 f, [filter_bands[idx],
112 |                     filter_bands[idx + 1],
113 |                     filter_bands[idx + 2]])
114 |         self.lfcc_fb = nn.Parameter(filter_bank, requires_grad=False)
115 | 
116 |         # DCT as a linear transformation layer
117 |         self.l_dct = LinearDCT(filter_num, 'dct', norm='ortho')
118 | 
119 |         # opts
120 |         self.with_energy = with_energy
121 |         self.with_emphasis = with_emphasis
122 |         self.with_delta = with_delta
123 |         self.flag_for_LFB = flag_for_LFB
124 |         return
125 | 
126 |     def forward(self, x):
127 |         """
128 | 
129 |         input:
130 |         ------
131 |          x: tensor(batch, length), where length is waveform length
132 | 
133 |         output:
134 |         -------
135 |          lfcc_output: tensor(batch, frame_num, dim_num)
136 |         """
137 |         # pre-emphsis
138 |         if self.with_emphasis:
139 |             x[:, 1:] = x[:, 1:] - 0.97 * x[:, 0:-1]
140 | 
141 |         # STFT
142 |         x_stft = torch.stft(x, self.fn, self.fs, self.fl,
143 |                             window=torch.hamming_window(self.fl).to(x.device),
144 |                             onesided=True, pad_mode="constant")
145 |         # amplitude
146 |         sp_amp = torch.norm(x_stft, 2, -1).pow(2).permute(0, 2, 1).contiguous()
147 | 
148 |         # filter bank
149 |         fb_feature = torch.log10(torch.matmul(sp_amp, self.lfcc_fb) +
150 |                                  torch.finfo(torch.float32).eps)
151 | 
152 |         # DCT (if necessary, remove DCT)
153 |         lfcc = self.l_dct(fb_feature) if not self.flag_for_LFB else fb_feature
154 | 
155 |         # Add energy
156 |         if self.with_energy:
157 |             power_spec = sp_amp / self.fn
158 |             energy = torch.log10(power_spec.sum(axis=2) +
159 |                                  torch.finfo(torch.float32).eps)
160 |             lfcc[:, :, 0] = energy
161 | 
162 |         # Add delta coefficients
163 |         if self.with_delta:
164 |             lfcc_delta = tools.audio_utils.delta(lfcc)
165 |             lfcc_delta_delta = tools.audio_utils.delta(lfcc_delta)
166 |             lfcc_output = torch.cat((lfcc, lfcc_delta, lfcc_delta_delta), 2)
167 |         else:
168 |             lfcc_output = lfcc
169 | 
170 |         # done
171 |         return lfcc_output
172 | 


--------------------------------------------------------------------------------
/resnet.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import torch.nn.init as init
  5 | import os
  6 | import random
  7 | import numpy as np
  8 | 
  9 | 
 10 | # Adapted from https://github.com/joaomonteirof/e2e_antispoofing
 11 | 
 12 | class SelfAttention(nn.Module):
 13 |     def __init__(self, hidden_size, mean_only=False):
 14 |         super(SelfAttention, self).__init__()
 15 | 
 16 |         self.hidden_size = hidden_size
 17 |         self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size),requires_grad=True)
 18 | 
 19 |         self.mean_only = mean_only
 20 | 
 21 |         init.kaiming_uniform_(self.att_weights)
 22 | 
 23 |     def forward(self, inputs):
 24 | 
 25 |         batch_size = inputs.size(0)
 26 |         weights = torch.bmm(inputs, self.att_weights.permute(1, 0).unsqueeze(0).repeat(batch_size, 1, 1))
 27 | 
 28 |         if inputs.size(0)==1:
 29 |             attentions = F.softmax(torch.tanh(weights),dim=1)
 30 |             weighted = torch.mul(inputs, attentions.expand_as(inputs))
 31 |         else:
 32 |             attentions = F.softmax(torch.tanh(weights.squeeze()),dim=1)
 33 |             weighted = torch.mul(inputs, attentions.unsqueeze(2).expand_as(inputs))
 34 | 
 35 |         if self.mean_only:
 36 |             return weighted.sum(1)
 37 |         else:
 38 |             noise = 1e-5*torch.randn(weighted.size())
 39 | 
 40 |             if inputs.is_cuda:
 41 |                 noise = noise.to(inputs.device)
 42 |             avg_repr, std_repr = weighted.sum(1), (weighted+noise).std(1)
 43 | 
 44 |             representations = torch.cat((avg_repr,std_repr),1)
 45 | 
 46 |             return representations
 47 | 
 48 | 
 49 | class PreActBlock(nn.Module):
 50 |     '''Pre-activation version of the BasicBlock.'''
 51 |     expansion = 1
 52 | 
 53 |     def __init__(self, in_planes, planes, stride, *args, **kwargs):
 54 |         super(PreActBlock, self).__init__()
 55 |         self.bn1 = nn.BatchNorm2d(in_planes)
 56 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 57 |         self.bn2 = nn.BatchNorm2d(planes)
 58 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
 59 | 
 60 |         if stride != 1 or in_planes != self.expansion*planes:
 61 |             self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False))
 62 | 
 63 |     def forward(self, x):
 64 |         out = F.relu(self.bn1(x))
 65 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 66 |         out = self.conv1(out)
 67 |         out = self.conv2(F.relu(self.bn2(out)))
 68 |         out += shortcut
 69 |         return out
 70 | 
 71 | 
 72 | class PreActBottleneck(nn.Module):
 73 |     '''Pre-activation version of the original Bottleneck module.'''
 74 |     expansion = 4
 75 | 
 76 |     def __init__(self, in_planes, planes, stride, *args, **kwargs):
 77 |         super(PreActBottleneck, self).__init__()
 78 |         self.bn1 = nn.BatchNorm2d(in_planes)
 79 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
 80 |         self.bn2 = nn.BatchNorm2d(planes)
 81 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 82 |         self.bn3 = nn.BatchNorm2d(planes)
 83 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
 84 | 
 85 |         if stride != 1 or in_planes != self.expansion*planes:
 86 |             self.shortcut = nn.Sequential(nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False))
 87 | 
 88 |     def forward(self, x):
 89 |         out = F.relu(self.bn1(x))
 90 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 91 |         out = self.conv1(out)
 92 |         out = self.conv2(F.relu(self.bn2(out)))
 93 |         out = self.conv3(F.relu(self.bn3(out)))
 94 |         out += shortcut
 95 |         return out
 96 | 
 97 | 
 98 | def conv3x3(in_planes, out_planes, stride=1):
 99 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
100 | 
101 | 
def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2d`` (no padding)."""
    conv_options = dict(kernel_size=1, stride=stride, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
104 | 
105 | 
# Maps a ``resnet_type`` key to ``[blocks per stage, residual block class]``.
# NOTE(review): '28' and '34' share the same stage layout - confirm intended.
RESNET_CONFIGS = {
    depth: [blocks_per_stage, block_cls]
    for depth, blocks_per_stage, block_cls in (
        ('18', [2, 2, 2, 2], PreActBlock),
        ('28', [3, 4, 6, 3], PreActBlock),
        ('34', [3, 4, 6, 3], PreActBlock),
        ('50', [3, 4, 6, 3], PreActBottleneck),
        ('101', [3, 4, 23, 3], PreActBottleneck),
    )
}
112 | 
113 | 
def setup_seed(random_seed, cudnn_deterministic=True):
    """Seed the Python, NumPy and PyTorch random number generators.

    ``PYTHONHASHSEED`` is exported as well. When CUDA is available, all CUDA
    generators are seeded, cuDNN benchmarking is switched off and cuDNN
    determinism is set to ``cudnn_deterministic``.
    """
    os.environ['PYTHONHASHSEED'] = str(random_seed)
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(random_seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(random_seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = cudnn_deterministic
    cudnn.benchmark = False
125 | 
126 | 
class ResNet(nn.Module):
    """Pre-activation ResNet encoder with attention pooling.

    :param num_nodes: height of the ``conv5`` kernel
    :param enc_dim: size of the embedding produced by ``fc``
    :param resnet_type: key into ``RESNET_CONFIGS``
    :param nclasses: number of outputs of ``fc_mu``; values below 2 give a
        single output
    """

    def __init__(self, num_nodes, enc_dim, resnet_type='18', nclasses=2):
        # Running channel count, read and advanced by ``_make_layer``.
        self.in_planes = 16
        super().__init__()

        blocks_per_stage, block = RESNET_CONFIGS[resnet_type]

        self._norm_layer = nn.BatchNorm2d

        # Stem: single-channel input, 16 feature maps.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(9, 3), stride=(3, 1),
                               padding=(1, 1), bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.activation = nn.ReLU()

        # Residual stages layer1..layer4 as (planes, stride) pairs.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for index, (planes, stride) in enumerate(stage_specs):
            stage = self._make_layer(block, planes, blocks_per_stage[index],
                                     stride=stride)
            setattr(self, 'layer{}'.format(index + 1), stage)

        self.conv5 = nn.Conv2d(512 * block.expansion, 256,
                               kernel_size=(num_nodes, 3), stride=(1, 1),
                               padding=(0, 1), bias=False)
        self.bn5 = nn.BatchNorm2d(256)
        self.fc = nn.Linear(256 * 2, enc_dim)
        n_outputs = nclasses if nclasses >= 2 else 1
        self.fc_mu = nn.Linear(enc_dim, n_outputs)

        # The attention module is created after the re-initialisation pass.
        self.initialize_params()
        self.attention = SelfAttention(256)

    def initialize_params(self):
        """Re-initialise conv / linear weights and reset batch-norm affine terms."""
        batch_norm_types = (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)
        for module in self.modules():
            if isinstance(module, torch.nn.Conv2d):
                init.kaiming_normal_(module.weight, a=0, mode='fan_out')
            elif isinstance(module, torch.nn.Linear):
                init.kaiming_uniform_(module.weight)
            elif isinstance(module, batch_norm_types):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, num_blocks, stride=1):
        """Stack ``num_blocks`` residual blocks; only the first uses ``stride``."""
        norm_layer = self._norm_layer
        out_planes = planes * block.expansion

        # Built and handed to the first block positionally. The block classes
        # defined in this file take it through ``*args`` and do not use it.
        downsample = None
        if stride != 1 or self.in_planes != out_planes:
            downsample = nn.Sequential(conv1x1(self.in_planes, out_planes, stride),
                                       norm_layer(out_planes))

        stage = [block(self.in_planes, planes, stride, downsample, 1, 64, 1, norm_layer)]
        self.in_planes = out_planes
        stage.extend(
            block(self.in_planes, planes, 1, groups=1, base_width=64,
                  dilation=False, norm_layer=norm_layer)
            for _ in range(num_blocks - 1)
        )

        return nn.Sequential(*stage)

    def forward(self, x):
        """Return ``(feat, mu)``: the embedding and the ``fc_mu`` output."""
        x = self.activation(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.activation(self.bn5(self.conv5(x)))
        # Merge the two trailing dimensions, then put that axis second so
        # the attention module pools over it.
        x = x.flatten(start_dim=2)
        stats = self.attention(x.transpose(1, 2).contiguous())

        feat = self.fc(stats)
        mu = self.fc_mu(feat)
        return feat, mu
195 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import argparse
  3 | import os
  4 | import time
  5 | import numpy as np
  6 | import json
  7 | 
  8 | from torch import Tensor
  9 | import torchvision
 10 | import librosa
 11 | from model import Model
 12 | from torch.utils.data import DataLoader
 13 | from tensorboardX import SummaryWriter
 14 | import tools.dataset_loader as dataset_loader
 15 | from sklearn.model_selection import KFold
 16 | 
 17 | from resnet import setup_seed
 18 | from collections import defaultdict
 19 | from loss import *
 20 | import evaluation_metrics as em
 21 | 
 22 | 
class Color:
    """ANSI escape sequences for styling text printed to a terminal.

    Wrap a message as ``f'{Color.OKGREEN}text{Color.ENDC}'``; ``ENDC``
    resets the styling again.
    """
    # Foreground colours (ANSI "bright" range 9x).
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset all attributes.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
 33 | 
 34 | 
 35 | def add_parser(parser):
 36 |     parser.add_argument("--feat_len", type=int, help="features length", default=750)
 37 |     parser.add_argument("--enc_dim", type=int, help="encoding dimension", default=256)
 38 | 
 39 | 
 40 |     parser.add_argument('--num-epochs', type=int, default=100, help="Number of epochs for training")
 41 |     parser.add_argument('--batch-size', type=int, default=4, help="Mini batch size for training")
 42 |     parser.add_argument('--epoch', type=int, default=0, help="current epoch number")
 43 |     parser.add_argument('--lr', type=float, default=0.0003, help="learning rate")
 44 |     parser.add_argument('--lr-decay', type=float, default=0.5, help="decay learning rate")
 45 |     parser.add_argument('--interval', type=int, default=10, help="interval to decay lr")
 46 | 
 47 |     parser.add_argument('--beta-1', type=float, default=0.9, help="bata_1 for Adam")
 48 |     parser.add_argument('--beta-2', type=float, default=0.999, help="beta_2 for Adam")
 49 |     parser.add_argument('--eps', type=float, default=1e-8, help="epsilon for Adam")
 50 |     parser.add_argument("--gpu", type=str, help="GPU index", default="1")
 51 |     parser.add_argument('--num-workers', type=int, default=0, help="number of workers")
 52 |     parser.add_argument('--seed', type=int, help="random number seed", default=598)
 53 | 
 54 |     parser.add_argument('--r-real', type=float, default=0.9, help="r_real for ocsoftmax")
 55 |     parser.add_argument('--r-fake', type=float, default=0.2, help="r_fake for ocsoftmax")
 56 |     parser.add_argument('--alpha', type=float, default=20, help="scale factor for ocsoftmax")
 57 | 
 58 |     parser.add_argument('--model-path', type=str, help="saved model path")
 59 |     args = parser.parse_args()
 60 | 
 61 |     # Change this to specify GPU
 62 |     os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
 63 | 
 64 |     # Set seeds
 65 |     setup_seed(args.seed)
 66 | 
 67 |     # assign device
 68 |     args.cuda = torch.cuda.is_available()
 69 |     print('Cuda device available: ', args.cuda)
 70 |     args.device = torch.device("cuda" if args.cuda else "cpu")
 71 | 
 72 |     return args
 73 | 
 74 | 
 75 | def adjust_learning_rate(args, optimizer, epoch_num):
 76 |     lr = args.lr * (args.lr_decay ** (epoch_num // args.interval))
 77 |     for param_group in optimizer.param_groups:
 78 |         param_group['lr'] = lr
 79 | 
 80 | 
 81 | def pad(x, max_len=64000):
 82 |     x_len = x.shape[0]
 83 |     if x_len >= max_len:
 84 |         return x[:max_len]
 85 |     # need to pad
 86 |     num_repeats = (max_len / x_len) + 1
 87 |     x_repeat = np.repeat(x, num_repeats)
 88 |     padded_x = x_repeat[:max_len]
 89 |     return padded_x
 90 | 
 91 | 
 92 | def split_dataset_to_train_and_val(k_fold, train_set, batch_size):
 93 |     for fold, (train_ids, test_ids) in enumerate(k_fold.split(train_set)):
 94 |         # Sample elements randomly from a given list of ids, no replacement.
 95 |         train_sub_sampler = torch.utils.data.SubsetRandomSampler(train_ids)
 96 |         test_sub_sampler = torch.utils.data.SubsetRandomSampler(test_ids)
 97 | 
 98 |         # Define data loaders for training and testing data in this fold
 99 |         train_loader_part = torch.utils.data.DataLoader(
100 |             train_set,
101 |             batch_size=batch_size, sampler=train_sub_sampler)
102 |         validation_loader_part = torch.utils.data.DataLoader(
103 |             train_set,
104 |             batch_size=batch_size, sampler=test_sub_sampler)
105 |         break
106 |     return train_loader_part, validation_loader_part
107 | 
108 | 
def _format_elapsed(elapsed_seconds):
    """Format a duration given in seconds as ``HH:MM:SS.ss``."""
    hours, rem = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(rem, 60)
    return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)


def train(parser, device):
    """Train the model and validate it after every epoch.

    The training set is split once with a shuffled 5-fold ``KFold``; the first
    fold provides the train / validation loaders. After each epoch the
    running losses are appended to ``./log/train_loss.log`` and
    ``./log/dev_loss.log``, the validation EER is printed and the weights are
    saved to ``./models/model_<epoch + 1>.pt``.

    :param parser: argument parser whose options were already registered;
        it is parsed again here
    :param device: torch device the batches are moved to
    """
    print(f'{Color.OKGREEN}Loading  train dataset...{Color.ENDC}')
    args = parser.parse_args()
    model = Model(input_channels=1, num_classes=256, device=device)

    transforms = torchvision.transforms.Compose([
        pad,
        librosa.util.normalize,
        Tensor,
    ])

    k_fold = KFold(n_splits=5, shuffle=True)

    if args.model_path:
        # map_location lets a checkpoint written on a GPU be restored on a
        # host where that GPU is not available.
        model.load_state_dict(torch.load(args.model_path, map_location=device))
        print('Model loaded : {}'.format(args.model_path))

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 betas=(args.beta_1, args.beta_2), eps=args.eps, weight_decay=0.0005)

    train_set = dataset_loader.ASVDataset(is_train=True, transform=transforms)

    monitor_loss = 'loss'

    print(f'{Color.ENDC}Train Start...')

    train_loader, validation_loader = split_dataset_to_train_and_val(k_fold, train_set, batch_size=args.batch_size)
    model.train()

    for epoch in range(args.epoch, args.num_epochs):
        start = time.time()

        print(f'{Color.OKBLUE}Epoch:{epoch}{Color.ENDC}')
        model.train()
        train_loss_dict = defaultdict(list)
        dev_loss_dict = defaultdict(list)

        adjust_learning_rate(args, optimizer, epoch)

        for batch_x, batch_y, batch_meta in train_loader:
            batch_x = batch_x.to(device)
            labels = batch_y.view(-1).type(torch.int64).to(device)

            loss, score = model(batch_x, labels)
            train_loss_dict[monitor_loss].append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # One line per batch: epoch and running mean loss of the epoch.
            with open(os.path.join('./log/', 'train_loss.log'), 'a') as log:
                log.write(str(epoch) + "\t" +
                          str(np.nanmean(train_loss_dict[monitor_loss])) + "\n")

        print(_format_elapsed(time.time() - start))
        print('start validation phase...')

        # Val the model
        model.eval()
        with torch.no_grad():
            idx_loader, score_loader = [], []
            for batch_x, batch_y, batch_meta in validation_loader:
                # Prepare the batch exactly as in training; previously the
                # inputs stayed on the CPU and the labels kept their raw
                # shape and dtype.
                batch_x = batch_x.to(device)
                labels = batch_y.view(-1).type(torch.int64).to(device)
                loss, score = model(batch_x, labels, False)

                dev_loss_dict[monitor_loss].append(loss.item())
                idx_loader.append(labels)
                score_loader.append(score)

            scores = torch.cat(score_loader, 0).data.cpu().numpy()
            labels = torch.cat(idx_loader, 0).data.cpu().numpy()
            # The EER is evaluated for both score polarities and the lower
            # value is reported.
            val_eer = em.compute_eer(scores[labels == 0], scores[labels == 1])[0]
            other_val_eer = em.compute_eer(-scores[labels == 0], -scores[labels == 1])[0]
            val_eer = min(val_eer, other_val_eer)

            with open(os.path.join('./log/', "dev_loss.log"), "a") as log:
                log.write(str(epoch) + "\t" + str(
                    np.nanmean(dev_loss_dict[monitor_loss])) + "\t" + str(
                    val_eer) + "\n")
            print("Val EER: {}".format(val_eer))

        torch.save(model.state_dict(), os.path.join('./models/', 'model_%d.pt' % (epoch + 1)))
        # Total epoch time, validation included.
        print(_format_elapsed(time.time() - start))
203 | 
204 | 
def main():
    """Entry point: parse the options, configure the run and start training."""
    parser = argparse.ArgumentParser('ASVSpoof2021')
    # add_parser() exports CUDA_VISIBLE_DEVICES from --gpu and only then
    # selects the device. Probing CUDA before that call, as was done
    # previously, can fix the set of visible GPUs too early and makes the
    # --gpu option ineffective.
    args = add_parser(parser)
    train(parser, args.device)
210 | 
211 | 
# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
214 | 


--------------------------------------------------------------------------------
/evaluation_metrics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import sys
  3 | 
  4 | def obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold):
  5 | 
  6 |     # False alarm and miss rates for ASV
  7 |     Pfa_asv = sum(non_asv >= asv_threshold) / non_asv.size
  8 |     Pmiss_asv = sum(tar_asv < asv_threshold) / tar_asv.size
  9 | 
 10 |     # Rate of rejecting spoofs in ASV
 11 |     if spoof_asv.size == 0:
 12 |         Pmiss_spoof_asv = None
 13 |         Pfa_spoof_asv = None
 14 |     else:
 15 |         Pmiss_spoof_asv = np.sum(spoof_asv < asv_threshold) / spoof_asv.size
 16 |         Pfa_spoof_asv = np.sum(spoof_asv >= asv_threshold) / spoof_asv.size
 17 | 
 18 |     return Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, Pfa_spoof_asv
 19 | 
 20 | 
 21 | def compute_det_curve(target_scores, nontarget_scores):
 22 | 
 23 |     n_scores = target_scores.size + nontarget_scores.size
 24 |     all_scores = np.concatenate((target_scores, nontarget_scores))
 25 |     labels = np.concatenate((np.ones(target_scores.size), np.zeros(nontarget_scores.size)))
 26 | 
 27 |     # Sort labels based on scores
 28 |     indices = np.argsort(all_scores, kind='mergesort')
 29 |     labels = labels[indices]
 30 | 
 31 |     # Compute false rejection and false acceptance rates
 32 |     tar_trial_sums = np.cumsum(labels)
 33 |     nontarget_trial_sums = nontarget_scores.size - (np.arange(1, n_scores + 1) - tar_trial_sums)
 34 | 
 35 |     frr = np.concatenate((np.atleast_1d(0), tar_trial_sums / target_scores.size))  # false rejection rates
 36 |     far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size))  # false acceptance rates
 37 |     thresholds = np.concatenate((np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices]))  # Thresholds are the sorted scores
 38 | 
 39 |     return frr, far, thresholds
 40 | 
 41 | 
 42 | def compute_eer(target_scores, nontarget_scores):
 43 |     """ Returns equal error rate (EER) and the corresponding threshold. """
 44 |     frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
 45 |     abs_diffs = np.abs(frr - far)
 46 |     min_index = np.argmin(abs_diffs)
 47 |     eer = np.mean((frr[min_index], far[min_index]))
 48 |     return eer, thresholds[min_index]
 49 | 
 50 | 
 51 | def compute_tDCF(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pfa_spoof_asv, cost_model, print_cost):
 52 |     """
 53 |     Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system.
 54 |     In brief, t-DCF returns a detection cost of a cascaded system of this form,
 55 | 
 56 |       Speech waveform -> [CM] -> [ASV] -> decision
 57 | 
 58 |     where CM stands for countermeasure and ASV for automatic speaker
 59 |     verification. The CM is therefore used as a 'gate' to decided whether or
 60 |     not the input speech sample should be passed onwards to the ASV system.
 61 |     Generally, both CM and ASV can do detection errors. Not all those errors
 62 |     are necessarily equally cost, and not all types of users are necessarily
 63 |     equally likely. The tandem t-DCF gives a principled with to compare
 64 |     different spoofing countermeasures under a detection cost function
 65 |     framework that takes that information into account.
 66 | 
 67 |     INPUTS:
 68 | 
 69 |       bonafide_score_cm   A vector of POSITIVE CLASS (bona fide or human)
 70 |                           detection scores obtained by executing a spoofing
 71 |                           countermeasure (CM) on some positive evaluation trials.
 72 |                           trial represents a bona fide case.
 73 |       spoof_score_cm      A vector of NEGATIVE CLASS (spoofing attack)
 74 |                           detection scores obtained by executing a spoofing
 75 |                           CM on some negative evaluation trials.
 76 |       Pfa_asv             False alarm (false acceptance) rate of the ASV
 77 |                           system that is evaluated in tandem with the CM.
 78 |                           Assumed to be in fractions, not percentages.
 79 |       Pmiss_asv           Miss (false rejection) rate of the ASV system that
 80 |                           is evaluated in tandem with the spoofing CM.
 81 |                           Assumed to be in fractions, not percentages.
 82 |       Pmiss_spoof_asv     Miss rate of spoof samples of the ASV system that
 83 |                           is evaluated in tandem with the spoofing CM. That
 84 |                           is, the fraction of spoof samples that were
 85 |                           rejected by the ASV system.
 86 |       cost_model          A struct that contains the parameters of t-DCF,
 87 |                           with the following fields.
 88 | 
 89 |                           Ptar        Prior probability of target speaker.
 90 |                           Pnon        Prior probability of nontarget speaker (zero-effort impostor)
 91 |                           Psoof       Prior probability of spoofing attack.
 92 |                           Cmiss       Cost of tandem system falsely rejecting target speaker.
 93 |                           Cfa         Cost of tandem system falsely accepting nontarget speaker.
 94 |                           Cfa_spoof   Cost of tandem system falsely accepting spoof.
 95 | 
 96 |       print_cost          Print a summary of the cost parameters and the
 97 |                           implied t-DCF cost function?
 98 | 
 99 |     OUTPUTS:
100 | 
101 |       tDCF_norm           Normalized t-DCF curve across the different CM
102 |                           system operating points; see [2] for more details.
103 |                           Normalized t-DCF > 1 indicates a useless
104 |                           countermeasure (as the tandem system would do
105 |                           better without it). min(tDCF_norm) will be the
106 |                           minimum t-DCF used in ASVspoof 2019 [2].
107 |       CM_thresholds       Vector of same size as tDCF_norm corresponding to
108 |                           the CM threshold (operating point).
109 | 
110 |     NOTE:
111 |     o     In relative terms, higher detection scores values are assumed to
112 |           indicate stronger support for the bona fide hypothesis.
113 |     o     You should provide real-valued soft scores, NOT hard decisions. The
114 |           recommendation is that the scores are log-likelihood ratios (LLRs)
115 |           from a bonafide-vs-spoof hypothesis based on some statistical model.
116 |           This, however, is NOT required. The scores can have arbitrary range
117 |           and scaling.
118 |     o     Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages.
119 | 
120 |     References:
121 | 
122 |       [1] T. Kinnunen, H. Delgado, N. Evans,K.-A. Lee, V. Vestman, 
123 |           A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi, 
124 |           and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures
125 |           and Automatic Speaker Verification: Fundamentals," IEEE/ACM Transaction on
126 |           Audio, Speech and Language Processing (TASLP).
127 | 
128 |       [2] ASVspoof 2019 challenge evaluation plan
129 |           https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
130 |     """
131 | 
132 | 
133 |     # Sanity check of cost parameters
134 |     if cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0 or \
135 |             cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0:
136 |         print('WARNING: Usually the cost values should be positive!')
137 | 
138 |     if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \
139 |             np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10:
140 |         sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.')
141 | 
142 |     # Unless we evaluate worst-case model, we need to have some spoof tests against asv
143 |     if Pfa_spoof_asv is None:
144 |         sys.exit('ERROR: you should provide false alarm rate of spoof tests against your ASV system.')
145 | 
146 |     # Sanity check of scores
147 |     combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
148 |     if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
149 |         sys.exit('ERROR: Your scores contain nan or inf.')
150 | 
151 |     # Sanity check that inputs are scores and not decisions
152 |     n_uniq = np.unique(combined_scores).size
153 |     if n_uniq < 3:
154 |         sys.exit('ERROR: You should provide soft CM scores - not binary decisions')
155 | 
156 |     # Obtain miss and false alarm rates of CM
157 |     Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm)
158 | 
159 |     # Constants - see ASVspoof 2019 evaluation plan
160 | 
161 |     C0 = cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon']*cost_model['Cfa']*Pfa_asv
162 |     C1 = cost_model['Ptar'] * cost_model['Cmiss'] - (cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon'] * cost_model['Cfa'] * Pfa_asv)
163 |     C2 = cost_model['Pspoof'] * cost_model['Cfa_spoof'] * Pfa_spoof_asv;
164 | 
165 | 
166 |     # Sanity check of the weights
167 |     if C0 < 0 or C1 < 0 or C2 < 0:
168 |         sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?')
169 | 
170 |     # Obtain t-DCF curve for all thresholds
171 |     tDCF = C0 + C1 * Pmiss_cm + C2 * Pfa_cm
172 | 
173 |     # Obtain default t-DCF
174 |     tDCF_default = C0 + np.minimum(C1, C2)
175 | 
176 |     # Normalized t-DCF
177 |     tDCF_norm = tDCF / tDCF_default
178 | 
179 |     # Everything should be fine if reaching here.
180 |     if print_cost:
181 | 
182 |         print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size))
183 |         print('t-DCF MODEL')
184 |         print('   Ptar         = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar']))
185 |         print('   Pnon         = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon']))
186 |         print('   Pspoof       = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof']))
187 |         print('   Cfa          = {:8.5f} (Cost of tandem system falsely accepting a nontarget)'.format(cost_model['Cfa']))
188 |         print('   Cmiss        = {:8.5f} (Cost of tandem system falsely rejecting target speaker)'.format(cost_model['Cmiss']))
189 |         print('   Cfa_spoof    = {:8.5f} (Cost of tandem sysmte falsely accepting spoof)'.format(cost_model['Cfa_spoof']))
190 |         print('\n   Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), t_CM=CM threshold)')
191 |         print('   tDCF_norm(t_CM) = {:8.5f} + {:8.5f} x Pmiss_cm(t_CM) + {:8.5f} x Pfa_cm(t_CM)\n'.format(C0/tDCF_default, C1/tDCF_default, C2/tDCF_default))
192 |         print('     * The optimum value is given by the first term (0.06273). This is the normalized t-DCF obtained with an error-free CM system.')
193 |         print('     * The minimum normalized cost (minimum over all possible thresholds) is always <= 1.00.')
194 |         print('')
195 | 
196 |     return tDCF_norm, CM_thresholds
197 | 
def compute_tDCF_legacy(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, print_cost):
    """
    Legacy Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system.

    The t-DCF is the detection cost of a cascaded system of the form

      Speech waveform -> [CM] -> [ASV] -> decision

    where CM is a spoofing countermeasure and ASV an automatic speaker
    verification system. The CM acts as a gate that decides whether a speech
    sample is passed on to the ASV system. Both stages can make detection
    errors; the errors are not necessarily equally costly and the user types
    are not necessarily equally likely. The t-DCF provides a principled way
    to compare countermeasures while taking this information into account.

    INPUTS:

      bonafide_score_cm   Vector of POSITIVE CLASS (bona fide / human) CM
                          detection scores, one per bona fide trial.
      spoof_score_cm      Vector of NEGATIVE CLASS (spoofing attack) CM
                          detection scores, one per spoof trial.
      Pfa_asv             False alarm (false acceptance) rate of the ASV
                          system, as a fraction (not a percentage).
      Pmiss_asv           Miss (false rejection) rate of the ASV system,
                          as a fraction (not a percentage).
      Pmiss_spoof_asv     Fraction of spoof samples rejected by the ASV
                          system. Must not be None.
      cost_model          Mapping holding the t-DCF parameters:

                          Ptar        Prior probability of target speaker.
                          Pnon        Prior probability of nontarget speaker
                                      (zero-effort impostor).
                          Pspoof      Prior probability of spoofing attack.
                          Cmiss_asv   Cost of ASV falsely rejecting target.
                          Cfa_asv     Cost of ASV falsely accepting nontarget.
                          Cmiss_cm    Cost of CM falsely rejecting target.
                          Cfa_cm      Cost of CM falsely accepting spoof.

      print_cost          If true, print a summary of the cost parameters
                          and of the implied normalized t-DCF function.

    OUTPUTS:

      tDCF_norm           Normalized t-DCF curve over the CM operating
                          points; see [2]. A value > 1 indicates a useless
                          countermeasure (the tandem system would do better
                          without it). min(tDCF_norm) is the minimum t-DCF
                          used in ASVspoof 2019 [2].
      CM_thresholds       CM thresholds (operating points) as returned by
                          compute_det_curve, aligned with tDCF_norm.

    NOTE:
    o     Higher detection scores are assumed to indicate stronger support
          for the bona fide hypothesis.
    o     Provide real-valued soft scores, NOT hard decisions. Log-likelihood
          ratios from a bonafide-vs-spoof model are recommended but not
          required; range and scaling of the scores are arbitrary.
    o     The process is terminated through sys.exit() when the priors are
          negative or do not sum to one, when Pmiss_spoof_asv is None, when
          the scores contain nan/inf, when fewer than three distinct score
          values are given, or when a t-DCF weight turns out negative.
          Negative costs only trigger a printed warning.

    References:

      [1] T. Kinnunen, K.-A. Lee, H. Delgado, N. Evans, M. Todisco,
          M. Sahidullah, J. Yamagishi, D.A. Reynolds: "t-DCF: a Detection
          Cost Function for the Tandem Assessment of Spoofing Countermeasures
          and Automatic Speaker Verification", Proc. Odyssey 2018: the
          Speaker and Language Recognition Workshop, pp. 312--319, Les Sables d'Olonne,
          France, June 2018 (https://www.isca-speech.org/archive/Odyssey_2018/pdfs/68.pdf)

      [2] ASVspoof 2019 challenge evaluation plan
          https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
    """

    # --- Cost parameters: a negative cost is suspicious but not fatal ---
    cost_keys = ('Cfa_asv', 'Cmiss_asv', 'Cfa_cm', 'Cmiss_cm')
    if any(cost_model[key] < 0 for key in cost_keys):
        print('WARNING: Usually the cost values should be positive!')

    # --- Priors: each one non-negative, and together summing to one ---
    prior_keys = ('Ptar', 'Pnon', 'Pspoof')
    priors_ok = not any(cost_model[key] < 0 for key in prior_keys)
    if priors_ok:
        p_tar, p_non, p_spoof = (cost_model[key] for key in prior_keys)
        priors_ok = not (np.abs(p_tar + p_non + p_spoof - 1) > 1e-10)
    if not priors_ok:
        sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.')

    # --- The legacy cost needs the ASV miss rate measured on spoof trials ---
    if Pmiss_spoof_asv is None:
        sys.exit('ERROR: you should provide miss rate of spoof tests against your ASV system.')

    # --- Scores: every value has to be finite ---
    all_scores = np.concatenate([bonafide_score_cm, spoof_score_cm])
    if not np.isfinite(all_scores).all():
        sys.exit('ERROR: Your scores contain nan or inf.')

    # --- Scores: fewer than three distinct values look like hard decisions ---
    if np.unique(all_scores).size < 3:
        sys.exit('ERROR: You should provide soft CM scores - not binary decisions')

    # Miss / false-alarm rates of the CM together with the matching thresholds
    cm_miss_rates, cm_fa_rates, thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm)

    # Weights of the CM miss and false-alarm rates - see ASVspoof 2019 evaluation plan
    target_term = p_tar * (cost_model['Cmiss_cm'] - cost_model['Cmiss_asv'] * Pmiss_asv)
    nontarget_term = p_non * cost_model['Cfa_asv'] * Pfa_asv
    miss_weight = target_term - nontarget_term
    fa_weight = cost_model['Cfa_cm'] * p_spoof * (1 - Pmiss_spoof_asv)

    # Negative weights make the cost meaningless
    if miss_weight < 0 or fa_weight < 0:
        sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?')

    # t-DCF at every threshold, normalized by the smaller of the two weights
    smaller_weight = np.minimum(miss_weight, fa_weight)
    tDCF_norm = (miss_weight * cm_miss_rates + fa_weight * cm_fa_rates) / smaller_weight

    # Optional human-readable report of the cost model in use
    if print_cost:

        print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size))
        print('t-DCF MODEL')

        parameter_rows = (
            ('   Ptar         = {:8.5f} (Prior probability of target user)', 'Ptar'),
            ('   Pnon         = {:8.5f} (Prior probability of nontarget user)', 'Pnon'),
            ('   Pspoof       = {:8.5f} (Prior probability of spoofing attack)', 'Pspoof'),
            ('   Cfa_asv      = {:8.5f} (Cost of ASV falsely accepting a nontarget)', 'Cfa_asv'),
            ('   Cmiss_asv    = {:8.5f} (Cost of ASV falsely rejecting target speaker)', 'Cmiss_asv'),
            ('   Cfa_cm       = {:8.5f} (Cost of CM falsely passing a spoof to ASV system)', 'Cfa_cm'),
            ('   Cmiss_cm     = {:8.5f} (Cost of CM falsely blocking target utterance which never reaches ASV)', 'Cmiss_cm'),
        )
        for template, key in parameter_rows:
            print(template.format(cost_model[key]))

        print('\n   Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), s=CM threshold)')

        # The term whose weight was used for normalization has coefficient one
        if fa_weight == smaller_weight:
            print('   tDCF_norm(s) = {:8.5f} x Pmiss_cm(s) + Pfa_cm(s)\n'.format(miss_weight / fa_weight))
        else:
            print('   tDCF_norm(s) = Pmiss_cm(s) + {:8.5f} x Pfa_cm(s)\n'.format(fa_weight / miss_weight))

    return tDCF_norm, thresholds
343 | 


--------------------------------------------------------------------------------