├── .gitignore
├── README.md
├── asvtorch
│   ├── backend
│   │   ├── plda.py
│   │   └── vector_processing.py
│   ├── evaluation
│   │   ├── eval_metrics.py
│   │   ├── parameters.py
│   │   └── trials.py
│   ├── global_setup.py
│   ├── ivector
│   │   ├── featureloader.py
│   │   ├── gmm.py
│   │   ├── ivector_extractor.py
│   │   ├── posteriors.py
│   │   ├── settings.py
│   │   └── statloader.py
│   ├── kaldidata
│   │   ├── kaldifeatloaders.py
│   │   ├── posterior_io.py
│   │   └── utils.py
│   └── misc
│       └── misc.py
├── config.py
├── environment.yml
├── kaldi
│   └── egs
│       └── voxceleb
│           └── v1
│               └── extract_feats_and_train_ubm.sh
└── run_voxceleb_ivector.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | __pycache__/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #### GPU-accelerated PyTorch implementation of frame posterior computation and i-vector extractor training.
2 | Kaldi is required for MFCC extraction and UBM training.
3 | 
4 | #### Steps to run the example script with VoxCeleb data:
5 | - Move **kaldi/egs/voxceleb/v1/extract_feats_and_train_ubm.sh** to the corresponding folder in your Kaldi installation.
6 | - In **extract_feats_and_train_ubm.sh**, update **output_dir**, **voxceleb1_root**, and **voxceleb2_root**.
7 | - If you are using a newer version of VoxCeleb1 (1.1), you might have to modify **kaldi/egs/voxceleb/v1/local/make_voxceleb1.pl**, as the data organization differs from the original VoxCeleb release.
8 | - Run **extract_feats_and_train_ubm.sh**.
9 | - Update **DATA_FOLDER** in **run_voxceleb_ivector.py**.
10 | - Install and activate a compatible conda environment:
11 |     - **environment.yml** has all the needed packages
12 |     - Main requirements: Python (>3.6), PyTorch (>1.1), NumPy, SciPy, PyKaldi
13 | - Run **run_voxceleb_ivector.py**.
14 | 
15 | 
16 | For more details:
17 | http://dx.doi.org/10.21437/Interspeech.2019-1955
18 | 
19 | ```
20 | @inproceedings{Vestman2019,
21 |   author={Ville Vestman and Kong Aik Lee and Tomi H. Kinnunen and Takafumi Koshinaka},
22 |   title={{Unleashing the Unused Potential of i-Vectors Enabled by GPU Acceleration}},
23 |   year=2019,
24 |   booktitle={Proc.
Interspeech 2019}, 25 | pages={351--355}, 26 | doi={10.21437/Interspeech.2019-1955}, 27 | url={http://dx.doi.org/10.21437/Interspeech.2019-1955} 28 | } 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /asvtorch/backend/plda.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | 4 | import numpy as np 5 | import torch 6 | from scipy.linalg import inv, svd 7 | 8 | # Based on the PLDA in LRE 2017 baseline 9 | 10 | class Plda: 11 | def __init__(self, St, Sb): 12 | self.St = St 13 | self.Sb = Sb 14 | self.plda_dim = 0 15 | self.l = None 16 | self.uk = None 17 | self.qhat = None 18 | 19 | @classmethod 20 | def train_closed_form(cls, data, speaker_labels, device): 21 | print('Training PLDA...') 22 | data = data.to(device) 23 | data, class_boundaries = _rearrange_data(data, speaker_labels) 24 | print('Computing within class covariance...') 25 | Sw = _compute_within_cov(data, class_boundaries) 26 | print('Computing data covariance...') 27 | St = _compute_cov(data) 28 | Sb = St - Sw 29 | print('PLDA trained!...') 30 | return Plda(St, Sb) 31 | 32 | @classmethod 33 | def train_em(cls, data, speaker_labels, plda_dim, iterations, device): 34 | print('Initializing simplified PLDA...') 35 | data = data.to(device) 36 | n_total_sessions, data_dim = data.size() 37 | F = torch.randn(data_dim, plda_dim, device=device) 38 | F = _orthogonalize_columns(F) 39 | S = 1000 * torch.randn(data_dim, data_dim, device=device) 40 | data_covariance = torch.matmul(data.t(), data) 41 | data_list, count_list = _arrange_data_by_counts(data, speaker_labels) 42 | eye_matrix = torch.eye(plda_dim, device=device) 43 | 44 | for iteration in range(1, iterations+1): 45 | print('Iteration {}...'.format(iteration), end='') 46 | iter_start_time = time.time() 47 | 48 | FS = torch.solve(F, S.t())[0].t() 49 | FSF = torch.matmul(FS, F) 50 | 51 | dataEh = torch.zeros(data_dim, plda_dim, device=device) 52 | Ehh = torch.zeros(plda_dim, plda_dim, device=device) 53 | #print(count_list) 54 | for count_data, count in zip(data_list, count_list): 55 | Sigma = torch.inverse(eye_matrix + count * FSF) 56 | my = torch.chain_matmul(Sigma, FS.repeat(1, count), count_data.view(-1, data_dim * count).t()) 57 | #print(torch.norm(my[:, 0])) 58 | dataEh += torch.matmul(count_data.t(), my.repeat(count, 1).t().reshape(count_data.size()[0], -1)) 59 | Ehh += count * (my.size()[1] * Sigma + torch.matmul(my, my.t())) 60 | 61 | F = torch.solve(dataEh.t(), Ehh.t())[0].t() 62 | S = (data_covariance - torch.chain_matmul(F, Ehh, F.t())) / n_total_sessions 63 | 64 | Sb = torch.matmul(F, F.t()) 65 | St = Sb + S 66 | 67 | print(' [elapsed time = {:0.1f} s]'.format(time.time() - iter_start_time)) 68 | yield Plda(St, Sb) 69 | 70 | def _compute_scoring_matrices(self, plda_dim): 71 | if self.plda_dim != plda_dim: 72 | self.plda_dim = plda_dim 73 | iSt = torch.inverse(self.St) 74 | iS = torch.inverse(self.St - torch.chain_matmul(self.Sb, iSt, self.Sb)) 75 | Q = iSt - iS 76 | P = torch.chain_matmul(iSt, self.Sb, iS) 77 | U, s = torch.svd(P)[:2] 78 | self.l = s[:plda_dim] 79 | self.uk = U[:, :plda_dim] 80 | self.qhat = torch.chain_matmul(self.uk.t(), Q, self.uk) 81 | 82 | def score_trials(self, model_iv, test_iv, plda_dim): 83 | self._compute_scoring_matrices(plda_dim) 84 | model_iv = model_iv.to(self.uk.device) 85 | test_iv = test_iv.to(self.uk.device) 86 | model_iv = torch.matmul(model_iv, self.uk) 87 | test_iv = torch.matmul(test_iv, self.uk) 
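        # The verification score below is the PLDA log-likelihood ratio in the reduced
        # space: one quadratic term per side (through self.qhat) plus a cross term
        # weighted by the diagonal eigenvalue matrix self.l: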
88 |         score_h1 = torch.sum(torch.matmul(model_iv, self.qhat) * model_iv, 1)
89 |         score_h2 = torch.sum(torch.matmul(test_iv, self.qhat) * test_iv, 1)
90 |         score_h1h2 = 2 * torch.sum(model_iv * self.l * test_iv, 1)
91 |         scores = score_h1h2 + score_h1 + score_h2
92 |         return scores.cpu().numpy()
93 | 
94 |     def compress(self, vectors, plda_dim):
95 |         self._compute_scoring_matrices(plda_dim)
96 |         return torch.matmul(vectors, self.uk.to(vectors.device))
97 | 
98 |     def save(self, filename):
99 |         print('Saving PLDA to file {}'.format(filename))
100 |         np.savez(filename, St=self.St.cpu().numpy(), Sb=self.Sb.cpu().numpy())
101 | 
102 |     @classmethod
103 |     def load(cls, filename, device):
104 |         print('Loading PLDA from file {}'.format(filename))
105 |         holder = np.load(filename)
106 |         St, Sb = holder['St'], holder['Sb']
107 |         return Plda(torch.from_numpy(St).to(device), torch.from_numpy(Sb).to(device))
108 | 
109 | 
110 | def _compute_cov(data):
111 |     data -= torch.mean(data, dim=0)
112 |     cov = torch.matmul(data.t(), data) / (data.size()[0] - 1)
113 |     return cov
114 | 
115 | def _compute_within_cov(data, class_boundaries):
116 |     data = data.clone()
117 |     for start, end in zip(class_boundaries[:-1], class_boundaries[1:]):
118 |         data[start:end, :] -= data[start:end, :].mean(dim=0)
119 |     return _compute_cov(data)
120 | 
121 | def _rearrange_data(data, speaker_labels):
122 |     print('Rearranging data for PLDA training...')
123 |     index_dict = defaultdict(list)
124 |     for index, label in enumerate(speaker_labels):
125 |         index_dict[label].append(index)
126 |     new_data = torch.zeros(*data.size())
127 |     class_boundaries = [0]
128 |     counter = 0
129 |     for key in index_dict:
130 |         indices = index_dict[key]
131 |         new_data[counter:counter + len(indices), :] = data[indices, :]
132 |         counter += len(indices)
133 |         class_boundaries.append(counter)
134 |     return new_data, class_boundaries
135 | 
136 | def _orthogonalize_columns(matrix):
137 |     matrix -= torch.mean(matrix, 1).unsqueeze(1)
138 |     D, V = torch.svd(matrix)[1:]
139 |     W = torch.matmul(V, torch.diag((1./(torch.sqrt(D) + 1e-10))))
140 |     return torch.matmul(matrix, W)
141 | 
142 | def _arrange_data_by_counts(data, labels):
143 |     spk2indices = defaultdict(list)
144 |     for index, label in enumerate(labels):
145 |         spk2indices[label].append(index)
146 | 
147 |     count2spks = defaultdict(list)
148 |     for spk in spk2indices:
149 |         count2spks[len(spk2indices[spk])].append(spk)
150 | 
151 |     data_list = []
152 |     count_list = []
153 |     for count in count2spks:
154 |         count_list.append(count)
155 |         count_indices = []
156 |         for spk in count2spks[count]:
157 |             count_indices.extend(spk2indices[spk])
158 |         data_list.append(data[count_indices, :])
159 | 
160 |     return data_list, count_list
--------------------------------------------------------------------------------
/asvtorch/backend/vector_processing.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import torch
4 | 
5 | class VectorProcessor:
6 | 
7 |     def __init__(self, centering_vectors, whitening_matrices, processing_instruction):
8 |         self.centering_vectors = centering_vectors
9 |         self.whitening_matrices = whitening_matrices
10 |         self.processing_instruction = processing_instruction
11 | 
12 |     @classmethod
13 |     def train(cls, vectors, processing_instruction, device):
14 |         """Trains centering and whitening transforms for vector post-processing.
15 | 
16 |         Arguments:
17 |             vectors {Tensor} -- Training vectors (one vector per row).
18 |             processing_instruction {String} -- [Contains characters 'c', 'w', 'l'. 
For example 'cwlc' performs centering, whitening, length normalization, and centering (2nd time) in this order.] 19 | """ 20 | print('Training vector processor ...') 21 | 22 | c_count = processing_instruction.count('c') 23 | w_count = processing_instruction.count('w') 24 | 25 | vec_size = vectors.size()[1] 26 | 27 | whitening_matrices = torch.zeros(w_count, vec_size, vec_size, device=device) 28 | centering_vectors = torch.zeros(c_count, vec_size, device=device) 29 | 30 | vectors = vectors.to(device) 31 | 32 | c_count = 0 33 | w_count = 0 34 | for c in processing_instruction: 35 | if c == 'c': 36 | print('Centering...') 37 | centering_vectors[c_count, :] = torch.mean(vectors, dim=0) 38 | vectors = vectors - centering_vectors[c_count, :] 39 | c_count += 1 40 | elif c == 'w': 41 | print('Whitening...') 42 | l, U = torch.symeig(torch.matmul(vectors.t(), vectors) / vectors.size()[0], eigenvectors=True) 43 | l = torch.clamp(l, min=1e-10) 44 | whitening_matrices[w_count, :, :] = torch.rsqrt(l) * U # transposed 45 | vectors = torch.matmul(vectors, whitening_matrices[w_count, :, :]) 46 | w_count += 1 47 | elif c == 'l': 48 | print('Normalizing length...') 49 | vectors = unit_len_norm(vectors) 50 | 51 | return VectorProcessor(centering_vectors, whitening_matrices, processing_instruction) 52 | 53 | def process(self, vectors): 54 | print('Processing {} vectors ...'.format(vectors.size()[0])) 55 | vectors = vectors.to(self.centering_vectors.device) 56 | c_count = 0 57 | w_count = 0 58 | for c in self.processing_instruction: 59 | if c == 'c': 60 | print('Centering...') 61 | vectors = vectors - self.centering_vectors[c_count, :] 62 | c_count += 1 63 | elif c == 'w': 64 | print('Whitening...') 65 | vectors = torch.matmul(vectors, self.whitening_matrices[w_count, :, :]) 66 | w_count += 1 67 | elif c == 'l': 68 | print('Normalizing length...') 69 | vectors = unit_len_norm(vectors) 70 | return vectors 71 | 72 | def save(self, output_file): 73 | data = {'c': self.centering_vectors.cpu(), 'w': self.whitening_matrices.cpu(), 'i': self.processing_instruction} 74 | with open(output_file, 'wb') as outfile: 75 | pickle.dump(data, outfile, protocol=pickle.HIGHEST_PROTOCOL) 76 | print('VectorProcessor saved to {}'.format(output_file)) 77 | 78 | @classmethod 79 | def load(cls, input_file, device): 80 | with open(input_file, 'rb') as infile: 81 | data = pickle.load(infile) 82 | centering_vectors = data['c'].to(device) 83 | whitening_matrices = data['w'].to(device) 84 | processing_instruction = data['i'] 85 | print('VectorProcessor loaded from {}'.format(input_file)) 86 | return VectorProcessor(centering_vectors, whitening_matrices, processing_instruction) 87 | 88 | def unit_len_norm(data): 89 | data_norm = torch.sqrt(torch.sum(data ** 2, 1)) 90 | data_norm[data_norm == 0] = 1 91 | return data / data_norm.unsqueeze(1) 92 | -------------------------------------------------------------------------------- /asvtorch/evaluation/eval_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_det_curve(target_scores, nontarget_scores): 4 | n_scores = target_scores.size + nontarget_scores.size 5 | all_scores = np.concatenate((target_scores, nontarget_scores)) 6 | labels = np.concatenate((np.ones(target_scores.size), np.zeros(nontarget_scores.size))) 7 | 8 | # Sort labels based on scores 9 | indices = np.argsort(all_scores, kind='mergesort') 10 | labels = labels[indices] 11 | 12 | # Compute false rejection and false acceptance rates 13 | tar_trial_sums 
= np.cumsum(labels) 14 | nontarget_trial_sums = nontarget_scores.size - (np.arange(1, n_scores + 1) - tar_trial_sums) 15 | 16 | frr = np.concatenate((np.atleast_1d(0), tar_trial_sums / target_scores.size)) # false rejection rates 17 | far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size)) # false acceptance rates 18 | thresholds = np.concatenate((np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices])) # Thresholds are the sorted scores 19 | 20 | return frr, far, thresholds 21 | 22 | 23 | def compute_eer(target_scores, nontarget_scores): 24 | """ Returns equal error rate (EER) and the corresponding threshold. """ 25 | frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores) 26 | abs_diffs = np.abs(frr - far) 27 | min_index = np.argmin(abs_diffs) 28 | eer = np.mean((frr[min_index], far[min_index])) 29 | return eer, thresholds[min_index] 30 | 31 | def compute_min_dcf(target_scores, nontarget_scores, p_target, c_miss, c_fa): 32 | frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores) 33 | return _compute_min_dcf(frr, far, thresholds, p_target, c_miss, c_fa) 34 | 35 | # Obtained from KALDI Toolkit: 36 | def _compute_min_dcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa): 37 | min_c_det = float("inf") 38 | min_c_det_threshold = thresholds[0] 39 | for i in range(0, len(fnrs)): 40 | c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target) 41 | if c_det < min_c_det: 42 | min_c_det = c_det 43 | min_c_det_threshold = thresholds[i] 44 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 45 | min_dcf = min_c_det / c_def 46 | return min_dcf, min_c_det_threshold -------------------------------------------------------------------------------- /asvtorch/evaluation/parameters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import itertools 3 | 4 | class ParameterChanger(): 5 | def __init__(self, config_file, obj_dict): 6 | print('Initializing parameter changer...') 7 | 8 | self.configs = [] 9 | self._index = 0 10 | self._old_config = None 11 | self.obj2id = {v: k for k, v in obj_dict.items()} 12 | 13 | with open(config_file) as f: 14 | settings = [] 15 | for line in f: 16 | line = line.strip() 17 | if line.startswith('#'): 18 | continue 19 | if line: 20 | settings.append(line) 21 | elif settings: 22 | self._process_config_set(settings, obj_dict) 23 | settings = [] 24 | if settings: 25 | self._process_config_set(settings, obj_dict) 26 | 27 | def _process_config_set(self, settings, obj_dict): 28 | line_objects = [] 29 | line_attributes = [] 30 | value_lists = [] 31 | for setting in settings: 32 | setting = setting.strip() 33 | name, values = setting.split('=') 34 | name = name.strip() 35 | obj_id, attr = name.split('.') 36 | obj = obj_dict[obj_id] 37 | if _exists(obj, attr): 38 | line_objects.append(obj) 39 | line_attributes.append(attr) 40 | values = values.strip() 41 | ldict = {} 42 | exec('value_list = {}'.format(values), globals(), ldict) 43 | value_list = ldict['value_list'] 44 | if not isinstance(value_list, (list, tuple)): 45 | value_list = [value_list] 46 | value_lists.append(value_list) 47 | for value_combination in itertools.product(*value_lists): 48 | self.configs.append((line_objects, line_attributes, value_combination)) 49 | 50 | def next(self): 51 | # Reverting to the initial config: 52 | if self._old_config is not None: 53 | _set_attributes(self._old_config) 54 | 55 | # Getting new config: 56 | try: 57 | config = self.configs[self._index] 58 | 
except IndexError: 59 | self._index = 0 60 | self._old_config = None 61 | return False 62 | self._index += 1 63 | 64 | # Saving current config: 65 | self._old_config = _get_current_config(config[0], config[1]) 66 | # Setting new config: 67 | _set_attributes(config) 68 | 69 | print('Parameter values (non-default ones):\n{}'.format(self.get_current_string(compact=False))) 70 | return True 71 | 72 | def get_current_string(self, compact=True): 73 | config = _get_current_config(*self._old_config[0:2]) 74 | string = '' 75 | for obj, attr, value in zip(*config): 76 | if compact: 77 | string += '{}.{} = {}; '.format(self.obj2id[obj], attr, value) 78 | else: 79 | string += '{}.{} = {}\n'.format(self.obj2id[obj], attr, value) 80 | if compact: 81 | string = string[:-2] 82 | return string 83 | 84 | def get_value_string(self): 85 | config = _get_current_config(*self._old_config[0:2]) 86 | return ';'.join(str(value) for value in config[2]) 87 | 88 | def _get_current_config(objs, attrs): 89 | values = [] 90 | for obj, attr in zip(objs, attrs): 91 | values.append(getattr(obj, attr)) 92 | return (objs, attrs, values) 93 | 94 | def _set_attributes(config): 95 | for obj, attr, value in zip(*config): 96 | setattr(obj, attr, value) 97 | 98 | def _exists(obj, attr): 99 | if not hasattr(obj, attr): 100 | sys.exit('Config attribute does not exist: {}'.format(attr)) 101 | return True -------------------------------------------------------------------------------- /asvtorch/evaluation/trials.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def organize_trials(vectors, utt_ids, trial_file): 5 | trial_vector_dict = {} 6 | for index, segment in enumerate(utt_ids): 7 | trial_vector_dict[segment] = vectors[index, :] 8 | 9 | trials = [] 10 | with open(trial_file) as f: 11 | for line in f: 12 | parts = line.split() 13 | if parts[2].strip() == 'target': 14 | label = 1 15 | else: 16 | label = 0 17 | trials.append((parts[0], parts[1], label)) 18 | 19 | left_vectors = torch.zeros(len(trials), vectors.shape[1], device=vectors.device) 20 | right_vectors = torch.zeros(len(trials), vectors.shape[1], device=vectors.device) 21 | 22 | labels = [] 23 | for index, trial in enumerate(trials): 24 | left_vectors[index, :] = trial_vector_dict[trial[0]] 25 | right_vectors[index, :] = trial_vector_dict[trial[1]] 26 | labels.append(trial[2]) 27 | 28 | labels = np.asarray(labels, dtype=bool) 29 | 30 | return left_vectors, right_vectors, labels 31 | 32 | 33 | def organize_trials_in_chunks(vectors, utt_ids, trial_file, chunk_size): 34 | 35 | print('Preparing to iterate over trials...') 36 | 37 | trial_vector_dict = {} 38 | for index, segment in enumerate(utt_ids): 39 | trial_vector_dict[segment] = vectors[index, :] 40 | 41 | trials = [] 42 | with open(trial_file) as f: 43 | for line in f: 44 | parts = line.split() 45 | if parts[2].strip() == 'target': 46 | label = 1 47 | else: 48 | label = 0 49 | trials.append((parts[0], parts[1], label)) 50 | 51 | i = 0 52 | 53 | while i < len(trials): 54 | print('Iterated over {} trials'.format(i)) 55 | chunk_trials = trials[i:i+chunk_size] 56 | i += chunk_size 57 | 58 | left_vectors = torch.zeros(len(chunk_trials), vectors.shape[1], device=vectors.device) 59 | right_vectors = torch.zeros(len(chunk_trials), vectors.shape[1], device=vectors.device) 60 | 61 | labels = [] 62 | for index, trial in enumerate(chunk_trials): 63 | left_vectors[index, :] = trial_vector_dict[trial[0]] 64 | right_vectors[index, :] = 
trial_vector_dict[trial[1]] 65 | labels.append(trial[2]) 66 | 67 | labels = np.asarray(labels, dtype=bool) 68 | 69 | yield left_vectors, right_vectors, labels 70 | -------------------------------------------------------------------------------- /asvtorch/global_setup.py: -------------------------------------------------------------------------------- 1 | # Do not touch this file directly. Change the device from run_voxceleb_ivector.py. 2 | 3 | import torch 4 | 5 | device = torch.device("cpu") 6 | 7 | def set_gpu(device_id): 8 | if torch.cuda.is_available(): 9 | global device 10 | device = torch.device('cuda:{}'.format(device_id)) 11 | torch.backends.cudnn.benchmark = False 12 | print('Using GPU!') 13 | else: 14 | print('Cuda is not available!') 15 | -------------------------------------------------------------------------------- /asvtorch/ivector/featureloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils import data 4 | 5 | from asvtorch.kaldidata.utils import count_total_number_of_active_frames 6 | 7 | 8 | def _get_clip_indices(utt_start, utt_end, batch_start, batch_end): 9 | """ Cuts the parts of the utterance that do not fit into the batch window. 10 | 11 | Arguments: 12 | utt_start {int} -- start point of the utterance 13 | utt_end {int} -- end point of the utterance 14 | batch_start {int} -- start point of the batch window 15 | batch_end {int} -- end point of the batch window 16 | 17 | Returns: 18 | (int, int), bool -- a tuple containing clipped start and end point of an utterance, the boolean flag is True if the end of the utterance is inside the batch window. 19 | """ 20 | if utt_end <= batch_start: 21 | return None 22 | if utt_start >= batch_end: 23 | return None 24 | start = 0 25 | end = utt_end - utt_start 26 | if utt_start < batch_start: 27 | start = batch_start - utt_start 28 | if utt_end > batch_end: 29 | end = batch_end - utt_start 30 | if utt_end <= batch_end: 31 | ends = True 32 | else: 33 | ends = False 34 | return (start, end), ends 35 | 36 | class _Kaldi_Dataset(data.Dataset): 37 | def __init__(self, rxspecifiers, feature_loader, frames_per_batch): 38 | self.feat_rxspecifiers = rxspecifiers[0] 39 | self.vad_rxspecifiers = rxspecifiers[1] 40 | self.feature_loader = feature_loader 41 | 42 | n_active_frames, break_points = count_total_number_of_active_frames(self.vad_rxspecifiers) 43 | n_batches = int(np.ceil(n_active_frames / frames_per_batch)) 44 | 45 | utt_index = 0 46 | self.batches = [] 47 | 48 | for i in range(n_batches): 49 | batch_indices = [] 50 | batch_endpoints = [] 51 | window_start = i * frames_per_batch 52 | window_end = (i + 1) * frames_per_batch 53 | while utt_index < len(self.feat_rxspecifiers): 54 | clip_indices = _get_clip_indices(break_points[utt_index], break_points[utt_index + 1], window_start, window_end) 55 | utt_index += 1 56 | if clip_indices is not None: 57 | batch_indices.append((utt_index - 1, clip_indices[0])) 58 | if clip_indices[1]: 59 | batch_endpoints.append(break_points[utt_index]) 60 | else: 61 | break 62 | else: 63 | if batch_indices: 64 | break 65 | self.batches.append((batch_indices, np.asarray(batch_endpoints))) 66 | batch_indices = [] 67 | batch_endpoints = [] 68 | utt_index -= 1 69 | 70 | def __len__(self): 71 | return len(self.batches) 72 | 73 | def __getitem__(self, index): 74 | batch_indices, batch_endpoints = self.batches[index] 75 | frames = [] 76 | for utt_indices in batch_indices: 77 | utt_index, selection_indices = 
utt_indices
78 |             feats = self.feature_loader.load_features(self.feat_rxspecifiers[utt_index], self.vad_rxspecifiers[utt_index])
79 |             frames.append(feats[selection_indices[0]:selection_indices[1], :])
80 |         frames = torch.from_numpy(np.vstack(frames))
81 |         return frames, batch_endpoints
82 | 
83 | 
84 | def _collater(batch):
85 |     """ In this "hack", batches are already formed in the Dataset object (a batch consists of a single element, which is actually the whole batch).
86 |     """
87 |     return batch[0]
88 | 
89 | def get_feature_loader(rxspecifiers, feature_loader, batch_size, num_workers):
90 |     """Returns a DataLoader that is used to load features from multiple utterances using a fixed batch size given in (active speech) frames.
91 | 
92 |     Arguments:
93 |         rxspecifiers {(list, list)} -- Two lists in a tuple containing scp lines without utterance IDs for features and VAD labels, respectively.
94 |         feature_loader {KaldiFeatureLoader} -- Feature loader.
95 |         batch_size {int} -- Batch size in frames.
96 |         num_workers {int} -- Number of processes used for data loading.
97 | 
98 |     Returns:
99 |         DataLoader -- DataLoader for reading features.
100 |     """
101 |     dataset = _Kaldi_Dataset(rxspecifiers, feature_loader, batch_size)
102 |     return data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=_collater)
103 | 
--------------------------------------------------------------------------------
/asvtorch/ivector/gmm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import kaldi.util.io as kio
4 | from kaldi.gmm import FullGmm as KaldiFullGmm
5 | from kaldi.matrix import Matrix as KaldiMatrix
6 | 
7 | import asvtorch.global_setup as gs
8 | from asvtorch.misc.misc import ensure_npz
9 | from asvtorch.misc.misc import test_finiteness
10 | 
11 | class Gmm():
12 |     def __init__(self, means, covariances, weights, device=torch.device("cpu")):
13 |         self.means = means.to(device)
14 |         self.covariances = covariances.to(device)
15 |         self.weights = weights.to(device)
16 |         # Preparation for posterior computation:
17 |         const = torch.Tensor([-0.5 * self.means.size()[1] * np.log(2 * np.pi)]).to(self.means.device)
18 |         self._inv_covariances = torch.inverse(self.covariances)
19 |         self._component_constants = torch.zeros(self.weights.numel(), device=self.means.device)
20 |         for i in range(self.weights.numel()):
21 |             self._component_constants[i] = -0.5 * torch.logdet(self.covariances[i, :, :]) + const + torch.log(self.weights[i])
22 | 
23 |     def to_device(self, device):
24 |         return Gmm(self.means, self.covariances, self.weights, device=device)
25 | 
26 |     def compute_posteriors_top_select(self, frames, top_indices):
27 |         logprob = torch.zeros(top_indices.size(), device=self.means.device)
28 |         for i in range(self.weights.numel()):
29 |             indices_of_component = (top_indices == i)
30 |             frame_selection = torch.any(indices_of_component, 0)
31 |             post_index = torch.argmax(indices_of_component, 0)[frame_selection]
32 |             centered_frames = frames[frame_selection, :] - self.means[i, :]
33 |             logprob[post_index, frame_selection] = self._component_constants[i] - 0.5 * torch.sum(torch.mm(centered_frames, self._inv_covariances[i, :, :]) * centered_frames, 1)
34 |         llk = torch.logsumexp(logprob, dim=0)
35 |         return torch.exp(logprob - llk)
36 | 
37 |     def compute_posteriors(self, frames):
38 |         logprob = torch.zeros(self.weights.numel(), frames.size()[0], device=self.means.device)
39 |         for i in range(self.weights.numel()):
40 |             centered_frames = frames - self.means[i, :]
41 |             logprob[i, :] = self._component_constants[i] - 0.5 * 
torch.sum(torch.mm(centered_frames, self._inv_covariances[i, :, :]) * centered_frames, 1) 42 | llk = torch.logsumexp(logprob, dim=0) 43 | return torch.exp(logprob - llk) 44 | 45 | def save_npz(self, filename): 46 | np.savez(filename, weights=self.weights.cpu().numpy(), means=self.means.cpu().numpy(), covariances=self.covariances.cpu().numpy()) 47 | print('GMM saved to {}'.format(ensure_npz(filename))) 48 | 49 | @classmethod 50 | def from_npz(cls, filename, device): 51 | data = np.load(ensure_npz(filename)) 52 | weights = torch.from_numpy(data['weights']) 53 | means = torch.from_numpy(data['means']) 54 | covariances = torch.from_numpy(data['covariances']) 55 | return Gmm(means, covariances, weights, device) 56 | 57 | @classmethod 58 | def from_kaldi(cls, filename, device): 59 | ubm = KaldiFullGmm() 60 | with kio.xopen(filename) as f: 61 | ubm.read(f.stream(), f.binary) 62 | means = torch.from_numpy(ubm.get_means().numpy()) 63 | weights = torch.from_numpy(ubm.weights().numpy()) 64 | n_components = weights.numel() 65 | feat_dim = means.size()[1] 66 | covariances = torch.zeros([n_components, feat_dim, feat_dim], device='cpu', dtype=torch.float32) 67 | for index, kaldicovar in enumerate(ubm.get_covars()): 68 | covariances[index, :, :] = torch.from_numpy(KaldiMatrix(kaldicovar).numpy()) 69 | return Gmm(means, covariances, weights, device=device) 70 | 71 | class DiagGmm(): 72 | def __init__(self, means, covariances, weights, device=torch.device("cpu")): 73 | self.means = means.to(device) 74 | self.covariances = covariances.to(device) 75 | self.weights = weights.to(device) 76 | # Preparation for posterior computation: 77 | const = torch.Tensor([self.means.size()[1] * np.log(2 * np.pi)]).to(self.means.device) 78 | self.posterior_constant = torch.sum(self.means * self.means / self.covariances, 1) + torch.sum(torch.log(self.covariances), 1) + const 79 | self.posterior_constant = self.posterior_constant.unsqueeze(1) 80 | self.precisions = (1 / self.covariances) 81 | self.mean_pres = (self.means / self.covariances) 82 | 83 | def compute_posteriors(self, frames): 84 | logprob = torch.mm(self.precisions, (frames * frames).t()) - 2 * torch.mm(self.mean_pres, frames.t()) 85 | logprob = -0.5 * (logprob + self.posterior_constant) 86 | logprob = logprob + torch.log(self.weights.unsqueeze(1)) 87 | llk = torch.logsumexp(logprob, 0) 88 | return torch.exp(logprob - llk) 89 | 90 | @classmethod 91 | def from_full_gmm(cls, full_gmm, device): 92 | means = full_gmm.means.clone() 93 | weights = full_gmm.weights.clone() 94 | covariances = torch.zeros(means.size(), device=full_gmm.covariances.device, dtype=full_gmm.covariances.dtype) 95 | for index in range(weights.numel()): 96 | covariances[index, :] = full_gmm.covariances[index, :, :].diag() 97 | return DiagGmm(means, covariances, weights, device=device) 98 | 99 | -------------------------------------------------------------------------------- /asvtorch/ivector/ivector_extractor.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | 4 | import torch 5 | import numpy as np 6 | 7 | import asvtorch.ivector.statloader 8 | from asvtorch.misc.misc import ensure_npz 9 | from asvtorch.ivector.gmm import Gmm 10 | 11 | class IVectorExtractor(): 12 | def __init__(self, t_matrix, means, inv_covariances, prior_offset, device): 13 | # When prior offset is zero, standard (non-augmented) i-vector formulation is used. 
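        # With a non-zero prior offset (Kaldi-style augmented formulation), the latent
        # prior is a Gaussian with mean prior_offset on the first coordinate, and the
        # first row of each component's T submatrix absorbs the UBM means, so sufficient
        # statistics need no centering (see random_init(); extract() subtracts the
        # offset from the first i-vector coordinate at the end).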
14 | self.t_matrix = t_matrix.to(device) 15 | self.prior_offset = prior_offset.to(device) 16 | self.means = means.to(device) 17 | self.inv_covariances = inv_covariances.to(device) 18 | self.n_components, self.ivec_dim, self.feat_dim = self.t_matrix.size() 19 | self.identity = torch.eye(self.ivec_dim, device=device).unsqueeze(0) 20 | self.bias_offset = None 21 | 22 | def _compute_posterior_means_and_covariances(self, n_all, f_all, batch_size, component_batches): 23 | covariances = torch.zeros(self.ivec_dim, batch_size, self.ivec_dim, device=self.t_matrix.device) 24 | means = torch.zeros(self.ivec_dim, batch_size, device=self.t_matrix.device) 25 | for bstart, bend in component_batches: 26 | n = n_all[:, bstart:bend] 27 | f = f_all[bstart:bend, :, :] 28 | sub_t = self.t_matrix[bstart:bend, :, :] 29 | sub_inv_covars = self.inv_covariances[bstart:bend, :, :] 30 | sub_tc = torch.bmm(sub_t, sub_inv_covars) 31 | tt = torch.bmm(sub_tc, torch.transpose(sub_t, 1, 2)) 32 | tt.transpose_(0, 1) 33 | covariances += torch.matmul(n, tt) 34 | means = torch.addbmm(means, sub_tc, f) 35 | covariances.transpose_(0, 1) 36 | covariances += self.identity 37 | covariances = torch.inverse(covariances) 38 | means.t_() 39 | means[:, 0] += self.prior_offset 40 | means.unsqueeze_(2) 41 | means = torch.bmm(covariances, means) 42 | means = means.view((means.size()[:2])) 43 | return means, covariances 44 | 45 | def _get_component_batches(self, n_component_batches): 46 | cbatch_size = self.n_components // n_component_batches 47 | component_batches = [] 48 | for cbatch_index in range(n_component_batches): 49 | bstart = cbatch_index * cbatch_size 50 | bend = (cbatch_index + 1) * cbatch_size 51 | component_batches.append((bstart, bend)) 52 | return component_batches 53 | 54 | def _get_stat_loader(self, rxspecifiers, feature_loader, second_order, batch_size, n_workers): 55 | data_dims = (self.n_components, self.feat_dim) 56 | if self.prior_offset == 0: 57 | stat_loader = asvtorch.ivector.statloader.get_stat_loader(rxspecifiers, feature_loader, data_dims, batch_size, second_order, self.means, n_workers) 58 | else: # Kaldi style i-vector (augmented form) --> No centering required 59 | stat_loader = asvtorch.ivector.statloader.get_stat_loader(rxspecifiers, feature_loader, data_dims, batch_size, second_order, None, n_workers) 60 | return stat_loader 61 | 62 | def get_updated_ubm(self, ubm, device): 63 | if self.prior_offset == 0: 64 | means = self.means.clone() 65 | else: 66 | means = self.t_matrix[:, 0, :] * self.prior_offset 67 | covariances = ubm.covariances.clone() 68 | weights = ubm.weights.clone() 69 | return Gmm(means, covariances, weights, device) 70 | 71 | def extract(self, rxspecifiers, feature_loader, settings): 72 | stat_loader = self._get_stat_loader(rxspecifiers, feature_loader, False, settings.batch_size_in_utts, settings.dataloader_workers) 73 | component_batches = self._get_component_batches(settings.n_component_batches) 74 | print('Extracting i-vectors for {} utterances...'.format(len(rxspecifiers[0]))) 75 | start_time = time.time() 76 | ivectors = torch.zeros(len(rxspecifiers[0]), self.ivec_dim, device=self.t_matrix.device) 77 | counter = 0 78 | for batch_index, batch in enumerate(stat_loader): 79 | n_all, f_all = batch 80 | batch_size = n_all.size()[0] 81 | print('{:.0f} seconds elapsed, Batch {}/{}: utterance count = {}'.format(time.time() - start_time, batch_index+1, stat_loader.__len__(), batch_size)) 82 | n_all = n_all.to(self.t_matrix.device) 83 | f_all = f_all.to(self.t_matrix.device) 84 | means = 
self._compute_posterior_means_and_covariances(n_all, f_all, batch_size, component_batches)[0] 85 | ivectors[counter:counter+batch_size, :] = means 86 | counter += batch_size 87 | ivectors[:, 0] -= self.prior_offset 88 | print('I-vector extraction completed in {:.0f} seconds.'.format(time.time() - start_time)) 89 | return ivectors 90 | 91 | def train(self, rxspecifiers, feature_loader, output_filename, settings, resume=0): 92 | if resume < 0: 93 | resume = 0 94 | elif resume > 0: 95 | print('Resuming i-vector extractor training from iteration {}...'.format(resume)) 96 | extractor = IVectorExtractor.from_npz('{}.{}'.format(ensure_npz(output_filename, inverse=True), resume), self.t_matrix.device) 97 | self.t_matrix = extractor.t_matrix 98 | self.means = extractor.means 99 | self.inv_covariances = extractor.inv_covariances 100 | self.prior_offset = extractor.prior_offset 101 | 102 | print('Training i-vector extractor ({} iterations)...'.format(settings.n_iterations)) 103 | 104 | n_utts = len(rxspecifiers[0]) 105 | component_batches = self._get_component_batches(settings.n_component_batches) 106 | 107 | print('Allocating memory for accumulators...') 108 | z = torch.zeros(self.n_components, device=self.t_matrix.device) 109 | S = torch.zeros(self.n_components, self.feat_dim, self.feat_dim, device=self.t_matrix.device) 110 | Y = torch.zeros(self.n_components, self.feat_dim, self.ivec_dim, device=self.t_matrix.device) 111 | R = torch.zeros(self.n_components, self.ivec_dim, self.ivec_dim, device=self.t_matrix.device) # The biggest memory consumer! 112 | h = torch.zeros(self.ivec_dim, device=self.t_matrix.device) 113 | H = torch.zeros(self.ivec_dim, self.ivec_dim, device=self.t_matrix.device) 114 | 115 | iteration_times = [] 116 | start_time = time.time() 117 | for iteration in range(1, settings.n_iterations + 1): 118 | iter_start_time = time.time() 119 | 120 | print('Initializing statistics loader...') 121 | accumulate_2nd_stats = settings.update_covariances and iteration == 1 # 2nd order stats need to be accumulated only once 122 | stat_loader = self._get_stat_loader(rxspecifiers, feature_loader, accumulate_2nd_stats, settings.batch_size_in_utts, settings.dataloader_workers) 123 | 124 | print('Iterating over batches of utterances...') 125 | for batch_index, batch in enumerate(stat_loader): 126 | 127 | if accumulate_2nd_stats: 128 | n_all, f_all, s_batch_sum = batch 129 | S += s_batch_sum.to(self.t_matrix.device) 130 | else: 131 | n_all, f_all = batch 132 | 133 | batch_size = n_all.size()[0] 134 | print('Iteration {} ({:.0f} seconds), Batch {}/{}: utterance count = {}'.format(iteration + resume, time.time() - iter_start_time, batch_index+1, stat_loader.__len__(), batch_size)) 135 | 136 | n_all = n_all.to(self.t_matrix.device) 137 | f_all = f_all.to(self.t_matrix.device) 138 | if iteration == 1: # Need to be accumulated only once 139 | z += torch.sum(n_all, dim=0) 140 | 141 | means, covariances = self._compute_posterior_means_and_covariances(n_all, f_all, batch_size, component_batches) 142 | 143 | # Accumulating... 
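                # h and H: global 1st and 2nd moments of the posterior means, used for
                # minimum divergence re-estimation; Y: cross moments between 1st order
                # statistics and the posterior means; R: posterior-count-weighted
                # second moments E[y y^T], needed to solve for the updated T matrix.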
144 | h += torch.sum(means, dim=0) 145 | yy = torch.baddbmm(covariances, means.unsqueeze(2), means.unsqueeze(1)) 146 | H += torch.sum(yy, dim=0) 147 | yy = yy.permute((1, 2, 0)) 148 | for bstart, bend in component_batches: # Batching over components saves GPU memory 149 | n = n_all[:, bstart:bend] 150 | f = f_all[bstart:bend, :, :] 151 | Y[bstart:bend, :, :] += torch.matmul(f, means) 152 | R[bstart:bend, :, :] += torch.matmul(yy, n).permute((2, 0, 1)) 153 | 154 | self.weights = z / torch.sum(z) * n_utts 155 | h = h / n_utts 156 | H = H / n_utts 157 | H = H - torch.ger(h, h) 158 | 159 | # Updating: 160 | if settings.update_projections: self._update_projections(Y, R, component_batches) 161 | if settings.update_covariances: self._update_covariances(Y, R, z, S, component_batches) 162 | if settings.minimum_divergence: self._minimum_divergence_whitening(h, H, component_batches) 163 | if settings.update_means: self._minimum_divergence_centering(h, component_batches) 164 | 165 | print('Zeroing accumulators...') 166 | Y.zero_() 167 | R.zero_() 168 | h.zero_() 169 | H.zero_() 170 | 171 | if settings.save_every_iteration: 172 | self.save_npz('{}.{}'.format(ensure_npz(output_filename, inverse=True), iteration + resume)) 173 | 174 | iteration_times.append(time.time() - iter_start_time) 175 | 176 | self.save_npz(output_filename) 177 | print('Training completed in {:.0f} seconds.'.format(time.time() - start_time)) 178 | return iteration_times 179 | 180 | def _update_projections(self, Y, R, component_batches): 181 | print('Updating projections...') 182 | for bstart, bend in component_batches: 183 | self.t_matrix[bstart:bend, :, :] = torch.cholesky_solve(Y[bstart:bend, :, :].transpose(1, 2), torch.cholesky(R[bstart:bend, :, :], upper=True), upper=True) 184 | 185 | def _update_covariances(self, Y, R, z, S, component_batches): 186 | print('Updating covariances...') 187 | for bstart, bend in component_batches: 188 | crossterm = torch.matmul(Y[bstart:bend, :, :], self.t_matrix[bstart:bend, :, :]) 189 | crossterm = crossterm + crossterm.transpose(1, 2) 190 | self.inv_covariances[bstart:bend, :, :] = S[bstart:bend, :, :] - 0.5 * crossterm 191 | 192 | var_floor = torch.sum(self.inv_covariances, dim=0) 193 | var_floor *= 0.1 / torch.sum(z) 194 | self.inv_covariances = self.inv_covariances / z.unsqueeze(1).unsqueeze(1) 195 | self._covariances = (self.inv_covariances).clone() 196 | self._apply_floor_(self.inv_covariances, var_floor, component_batches) 197 | self.inv_covariances = torch.inverse(self.inv_covariances) 198 | 199 | def _apply_floor_(self, A, B, component_batches): 200 | #B = self._apply_floor_scalar(B, self._max_abs_eig(B) * 1e-4)[0] # To prevent Cholesky from failing 201 | L = torch.cholesky(B) 202 | L_inv = torch.inverse(L) 203 | num_floored = 0 204 | batch_size = component_batches[0][1] - component_batches[0][0] 205 | l = torch.zeros(batch_size, self.feat_dim, device=self.t_matrix.device) 206 | U = torch.zeros(batch_size, self.feat_dim, self.feat_dim, device=self.t_matrix.device) 207 | for bstart, bend in component_batches: 208 | D = L_inv.matmul(A[bstart:bend, :, :]).matmul(L_inv.t()) 209 | for c in range(batch_size): 210 | l[c, :], U[c, :, :] = torch.symeig(D[c, :, :], eigenvectors=True) 211 | num_floored += torch.sum(l < 1).item() 212 | l = torch.clamp(l, min=1) 213 | D = U.matmul(l.unsqueeze(2) * U.transpose(1,2)) 214 | A[bstart:bend, :, :] = L.matmul(D.transpose(1, 2)).matmul(L.t()) 215 | print('Floored {:.1%} of the eigenvalues...'.format(num_floored / (self.n_components * self.feat_dim))) 216 | 
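    # The two helpers below support the eigenvalue-flooring safeguard that is currently
    # commented out in _apply_floor_(): _max_abs_eig() returns the largest absolute
    # eigenvalue of a symmetric matrix, and _apply_floor_scalar() floors the
    # eigenvalues of A at the scalar b.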
217 | def _max_abs_eig(self, A): 218 | l = torch.symeig(A)[0] 219 | return torch.max(torch.abs(l)) 220 | 221 | def _apply_floor_scalar(self, A, b): 222 | l, U = torch.symeig(A, eigenvectors=True) 223 | num_floored = torch.sum(l < b).item() 224 | l = torch.clamp(l, min=b) 225 | A = torch.matmul(U, l.unsqueeze(1) * U.t()) 226 | return A, num_floored 227 | 228 | def _minimum_divergence_whitening(self, h, H, component_batches): 229 | print('Minimum divergence re-estimation...') 230 | l, U = torch.symeig(H, eigenvectors=True) 231 | l = torch.clamp(l, min=1e-7) 232 | P1 = torch.rsqrt(l) * U # transposed 233 | torch.matmul(h, P1, out=h) # In place operation, so that the result is available for update_means() 234 | if self.prior_offset != 0: # Augmented formulation 235 | self.prior_offset = h[0] 236 | print('Prior offset: {}'.format(self.prior_offset)) 237 | P1 = torch.inverse(P1) 238 | for bstart, bend in component_batches: 239 | self.t_matrix[bstart:bend, :, :] = P1.matmul(self.t_matrix[bstart:bend, :, :]) 240 | 241 | def _minimum_divergence_centering(self, h, component_batches): 242 | if self.prior_offset == 0: 243 | self.means += torch.sum(self.t_matrix * h.unsqueeze(0).unsqueeze(2), dim=1) 244 | else: # Augmented formulation uses the Householder transformation 245 | x = h / h.norm() 246 | alpha = torch.rsqrt(2 * (1 - x[0])) 247 | print('Alpha: {}'.format(alpha)) 248 | a = x * alpha 249 | a[0] -= alpha 250 | P2 = self.identity - 2 * torch.ger(a, a) 251 | self.prior_offset = torch.dot(h, P2[:, 0].squeeze()) 252 | print('Prior offset: {}'.format(self.prior_offset)) 253 | P2 = torch.inverse(P2) 254 | for bstart, bend in component_batches: 255 | self.t_matrix[bstart:bend, :, :] = P2.matmul(self.t_matrix[bstart:bend, :, :]) 256 | 257 | def save_npz(self, filename): 258 | np.savez(filename, t_matrix=self.t_matrix.cpu().numpy(), means=self.means.cpu().numpy(), inv_covariances=self.inv_covariances.cpu().numpy(), prior_offset=self.prior_offset.cpu().numpy()) 259 | print('I-vector extractor saved to {}'.format(ensure_npz(filename))) 260 | 261 | @classmethod 262 | def random_init(cls, ubm, settings, device, seed=0): 263 | torch.manual_seed(seed) 264 | t_matrix = torch.randn(ubm.covariances.size()[0], settings.ivec_dim, ubm.covariances.size()[1]) 265 | means = ubm.means.cpu().clone() 266 | inv_covariances = ubm._inv_covariances.clone() 267 | if settings.type == 'augmented': 268 | prior_offset = torch.tensor([float(settings.initial_prior_offset)]) 269 | t_matrix[:, 0, :] = means / prior_offset 270 | else: 271 | prior_offset = torch.tensor([float(0)]) 272 | return IVectorExtractor(t_matrix, means, inv_covariances, prior_offset, device) 273 | 274 | @classmethod 275 | def from_npz(cls, filename, device, iteration=None): 276 | if iteration is not None: 277 | filename = '{}.{}'.format(ensure_npz(filename, inverse=True), iteration) 278 | data = np.load(ensure_npz(filename)) 279 | t_matrix = torch.from_numpy(data['t_matrix']) 280 | means = torch.from_numpy(data['means']) 281 | inv_covariances = torch.from_numpy(data['inv_covariances']) 282 | prior_offset = torch.from_numpy(data['prior_offset']) 283 | return IVectorExtractor(t_matrix, means, inv_covariances, prior_offset, device) 284 | -------------------------------------------------------------------------------- /asvtorch/ivector/posteriors.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import torch 5 | from kaldi.util.table import VectorWriter 6 | from kaldi.matrix import 
Vector 7 | 8 | import asvtorch.ivector.featureloader 9 | import asvtorch.global_setup as gs 10 | from asvtorch.ivector.gmm import DiagGmm 11 | from asvtorch.kaldidata.posterior_io import PosteriorWriter 12 | from asvtorch.misc.misc import ensure_npz 13 | 14 | 15 | def batch_extract_posteriors(rx_specifiers, utt_ids, feature_loader, ubm, output_filename, settings): 16 | """Extracts posteriors using full covariance matrices. Computational requirements and disk space requirements are reduced by performing Gaussian selection using diagonal covariances and by thresholding posteriors. 17 | 18 | Arguments: 19 | rx_specifiers {(list, list)} -- Two lists in a tuple containing scp lines without utterance IDs for features and VAD labels, respectively. 20 | utt_ids {list} -- Utterance IDs. 21 | feature_loader {KaldiFeatureLoader} -- Feature loader. 22 | ubm {Gmm} -- A GMM (UBM). 23 | output_filename {string} -- Output filename for posteriors (without extension). 24 | settings {PosteriorExtractionSettings} - Settings. 25 | """ 26 | 27 | print('Extracting posteriors for {} utterances...'.format(len(rx_specifiers[0]))) 28 | 29 | dataloader = asvtorch.ivector.featureloader.get_feature_loader(rx_specifiers, feature_loader, settings.batch_size_in_frames, settings.dataloader_workers) 30 | 31 | diag_ubm = DiagGmm.from_full_gmm(ubm, gs.device) 32 | 33 | sub_batch_count = int(np.ceil(ubm.means.size()[0] / ubm.means.size()[1])) 34 | 35 | wspecifier_top_posterior = "ark,scp:{0}.ark,{0}.scp".format(output_filename) 36 | posterior_writer = PosteriorWriter(wspecifier_top_posterior) 37 | 38 | posterior_buffer = torch.Tensor() 39 | top_buffer = torch.LongTensor() 40 | count_buffer = torch.LongTensor() 41 | 42 | start_time = time.time() 43 | frame_counter = 0 44 | utterance_counter = 0 45 | 46 | start_time = time.time() 47 | for batch_index, batch in enumerate(dataloader): 48 | 49 | frames, end_points = batch 50 | frames = frames.to(gs.device) 51 | frames_in_batch = frames.size()[0] 52 | 53 | chunks = torch.chunk(frames, sub_batch_count, dim=0) 54 | top_gaussians = [] 55 | for chunk in chunks: 56 | posteriors = diag_ubm.compute_posteriors(chunk) 57 | top_gaussians.append(torch.topk(posteriors, settings.n_top_gaussians, dim=0, largest=True, sorted=False)[1]) 58 | 59 | top_gaussians = torch.cat(top_gaussians, dim=1) 60 | 61 | posteriors = ubm.compute_posteriors_top_select(frames, top_gaussians) 62 | 63 | # Posterior thresholding: 64 | max_indices = torch.argmax(posteriors, dim=0) 65 | mask = posteriors.ge(settings.posterior_threshold) 66 | top_counts = torch.sum(mask, dim=0) 67 | posteriors[~mask] = 0 68 | divider = torch.sum(posteriors, dim=0) 69 | mask2 = divider.eq(0) # For detecting special cases 70 | posteriors[:, ~mask2] = posteriors[:, ~mask2] / divider[~mask2] 71 | # Special case that all the posteriors are discarded (force to use 1): 72 | posteriors[max_indices[mask2], mask2] = 1 73 | mask[max_indices[mask2], mask2] = 1 74 | top_counts[mask2] = 1 75 | 76 | # Vectorize the data & move to cpu memory 77 | posteriors = posteriors.t().masked_select(mask.t()) 78 | top_gaussians = top_gaussians.t().masked_select(mask.t()) 79 | posteriors = posteriors.cpu() 80 | top_gaussians = top_gaussians.cpu() 81 | top_counts = top_counts.cpu() 82 | 83 | end_points = end_points - frame_counter # relative end_points in a batch 84 | 85 | if end_points.size != 0: 86 | # Save utterance data that continues from the previous batch: 87 | psave = torch.cat([posterior_buffer, posteriors[:torch.sum(top_counts[:end_points[0]])]]) 88 | tsave = 
torch.cat([top_buffer, top_gaussians[:torch.sum(top_counts[:end_points[0]])]]) 89 | csave = torch.cat([count_buffer, top_counts[:end_points[0]]]) 90 | posterior_writer.write(utt_ids[utterance_counter], csave, psave, tsave) 91 | utterance_counter += 1 92 | 93 | # Save utterance data that is fully included in this batch: 94 | for start_point, end_point in zip(end_points[:-1], end_points[1:]): 95 | psave = posteriors[torch.sum(top_counts[:start_point]):torch.sum(top_counts[:end_point])] 96 | tsave = top_gaussians[torch.sum(top_counts[:start_point]):torch.sum(top_counts[:end_point])] 97 | csave = top_counts[start_point:end_point] 98 | posterior_writer.write(utt_ids[utterance_counter], csave, psave, tsave) 99 | utterance_counter += 1 100 | 101 | # Buffer partial data to be used in the next batch: 102 | posterior_buffer = posteriors[torch.sum(top_counts[:end_points[-1]]):] 103 | top_buffer = top_gaussians[torch.sum(top_counts[:end_points[-1]]):] 104 | count_buffer = top_counts[end_points[-1]:] 105 | else: 106 | # Buffer the whole data for the next batch (if the utterance is longer than the current batch (special case)): 107 | posterior_buffer = torch.cat([posterior_buffer, posteriors]) 108 | top_buffer = torch.cat([top_buffer, top_gaussians]) 109 | count_buffer = torch.cat([count_buffer, top_counts]) 110 | 111 | frame_counter += frames_in_batch 112 | 113 | print('{:.0f} seconds elapsed, batch {}/{}: {}, utterance count (roughly) = {}'.format(time.time() - start_time, batch_index+1, dataloader.__len__(), frames.size(), len(end_points))) 114 | 115 | posterior_writer.close() 116 | print('Posterior computation completed in {:.3f} seconds'.format(time.time() - start_time)) 117 | 118 | return time.time() - start_time 119 | -------------------------------------------------------------------------------- /asvtorch/ivector/settings.py: -------------------------------------------------------------------------------- 1 | 2 | class PosteriorExtractionSettings(): 3 | def __init__(self): 4 | # general settings 5 | self.n_top_gaussians = 20 6 | self.posterior_threshold = 0.025 7 | 8 | # data loading & batching settings 9 | self.batch_size_in_frames = 500000 10 | self.dataloader_workers = 4 11 | 12 | def print_settings(self): 13 | print('POSTERIOR EXTRACTION SETTINGS') 14 | print('- Number of top Gaussians to select for each frame: {}'.format(self.n_top_gaussians)) 15 | print('- Select Gaussians only if frame posterior is higher than: {}'.format(self.posterior_threshold)) 16 | 17 | print('Data loading & batching settings') 18 | print('- Number of data loader workers: {}'.format(self.dataloader_workers)) 19 | print('- Number of frames in a batch: {}'.format(self.batch_size_in_frames)) 20 | print('') 21 | 22 | 23 | 24 | class IVectorSettings(): 25 | 26 | def __init__(self): 27 | 28 | # general settings 29 | self.ivec_dim = 400 30 | self.type = 'kaldi' # 'standard' 31 | 32 | # training settings 33 | self.n_iterations = 5 34 | self.initial_prior_offset = 100 # Only useful in the augmented formulation ('kaldi') 35 | self.update_projections = True 36 | self.update_covariances = True 37 | self.minimum_divergence = True 38 | self.update_means = True 39 | 40 | # data loading & batching settings 41 | self.dataloader_workers = 6 42 | self.batch_size_in_utts = 200 # Higher batch size will have higher GPU memory usage. 43 | self.n_component_batches = 16 # must be a power of two! The higher the value, the less GPU memory will be used. 
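        # For example, with a 2048-component UBM, n_component_batches = 16 processes
        # 2048 / 16 = 128 components at a time.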
44 | 45 | # model saving settings 46 | self.save_every_iteration = True 47 | 48 | 49 | def print_settings(self): 50 | print('I-VECTOR EXTRACTOR SETTINGS') 51 | print('- I-vector type: {}'.format(self.type)) 52 | print('- I-vector dimensionality: {}'.format(self.ivec_dim)) 53 | 54 | print('Training settings') 55 | print('- Number of iterations: {}'.format(self.n_iterations)) 56 | if self.type == 'kaldi': 57 | print('- Initial prior offset: {}'.format(self.initial_prior_offset)) 58 | print('- Update projections (T matrix): {}'.format(self.update_projections)) 59 | print('- Update residual covariances: {}'.format(self.update_covariances)) 60 | print('- Minimum divergence re-estimation: {}'.format(self.minimum_divergence)) 61 | print('- Update means (bias term): {}'.format(self.update_means)) 62 | 63 | print('Data loading & batching settings') 64 | print('- Number of data loader workers: {}'.format(self.dataloader_workers)) 65 | print('- Number of utterances in a batch: {}'.format(self.batch_size_in_utts)) 66 | print('- Number of batches for components (has to be power of 2): {}'.format(self.n_component_batches)) 67 | 68 | print('Saving settings') 69 | print('- Save model after every iteration: {}'.format(self.save_every_iteration)) 70 | print('') 71 | -------------------------------------------------------------------------------- /asvtorch/ivector/statloader.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import torch 5 | from torch.utils import data 6 | import kaldi.util.io as kio 7 | 8 | import asvtorch.kaldidata.posterior_io 9 | 10 | class _StatDataset(data.Dataset): 11 | def __init__(self, rxspecifiers, feature_loader, data_dims, second_order, centering_means=None): 12 | self.feat_rxspecifiers = rxspecifiers[0] 13 | self.vad_rxspecifiers = rxspecifiers[1] 14 | self.posterior_rxspecifiers = rxspecifiers[2] 15 | self.feature_loader = feature_loader 16 | if centering_means is not None: 17 | self.centering_means = centering_means.cpu().numpy() 18 | else: 19 | self.centering_means = None 20 | self.second_order = second_order 21 | self.data_dims = data_dims 22 | if(second_order): 23 | self.second_order_sum = np.zeros((data_dims[0], data_dims[1], data_dims[1]), dtype=np.float32) 24 | 25 | def __len__(self): 26 | return len(self.feat_rxspecifiers) 27 | 28 | def accumulate_stats(self, feats, counts, posteriors, indices): 29 | """Computes 0th and 1st order statistics from the selected posteriors. 30 | 31 | Arguments: 32 | feats {ndarray} -- Feature array (feature vectors as rows). 33 | counts {ndarray} -- Array containing the numbers of selected posteriors for each frame. 34 | posteriors {ndarray} -- Array containing posteriors (flattened). 35 | indices {ndarray} -- Array containing Gaussian indices (flattened). 36 | 37 | Returns: 38 | ndarray -- 0th order statistics (row vector). 39 | ndarray -- 1st order statistics (row index = component index). 
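            (When second_order is True, 2nd order statistics are not returned; they are accumulated into self.second_order_sum and collected by collater().)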
40 | """ 41 | 42 | n = np.zeros(self.data_dims[0], dtype=np.float32) 43 | f = np.zeros(self.data_dims, dtype=np.float32) 44 | posterior_count = 0 45 | for frame_index in range(counts.size): 46 | end = posterior_count+counts[frame_index] 47 | gaussian_indices = indices[posterior_count:end] 48 | frame_posteriors = posteriors[posterior_count:end] 49 | n[gaussian_indices] += frame_posteriors 50 | f[gaussian_indices, :] += np.outer(frame_posteriors, feats[frame_index, :]) 51 | if self.second_order: 52 | if self.centering_means is not None: 53 | feats_centered = np.atleast_3d(np.atleast_2d(feats[frame_index, :]) - self.centering_means[gaussian_indices, :]) # Ok: (atleast_2d and atleast_3d prepend and append dimensions, respectively) 54 | feat_outer = np.matmul(feats_centered, np.transpose(feats_centered, (0, 2, 1))) 55 | else: 56 | feat_outer = np.outer(feats[frame_index, :], feats[frame_index, :]) 57 | self.second_order_sum[gaussian_indices, :, :] += frame_posteriors[:, np.newaxis, np.newaxis] * feat_outer 58 | posterior_count += counts[frame_index] 59 | return n, f 60 | 61 | def __getitem__(self, index): 62 | feats = self.feature_loader.load_features(self.feat_rxspecifiers[index], self.vad_rxspecifiers[index]) 63 | counts, posteriors, indices = asvtorch.kaldidata.posterior_io.load_posteriors(self.posterior_rxspecifiers[index]) 64 | n, f = self.accumulate_stats(feats, counts, posteriors, indices) 65 | if self.centering_means is not None: 66 | f -= n[:, None] * self.centering_means 67 | return n, f 68 | 69 | def collater(self, batch): 70 | """Collates sufficient statistics from many utterances to form a batch. 71 | 72 | Returns: 73 | Tensor -- 0th order statistics (number of utterances x number of components) 74 | Tensor -- 1st order statistics (#components x feat_dim x #utterances) 75 | Tensor -- Sum of 2nd order statistics (#components x feat_dim x feat_dim) 76 | """ 77 | n, f = zip(*batch) 78 | n = np.stack(n, axis=0) 79 | f = np.stack(f, axis=2) 80 | 81 | if self.second_order: 82 | s = self.second_order_sum 83 | self.second_order_sum = np.zeros(self.second_order_sum.shape, dtype=np.float32) # Zero the accumulator 84 | return torch.from_numpy(n), torch.from_numpy(f), torch.from_numpy(s) 85 | else: 86 | return torch.from_numpy(n), torch.from_numpy(f) 87 | 88 | 89 | def get_stat_loader(rxspecifiers, feature_loader, data_dims, batch_size, second_order, centering_means, num_workers): 90 | """Loads Baum-Welch statistics in batches. 91 | 92 | Arguments: 93 | rxspecifiers {(list, list, list)} -- Three lists in a tuple containing scp lines without utterance IDs for features, VADs, and posteriors, respectively. 94 | feature_loader {KaldiFeatureLoader} -- Feature loader. 95 | data_dims -- {tuple} (#components, feat_dim). 96 | batch_size {int} -- Batch size in utterances. 97 | second_order {boolean} -- Whether or not to compute 2nd order stats. 98 | centering_means {Tensor} -- Which means to use for centering statistics. 99 | num_workers {int} -- Number of processes used for data loading. 100 | 101 | Returns: 102 | DataLoader -- A dataloader for loading Baum-Welch statistics. 
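        (Each batch is a tuple (n, f), or (n, f, s) when second_order is True, where s is the sum of the 2nd order statistics over the batch.)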
--------------------------------------------------------------------------------
/asvtorch/kaldidata/kaldifeatloaders.py:
--------------------------------------------------------------------------------
1 | import abc
2 | 
3 | import numpy as np
4 | 
5 | import kaldi.feat.functions as featfuncs
6 | import kaldi.util.io as kio
7 | 
8 | class KaldiFeatureLoader(abc.ABC):
9 |     @abc.abstractmethod
10 |     def load_features(self, feat_rxspecifier, vad_rxspecifier):
11 |         """Loads and processes KALDI features.
12 | 
13 |         Arguments:
14 |             feat_rxspecifier {string} -- A line from feats.scp file excluding the utterance ID
15 |             vad_rxspecifier {string} -- A line from vad.scp file excluding the utterance ID
16 |         """
17 |         pass
18 | 
19 | 
20 | class VoxcelebFeatureLoader(KaldiFeatureLoader):
21 |     """Reads features extracted by the KALDI recipe "egs/voxceleb/v1/". After loading, the features are processed in the same manner as in the KALDI recipe.
22 |     """
23 |     def __init__(self):
24 |         self.delta_opts = featfuncs.DeltaFeaturesOptions(order=2, window=3)
25 |         self.cmn_opts = featfuncs.SlidingWindowCmnOptions()
26 |         self.cmn_opts.center = True
27 |         self.cmn_opts.cmn_window = 300
28 |         self.cmn_opts.normalize_variance = False
29 | 
30 |     def load_features(self, feat_rxspecifier, vad_rxspecifier):
31 |         feats = kio.read_matrix(feat_rxspecifier)
32 |         vad_labels = kio.read_vector(vad_rxspecifier)
33 |         feats = featfuncs.compute_deltas(self.delta_opts, feats)
34 |         featfuncs.sliding_window_cmn(self.cmn_opts, feats, feats)
35 |         feats = feats.numpy()[vad_labels.numpy().astype(bool), :]  # Keep only the frames that the VAD marked as speech
36 |         return feats
37 | 
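The abstract base class above makes it easy to plug in other preprocessing chains. As a sketch, a hypothetical loader that applies sliding-window CMN and VAD masking but skips the delta computation could look like this (`PlainCmnFeatureLoader` is not part of the repository; only the pykaldi calls already used above are assumed):

```python
import kaldi.feat.functions as featfuncs
import kaldi.util.io as kio

from asvtorch.kaldidata.kaldifeatloaders import KaldiFeatureLoader

class PlainCmnFeatureLoader(KaldiFeatureLoader):
    def __init__(self):
        self.cmn_opts = featfuncs.SlidingWindowCmnOptions()
        self.cmn_opts.center = True
        self.cmn_opts.cmn_window = 300
        self.cmn_opts.normalize_variance = False

    def load_features(self, feat_rxspecifier, vad_rxspecifier):
        feats = kio.read_matrix(feat_rxspecifier)
        vad_labels = kio.read_vector(vad_rxspecifier)
        featfuncs.sliding_window_cmn(self.cmn_opts, feats, feats)  # normalize in place
        return feats.numpy()[vad_labels.numpy().astype(bool), :]   # drop non-speech frames
```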
--------------------------------------------------------------------------------
/asvtorch/kaldidata/posterior_io.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from kaldi.util.table import VectorWriter
3 | import kaldi.util.io as kio
4 | from kaldi.matrix import Vector
5 | 
6 | class PosteriorWriter():
7 |     def __init__(self, wxspecifier):
8 |         self.posterior_writer = VectorWriter(wxspecifier)
9 | 
10 |     def write(self, utt_id, counts, posteriors, indices):
11 |         """Writes posteriors to disk in KALDI format.
12 | 
13 |         Arguments:
14 |             utt_id {string} -- Utterance ID to be written to scp file
15 |             counts {Tensor} -- Tensor containing the numbers of selected posteriors for each frame
16 |             posteriors {Tensor} -- Flattened Tensor containing all posteriors
17 |             indices {Tensor} -- Flattened Tensor containing all Gaussian indices
18 |         """
19 | 
20 |         counts = counts.numpy()
21 |         posteriors = posteriors.numpy()
22 |         indices = indices.numpy()
23 |         nframes = np.atleast_1d(np.array([counts.size]))
24 |         datavector = np.hstack([nframes, counts, posteriors, indices])  # Packed layout: [n_frames | counts | posteriors | indices]
25 |         datavector = Vector(datavector)
26 |         self.posterior_writer.write(utt_id, datavector)
27 | 
28 |     def close(self):
29 |         self.posterior_writer.close()
30 | 
31 | 
32 | def load_posteriors(rxspecifier):
33 |     """Loads posteriors stored in KALDI format from disk.
34 | 
35 |     Arguments:
36 |         rxspecifier {string} -- A line from scp file excluding the utterance ID.
37 | 
38 |     Returns:
39 |         ndarray -- Array containing the numbers of selected posteriors for each frame
40 |         ndarray -- Array containing posteriors (flattened)
41 |         ndarray -- Array containing Gaussian indices (flattened)
42 |     """
43 | 
44 |     datavector = kio.read_vector(rxspecifier)
45 |     datavector = datavector.numpy()
46 |     nframes = int(datavector[0])
47 |     counts = datavector[1:nframes+1].astype(int)
48 |     n_posteriors = (datavector.size - counts.size - 1) // 2
49 |     posteriors = datavector[nframes+1:-n_posteriors]
50 |     indices = datavector[-n_posteriors:].astype(int)
51 |     return counts, posteriors, indices
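A tiny round-trip sketch of the packed layout used by `write` and `load_posteriors` above (toy values; illustration only):

```python
import numpy as np

counts = np.array([2, 1])              # 2 frames with 2 and 1 selected Gaussians
posteriors = np.array([0.8, 0.2, 1.0])
indices = np.array([5, 11, 3])

# write() packs everything into one flat vector:
datavector = np.hstack([np.atleast_1d(counts.size), counts, posteriors, indices])
# -> [2, 2, 1, 0.8, 0.2, 1.0, 5, 11, 3]

# load_posteriors() reverses the packing:
nframes = int(datavector[0])
counts2 = datavector[1:nframes + 1].astype(int)
n_posteriors = (datavector.size - counts2.size - 1) // 2
posteriors2 = datavector[nframes + 1:-n_posteriors]
indices2 = datavector[-n_posteriors:].astype(int)

assert (counts2 == counts).all() and (indices2 == indices).all()
```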
--------------------------------------------------------------------------------
/asvtorch/kaldidata/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import random
4 | 
5 | import numpy as np
6 | import kaldi.util.io as kio
7 | 
8 | def count_total_number_of_active_frames(vad_rxspecifiers):
9 |     """Counts the total number of active speech frames in the given utterance list.
10 | 
11 |     Arguments:
12 |         vad_rxspecifiers {list} -- List of lines of vad.scp file excluding utterance IDs.
13 | 
14 |     Returns:
15 |         int -- Total number of active speech frames
16 |         ndarray -- 1D array of frame indices that separate different utterances from each other (includes 0 as the first element and the total number of frames as the last element).
17 |     """
18 |     n_frames = 0
19 |     counts = []
20 |     for vad_specifier in vad_rxspecifiers:
21 |         vad_labels = kio.read_vector(vad_specifier)
22 |         n_active = np.sum(vad_labels.numpy().astype(int))
23 |         counts.append(n_active)
24 |         n_frames += n_active
25 |     break_points = np.concatenate((np.atleast_1d(np.asarray(0, dtype=int)), np.cumsum(np.asarray(counts), dtype=int)))
26 |     return n_frames, break_points
27 | 
28 | def load_posterior_specifiers(scp_file_without_ext):
29 |     """Loads posterior reading specifiers from scp file.
30 | 
31 |     Arguments:
32 |         scp_file_without_ext {string} -- Filename of scp file without the extension.
33 | 
34 |     Returns:
35 |         list -- List of posterior reading specifiers.
36 |     """
37 |     scp_file = scp_file_without_ext + '.scp'
38 |     rxspecifiers = []
39 |     with open(scp_file) as f:
40 |         for line in f:
41 |             rxspecifiers.append(line.split()[-1].strip())
42 |     return rxspecifiers
43 | 
44 | 
45 | def _get_kaldi_dataset_files(folder):
46 |     """Forms full filenames for feats.scp, vad.scp, utt2num_frames, and utt2spk files.
47 | 
48 |     Arguments:
49 |         folder {string} -- Folder where the files are.
50 | 
51 |     Returns:
52 |         (string, string, string, string) -- Full filenames for feats.scp, vad.scp, utt2num_frames, and utt2spk files, respectively.
53 |     """
54 |     feat_scp_file = os.path.join(folder, 'feats.scp')
55 |     vad_scp_file = os.path.join(folder, 'vad.scp')
56 |     utt2num_frames_file = os.path.join(folder, 'utt2num_frames')
57 |     utt2spk_file = os.path.join(folder, 'utt2spk')
58 |     return feat_scp_file, vad_scp_file, utt2num_frames_file, utt2spk_file
59 | 
60 | 
61 | def _choose_utterances(data_folder, meta_folder, selected_utts):
62 |     """Loads the selected utterance IDs, speaker IDs, and feature and VAD reading specifiers from the given folder (meta_folder). Fixes the specifiers to point to ark files that may have been moved from their original location.
63 | 
64 |     Arguments:
65 |         data_folder {string} -- Used to fix specifiers to point to correct ark files in case the ark files were moved from their original location. The last subfolder of data_folder should have the same name as in the original path to the ark files.
66 |         meta_folder {string} -- Folder where the feats.scp, vad.scp, utt2num_frames, and utt2spk files are.
67 |         selected_utts {set} -- Set of utterance IDs that should be selected. If None, selects all.
68 | 
69 |     Returns:
70 |         list -- Reading specifiers for features.
71 |         list -- Reading specifiers for VAD labels.
72 |         list -- Utterance IDs.
73 |         list -- Speaker IDs.
74 |     """
75 |     feat_scp_file, vad_scp_file, utt2num_frames_file, utt2spk_file = _get_kaldi_dataset_files(meta_folder)
76 |     base_folder = os.sep + os.path.basename(os.path.normpath(data_folder)) + os.sep
77 |     feat_rxfilenames = []
78 |     vad_rxfilenames = []
79 |     utts = []
80 |     spks = []
81 |     with open(feat_scp_file) as f1, open(vad_scp_file) as f2, open(utt2spk_file) as f3:
82 |         for line1, line2, line3 in zip(f1, f2, f3):
83 |             parts1 = line1.split()
84 |             if selected_utts is None or parts1[0] in selected_utts:
85 |                 parts2 = line2.split()
86 |                 parts3 = line3.split()
87 |                 if parts1[0] != parts2[0] or parts1[0] != parts3[0]:
88 |                     sys.exit('Error: scp-files are not aligned!')
89 |                 feat_loc = parts1[1].split(base_folder)[1].strip()
90 |                 vad_loc = parts2[1].split(base_folder)[1].strip()
91 |                 feat_rxfilenames.append(os.path.join(data_folder, feat_loc))
92 |                 vad_rxfilenames.append(os.path.join(data_folder, vad_loc))
93 |                 utts.append(parts1[0])
94 |                 spks.append(parts3[1].strip())
95 |     return feat_rxfilenames, vad_rxfilenames, utts, spks
96 | 
97 | 
98 | def choose_all(data_folder, meta_folder):
99 |     """Loads all utterance IDs, speaker IDs, and feature and VAD reading specifiers from the given folder (meta_folder). Fixes the specifiers to point to ark files that may have been moved from their original location.
100 | 
101 |     Arguments:
102 |         data_folder {string} -- Used to fix specifiers to point to correct ark files in case the ark files were moved from their original location. The last subfolder of data_folder should have the same name as in the original path to the ark files.
103 |         meta_folder {string} -- Folder where the feats.scp, vad.scp, utt2num_frames, and utt2spk files are.
104 | 
105 |     Returns:
106 |         list -- Reading specifiers for features.
107 |         list -- Reading specifiers for VAD labels.
108 |         list -- Utterance IDs.
109 |         list -- Speaker IDs.
110 |     """
111 |     print('Loading all feature-specifiers, utterance labels, and speaker labels from folder {}'.format(meta_folder))
112 |     return _choose_utterances(data_folder, meta_folder, None)
113 | 
114 | 
115 | def choose_n_longest(data_folder, meta_folder, n):
116 |     """Same as choose_all, except that this function chooses the n longest utterances from the specified folder.
117 |     """
118 |     print('Loading feature-specifiers, utterance labels, and speaker labels of the {} longest files from folder {}'.format(n, meta_folder))
119 |     feat_scp_file, vad_scp_file, utt2num_frames_file, utt2spk_file = _get_kaldi_dataset_files(meta_folder)
120 |     selected_utts = set()
121 |     utts = []
122 |     num_frames = []
123 |     with open(utt2num_frames_file) as f:
124 |         for line in f:
125 |             parts = line.split()
126 |             utts.append(parts[0])
127 |             num_frames.append(int(parts[1].strip()))
128 |     num_frames = np.asarray(num_frames, dtype=int)
129 |     indices = np.argsort(num_frames)
130 |     indices = indices[-n:]
131 |     for index in indices:
132 |         selected_utts.add(utts[index])
133 |     return _choose_utterances(data_folder, meta_folder, selected_utts)
134 | 
135 | 
136 | def choose_n_random(data_folder, meta_folder, n, seed=0):
137 |     """Same as choose_all, except that this function chooses n random utterances from the specified folder.
138 |     """
139 |     random.seed(seed)
140 |     print('Loading feature-specifiers, utterance labels, and speaker labels of {} random files from folder {}'.format(n, meta_folder))
141 |     feat_scp_file, vad_scp_file, utt2num_frames_file, utt2spk_file = _get_kaldi_dataset_files(meta_folder)
142 |     utts = []
143 |     with open(utt2num_frames_file) as f:
144 |         for line in f:
145 |             parts = line.split()
146 |             utts.append(parts[0])
147 |     return _choose_utterances(data_folder, meta_folder, set(random.sample(utts, n)))
148 | 
149 | 
150 | def choose_from_wavfile(data_folder, meta_folder, wav_file, every_nth=1):
151 |     """Chooses every n-th utterance listed in wav_file.
152 |     """
153 |     print('Loading (every {}) feature-specifiers, utterance labels, and speaker labels for utterances in {} from folder {}'.format(every_nth, wav_file, meta_folder))
154 | 
155 |     utts = set()
156 |     with open(wav_file) as f:
157 |         for index, line in enumerate(f):
158 |             if index % every_nth == 0:
159 |                 parts = line.split()
160 |                 utts.add(parts[0])
161 |     return _choose_utterances(data_folder, meta_folder, utts)
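The re-rooting of scp entries performed by `_choose_utterances` is easiest to see on a concrete value. A toy illustration with hypothetical paths (not from the repository):

```python
import os

data_folder = '/new/location/voxceleb_outputs'   # where the ark files live now
scp_entry = '/old/location/voxceleb_outputs/mfcc/raw_mfcc_train.1.ark:42'

# The last subfolder of data_folder must match a folder in the original path:
base_folder = os.sep + os.path.basename(os.path.normpath(data_folder)) + os.sep  # '/voxceleb_outputs/'
feat_loc = scp_entry.split(base_folder)[1].strip()   # 'mfcc/raw_mfcc_train.1.ark:42'
print(os.path.join(data_folder, feat_loc))           # '/new/location/voxceleb_outputs/mfcc/raw_mfcc_train.1.ark:42'
```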
--------------------------------------------------------------------------------
/asvtorch/misc/misc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from os.path import isfile, join
4 | 
5 | def ensure_exists(folder):
6 |     """If the folder does not exist, create it.
7 | 
8 |     Arguments:
9 |         folder {string} -- Folder.
10 |     """
11 |     if not os.path.exists(folder):
12 |         os.makedirs(folder)
13 | 
14 | def ensure_npz(filename, inverse=False):
15 |     if inverse:
16 |         if filename.endswith('.npz'):
17 |             filename = filename[:-4]
18 |     else:
19 |         if not filename.endswith('.npz'):
20 |             filename = filename + '.npz'
21 |     return filename
22 | 
23 | def ensure_tar(filename, inverse=False):
24 |     if inverse:
25 |         if filename.endswith('.tar'):
26 |             filename = filename[:-4]
27 |     else:
28 |         if not filename.endswith('.tar'):
29 |             filename = filename + '.tar'
30 |     return filename
31 | 
32 | def list_files(folder):
33 |     return [f for f in os.listdir(folder) if isfile(join(folder, f))]
34 | 
35 | def test_finiteness(tensor, description):
36 |     if (~torch.isfinite(tensor)).sum() > 0:
37 |         print('{}: NOT FINITE!'.format(description))
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # This is an example config file.
2 | # The format is plain text, but the .py extension is used to get the editor's syntax highlighting.
3 | 
4 | # Lines starting with # are comment lines and the config parser ignores them.
5 | # The comment sign # only works when it is placed at the beginning of a line.
6 | 
7 | # Empty lines separate settings for different runs.
8 | # Different runs are executed one after another by run_voxceleb_ivector.py.
9 | # After each run the settings are reverted to their default values.
10 | # That is, settings of the previous run do not affect the settings of the next run.
11 | 
12 | # The settings that are not specified in this file will have the default values defined in the settings classes.
13 | 
14 | 
15 | # RUN 0:
16 | # Compute and save frame posteriors to disk
17 | # This can be commented out after it has been done once.
18 | recipe.start_stage = 0
19 | recipe.end_stage = 1
20 | 
21 | # RUN 1:
22 | # Try two different variations of augmented model training (with and without residual covariance updates):
23 | recipe.start_stage = 2
24 | ivector.type = ['augmented']
25 | ivector.update_covariances = [False, True]
26 | ivector.minimum_divergence = [True]
27 | ivector.update_means = [True]
28 | # Increase the number of iterations to improve performance:
29 | ivector.n_iterations = 5
30 | ivector.dataloader_workers = 44
31 | ivector.ivec_dim = 400
32 | 
33 | # RUN 2:
34 | # Try four parameter combinations [residual updates (True/False) and minimum_divergence (True/False)] with the standard formulation:
35 | recipe.start_stage = 2
36 | ivector.type = ['standard']
37 | ivector.update_covariances = [True, False]
38 | ivector.minimum_divergence = [True, False]
39 | ivector.update_means = [False]
40 | # Increase the number of iterations to improve performance:
41 | ivector.n_iterations = 5
42 | ivector.dataloader_workers = 44
43 | ivector.ivec_dim = 400
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ivectors
2 | channels:
3 |   - pytorch
4 |   - pykaldi
5 |   - defaults
6 | dependencies:
7 |   - _libgcc_mutex=0.1=main
8 |   - blas=1.0=openblas
9 |   - ca-certificates=2019.8.28=0
10 |   - certifi=2019.9.11=py37_0
11 |   - cffi=1.12.3=py37h2e261b9_0
12 |   - cudatoolkit=10.0.130=0
13 |   - freetype=2.9.1=h8a8886c_1
14 |   - intel-openmp=2019.4=243
15 |   - jpeg=9b=h024ee3a_2
16 |   - libedit=3.1.20181209=hc058e9b_0
17 |   - libffi=3.2.1=hd88cf55_4
18 |   - libgcc-ng=9.1.0=hdf63c60_0
19 |   - libgfortran-ng=7.3.0=hdf63c60_0
20 |   - libopenblas=0.3.6=h5a2b251_1
21 |   - libpng=1.6.37=hbc83047_0
22 |   - libstdcxx-ng=9.1.0=hdf63c60_0
23 |   - libtiff=4.0.10=h2733197_2
24 |   - llvmlite=0.29.0=py37hd408876_0
25 |   - mkl=2019.4=243
26 |   - ncurses=6.1=he6710b0_1
27 |   - ninja=1.9.0=py37hfd86e86_0
28 |   - nomkl=3.0=0
29 |   - numpy=1.16.5=py37h99e49ec_0
30 |   - numpy-base=1.16.5=py37h2f8d375_0
31 |   - olefile=0.46=py37_0
32 |   - openblas=0.3.6=1
33 |   - openblas-devel=0.3.6=1
34 |   - openssl=1.1.1d=h7b6447c_1
35 |   - pillow=6.1.0=py37h34e0f95_0
36 |   - pip=19.2.2=py37_0
37 |   - pycparser=2.19=py37_0
38 |   - python=3.7.4=h265db76_1
39 |   - readline=7.0=h7b6447c_5
40 |   - scipy=1.3.1=py37he2b7bc3_0
41 |   - setuptools=41.2.0=py37_0
42 |   - six=1.12.0=py37_0
43 |   - sqlite=3.29.0=h7b6447c_0
44 |   - tk=8.6.8=hbc83047_0
45 |   - wheel=0.33.6=py37_0
46 |   - xz=5.2.4=h14c3975_4
47 |   - zlib=1.2.11=h7b6447c_3
48 |   - zstd=1.3.7=h0b5b093_0
49 |   - pykaldi-cpu=0.1.3=py37h14c3975_1
50 |   - pytorch=1.2.0=py3.7_cuda10.0.130_cudnn7.6.2_0
51 |   - torchvision=0.4.0=py37_cu100
52 |   - pip:
53 |     - pykaldi==0.1.2
54 |     - torch==1.2.0
55 | 
56 | 
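The run grammar described at the top of config.py above (comments only at line start, empty lines separating runs, list-valued settings defining parameter grids) can be illustrated with a minimal parsing sketch. This is independent of the actual ParameterChanger implementation in asvtorch/evaluation/parameters.py, which is not included in this dump:

```python
import ast

def parse_runs(config_file):
    """Splits a config file into one settings dict per run."""
    runs, current = [], {}
    with open(config_file) as f:
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('#'):       # the comment sign only counts at line start
                continue
            if not line.strip():           # an empty line ends the current run
                if current:
                    runs.append(current)
                    current = {}
                continue
            key, value = (part.strip() for part in line.split('=', 1))
            current[key] = ast.literal_eval(value)   # e.g. 5, [True, False], ['standard']
    if current:
        runs.append(current)
    return runs

# For the example config above this yields three dicts; list-valued entries such
# as ivector.update_covariances = [True, False] would then be expanded into one
# sub-run per combination, with unspecified settings kept at their defaults.
print(parse_runs('config.py'))
```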
-------------------------------------------------------------------------------- /kaldi/egs/voxceleb/v1/extract_feats_and_train_ubm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) 3 | # 2017 Johns Hopkins University (Author: Daniel Povey) 4 | # 2017-2018 David Snyder 5 | # 2018 Ewald Enzinger 6 | # 7 | # 2019 Ville Vestman 8 | # Apache 2.0. 9 | 10 | 11 | . ./cmd.sh 12 | . ./path.sh 13 | set -e 14 | 15 | # This script should be run from egs/voxceleb/v1/ folder of your Kaldi installation. 16 | # This script extracts MFCCs and trains the UBM following the original VoxCeleb v1 recipe. 17 | 18 | # CHANGE THE FOLLOWING THREE FOLDERS BEFORE RUNNING THE SCRIPT: 19 | output_dir=/media/hdd2/vvestman/voxceleb_outputs 20 | voxceleb1_root=/media/hdd3/voxceleb 21 | voxceleb2_root=/media/hdd3/voxceleb2 22 | 23 | mfccdir=$output_dir/mfcc 24 | vaddir=$output_dir/mfcc 25 | 26 | stage=0 27 | 28 | if [ $stage -le 0 ]; then 29 | local/make_voxceleb2.pl $voxceleb2_root dev $output_dir/voxceleb2_train 30 | local/make_voxceleb2.pl $voxceleb2_root test $output_dir/voxceleb2_test 31 | local/make_voxceleb1.pl $voxceleb1_root $output_dir #IF YOU ARE USING THE NEWEST VERSION OF VOXCELEB1, THIS SCRIPT PROBABLY DOES NOT WORK (data organization changed from the original version) 32 | 33 | # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. 34 | utils/combine_data.sh $output_dir/train $output_dir/voxceleb2_train $output_dir/voxceleb2_test $output_dir/voxceleb1_train 35 | fi 36 | 37 | if [ $stage -le 1 ]; then 38 | # Make MFCCs and compute the energy-based VAD for each dataset 39 | for name in train voxceleb1_test; do 40 | steps/make_mfcc.sh --write-utt2num-frames true \ 41 | --mfcc-config conf/mfcc.conf --nj 16 --cmd "$train_cmd" \ 42 | $output_dir/${name} $output_dir/make_mfcc $mfccdir 43 | utils/fix_data_dir.sh $output_dir/${name} 44 | sid/compute_vad_decision.sh --nj 16 --cmd "$train_cmd" \ 45 | $output_dir/${name} $output_dir/make_vad $vaddir 46 | utils/fix_data_dir.sh $output_dir/${name} 47 | done 48 | fi 49 | 50 | if [ $stage -le 2 ]; then 51 | # Train the UBM. 52 | sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \ 53 | --nj 16 --num-threads 8 \ 54 | $output_dir/train 2048 \ 55 | $output_dir/diag_ubm 56 | 57 | sid/train_full_ubm.sh --cmd "$train_cmd --mem 40G" \ 58 | --nj 16 --remove-low-count-gaussians false \ 59 | $output_dir/train \ 60 | $output_dir/diag_ubm $output_dir/full_ubm 61 | fi 62 | 63 | echo "Done!" 
64 | 
--------------------------------------------------------------------------------
/run_voxceleb_ivector.py:
--------------------------------------------------------------------------------
1 | import os
2 | import socket
3 | import datetime
4 | 
5 | import torch
6 | import numpy as np
7 | 
8 | import asvtorch.kaldidata.kaldifeatloaders
9 | import asvtorch.kaldidata.utils
10 | from asvtorch.misc.misc import ensure_exists
11 | import asvtorch.ivector.posteriors
12 | import asvtorch.global_setup  # provides asvtorch.global_setup.device used below
13 | import asvtorch.ivector.ivector_extractor
14 | import asvtorch.ivector.settings
15 | import asvtorch.evaluation.trials
16 | import asvtorch.evaluation.eval_metrics
17 | from asvtorch.ivector.gmm import Gmm
18 | from asvtorch.backend.plda import Plda
19 | from asvtorch.backend.vector_processing import VectorProcessor
20 | from asvtorch.evaluation.parameters import ParameterChanger
21 | 
22 | # UPDATE THIS TO THE SAME FOLDER THAT WAS USED IN THE KALDI SCRIPT FOR OUTPUTS:
23 | DATA_FOLDER = '/media/hdd2/vvestman/voxceleb_outputs'
24 | 
25 | TRY_TO_USE_GPU = True
26 | 
27 | if TRY_TO_USE_GPU:
28 |     if torch.cuda.is_available():
29 |         asvtorch.global_setup.device = torch.device("cuda:0")
30 |         print('Using GPU!')
31 | 
32 | print('Loading settings...')
33 | posterior_extraction_settings = asvtorch.ivector.settings.PosteriorExtractionSettings()
34 | posterior_extraction_settings.dataloader_workers = 4
35 | ivector_settings = asvtorch.ivector.settings.IVectorSettings()
36 | 
37 | class RecipeSettings():
38 |     def __init__(self):
39 |         self.start_stage = 0
40 |         self.end_stage = 3
41 |         self.plda_dim = 200
42 | recipe_settings = RecipeSettings()
43 | 
44 | 
45 | posterior_extraction_settings.print_settings()
46 | ivector_settings.print_settings()
47 | 
48 | parameter_changer = ParameterChanger('config.py', {'ivector': ivector_settings, 'recipe': recipe_settings})
49 | ensure_exists(os.path.join(DATA_FOLDER, 'results'))
50 | resultfile = open(os.path.join(DATA_FOLDER, 'results', 'results_{}.txt'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))), 'w')
51 | 
52 | 
53 | # Input data:
54 | TRAIN_FOLDER = os.path.join(DATA_FOLDER, 'train')
55 | TEST_FOLDER = os.path.join(DATA_FOLDER, 'voxceleb1_test')
56 | TRIAL_FILE = os.path.join(TEST_FOLDER, 'trials')
57 | UBM_FILE = os.path.join(DATA_FOLDER, 'full_ubm', 'final.ubm')
58 | VOX1_TRAIN_WAVFILE = os.path.join(DATA_FOLDER, 'voxceleb1_train', 'wav.scp')
59 | 
60 | # Output files:
61 | IVEC_TRAINING_POSTERIOR_FILE = os.path.join(DATA_FOLDER, 'posteriors', 'ivec_posteriors')
62 | BACKEND_TRAINING_POSTERIOR_FILE = os.path.join(DATA_FOLDER, 'posteriors', 'backend_posteriors')
63 | TESTING_POSTERIOR_FILE = os.path.join(DATA_FOLDER, 'posteriors', 'testing_posteriors')
64 | ensure_exists(os.path.join(DATA_FOLDER, 'posteriors'))
65 | 
66 | EXTRACTOR_OUTPUT_FILE = os.path.join(DATA_FOLDER, 'iextractor', 'iextractor')
67 | ensure_exists(os.path.join(DATA_FOLDER, 'iextractor'))
68 | 
69 | 
70 | print('Initializing feature loader...')
71 | feature_loader = asvtorch.kaldidata.kaldifeatloaders.VoxcelebFeatureLoader()
72 | print('Loading KALDI UBM...')
73 | ubm = Gmm.from_kaldi(UBM_FILE, asvtorch.global_setup.device)
74 | 
75 | 
76 | # Dataset preparation
77 | print('Choosing dataset for i-vector extractor training...')
78 | feat_rxspecifiers, vad_rxspecifiers, utt_ids, spk_ids = asvtorch.kaldidata.utils.choose_n_longest(DATA_FOLDER, TRAIN_FOLDER, 100000)
79 | rxspecifiers = (feat_rxspecifiers, vad_rxspecifiers)
80 | 
81 | print('Choosing dataset for PLDA training...')
82 | 
plda_feat_rxspecifiers, plda_vad_rxspecifiers, plda_utt_ids, plda_spk_ids = asvtorch.kaldidata.utils.choose_from_wavfile(DATA_FOLDER, TRAIN_FOLDER, VOX1_TRAIN_WAVFILE, 1) 83 | plda_rxspecifiers = (plda_feat_rxspecifiers, plda_vad_rxspecifiers) 84 | 85 | test_feat_rxspecifiers, test_vad_rxspecifiers, test_utt_ids, test_spk_ids = asvtorch.kaldidata.utils.choose_all(DATA_FOLDER, TEST_FOLDER) 86 | test_rxspecifiers = (test_feat_rxspecifiers, test_vad_rxspecifiers) 87 | 88 | 89 | while parameter_changer.next(): 90 | 91 | # Frame posterior extraction 92 | if recipe_settings.start_stage <= 1 <= recipe_settings.end_stage: 93 | asvtorch.ivector.posteriors.batch_extract_posteriors(rxspecifiers, utt_ids, feature_loader, ubm, IVEC_TRAINING_POSTERIOR_FILE, posterior_extraction_settings) 94 | asvtorch.ivector.posteriors.batch_extract_posteriors(plda_rxspecifiers, plda_utt_ids, feature_loader, ubm, BACKEND_TRAINING_POSTERIOR_FILE, posterior_extraction_settings) 95 | asvtorch.ivector.posteriors.batch_extract_posteriors(test_rxspecifiers, test_utt_ids, feature_loader, ubm, TESTING_POSTERIOR_FILE, posterior_extraction_settings) 96 | 97 | 98 | # Preparing data with posteriors 99 | posterior_rxspecifiers = asvtorch.kaldidata.utils.load_posterior_specifiers(IVEC_TRAINING_POSTERIOR_FILE) 100 | rxspecifiers = (*rxspecifiers, posterior_rxspecifiers) # Tuple of three elements: (feats, vad, posteriors) 101 | plda_posterior_rxspecifiers = asvtorch.kaldidata.utils.load_posterior_specifiers(BACKEND_TRAINING_POSTERIOR_FILE) 102 | plda_rxspecifiers = (*plda_rxspecifiers, plda_posterior_rxspecifiers) # Tuple of three elements: (feats, vad, posteriors) 103 | test_posterior_rxspecifiers = asvtorch.kaldidata.utils.load_posterior_specifiers(TESTING_POSTERIOR_FILE) 104 | test_rxspecifiers = (*test_rxspecifiers, test_posterior_rxspecifiers) # Tuple of three elements: (feats, vad, posteriors) 105 | 106 | 107 | 108 | if recipe_settings.start_stage <= 2 <= recipe_settings.end_stage: 109 | 110 | # I-vector extractor training 111 | ivector_extractor = asvtorch.ivector.ivector_extractor.IVectorExtractor.random_init(ubm, ivector_settings, asvtorch.global_setup.device, seed=0) 112 | iteration_times = ivector_extractor.train(rxspecifiers, feature_loader, EXTRACTOR_OUTPUT_FILE, ivector_settings) 113 | 114 | for iteration in range(1, ivector_settings.n_iterations + 1): 115 | 116 | ivector_extractor = asvtorch.ivector.ivector_extractor.IVectorExtractor.from_npz(EXTRACTOR_OUTPUT_FILE, asvtorch.global_setup.device, iteration) 117 | 118 | # Extracting i-vectors 119 | plda_training_vectors = ivector_extractor.extract(plda_rxspecifiers, feature_loader, ivector_settings) 120 | test_vectors = ivector_extractor.extract(test_rxspecifiers, feature_loader, ivector_settings) 121 | 122 | # Processing i-vectors 123 | vector_processor = VectorProcessor.train(plda_training_vectors, 'cwl', asvtorch.global_setup.device) 124 | plda_training_vectors = vector_processor.process(plda_training_vectors) 125 | test_vectors = vector_processor.process(test_vectors) 126 | 127 | # Training PLDA 128 | plda = Plda.train_closed_form(plda_training_vectors, plda_spk_ids, asvtorch.global_setup.device) 129 | 130 | # Arranging trials 131 | left_vectors, right_vectors, labels = asvtorch.evaluation.trials.organize_trials(test_vectors, test_utt_ids, TRIAL_FILE) 132 | 133 | # Scoring 134 | scores = plda.score_trials(left_vectors, right_vectors, recipe_settings.plda_dim) 135 | eer = asvtorch.evaluation.eval_metrics.compute_eer(scores[labels], scores[~labels])[0] 136 | 137 | # 
Printing results 138 | print(parameter_changer.get_current_string(compact=False)) 139 | print('EER: {:.4f} %'.format(eer*100)) 140 | resultfile.write('{:.4f} {:.2f} {} {}\n'.format(eer*100, iteration_times[iteration-1], iteration, parameter_changer.get_value_string())) 141 | resultfile.flush() 142 | 143 | resultfile.close() 144 | --------------------------------------------------------------------------------
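The EER reported above is the operating point at which the false rejection and false acceptance rates coincide. As an illustration of the quantity that compute_eer returns (the repository's implementation in asvtorch/evaluation/eval_metrics.py is not included in this dump), here is a minimal sketch computed from target and non-target score arrays:

```python
import numpy as np

def eer_sketch(target_scores, nontarget_scores):
    # Sweep a threshold over all observed scores; the EER lies where the
    # false rejection rate and the false acceptance rate cross.
    thresholds = np.sort(np.concatenate((target_scores, nontarget_scores)))
    frr = np.array([(target_scores < t).mean() for t in thresholds])
    far = np.array([(nontarget_scores >= t).mean() for t in thresholds])
    idx = np.argmin(np.abs(frr - far))
    return (frr[idx] + far[idx]) / 2

target = np.array([2.3, 1.8, 0.9, 2.9])       # scores of same-speaker trials
nontarget = np.array([-1.2, 0.2, 1.0, -0.4])  # scores of different-speaker trials
print('EER: {:.4f} %'.format(eer_sketch(target, nontarget) * 100))  # EER: 25.0000 %
```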