├── dataset
├── README.md
├── diarizationExample.wav
└── diarizationExample.segments
├── src
├── README.md
├── gmm.py
├── SAD.py
└── ReDiarization.py
└── README.md
/dataset/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/dataset/diarizationExample.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rehan-Ahmad/SpeakerDiarization/HEAD/dataset/diarizationExample.wav
--------------------------------------------------------------------------------
/dataset/diarizationExample.segments:
--------------------------------------------------------------------------------
1 | 0,6.3,speakerA
2 | 6.3,11.3,speakerB
3 | 11.3,15.8,speakerC
4 | 15.8,18.8,speakerB
5 | 18.8,27.8,speakerC
6 | 27.8,34.4,speakerB
7 | 34.4,42,speakerD
8 |
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | ### Diarization output for the short audio recording in the dataset folder. Initially 16 clusters are created; similar clusters are then merged hierarchically based on the BIC metric.
2 | ====================== CLUSTERING ======================
3 | size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10]
4 | size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10]
5 | size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10]
6 | size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10]
7 | size of each cluster: [5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 15]
8 | size of each cluster: [5, 5, 5, 5, 5, 10, 10, 10, 15, 10]
9 | size of each cluster: [5, 5, 5, 5, 10, 10, 10, 10, 20]
10 | size of each cluster: [5, 5, 5, 10, 10, 10, 20, 15]
11 | size of each cluster: [5, 5, 5, 10, 10, 20, 25]
12 | size of each cluster: [5, 5, 10, 10, 25, 25]
13 | size of each cluster: [5, 10, 25, 25, 15]
14 | size of each cluster: [5, 10, 25, 25, 15]
15 | === Final size of each cluster: [5, 10, 25, 25, 15]
16 |
17 | Diarization Error Rate: 9.96 %
18 | Cluster Purity: 94.77 %
19 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpeakerDiarization
2 | ## Requirements:
3 | I have used Anaconda distribution with python3.
4 | ### Install other dependencies as follows:
5 | pip install pyannote.metrics librosa
6 | ## Main File:
7 | Use ReDiarization.py to run the speaker Diarization on audio files.
8 | Read the "main" section carefully and set its flags to run a specific scenario,
9 | e.g. whether to use sparse feature extraction together with MFCC features.
10 | Set your own paths in "main" for input audio and other output files.
11 | ## Data Set:
12 | A short audio file consisting of a 40-second recording, together with its annotation, is placed in the "dataset" folder. To work with longer audio files, you can use the AMI corpus and its annotations from http://groups.inf.ed.ac.uk/ami/download/.
13 | ## References:
14 | The main implementation of speaker diarization is based on GMM hierarchical agglomerative clustering, which is taken from [1], while the speech activity detection is taken from [2]. pyannote.metrics [3] is used to compute the diarization error rate, cluster purity, etc.
15 |
16 | __[1] E. Gonina, G. Friedland, H. Cook and K. Keutzer, "Fast speaker diarization using a high-level scripting language," 2011 IEEE Workshop on Automatic Speech Recognition & Understanding, Waikoloa, HI, 2011, pp. 553-558.__
17 |
18 | __[2] @article{giannakopoulos2015pyaudioanalysis,
19 | title={pyAudioAnalysis: An Open-Source Python Library for Audio Signal Analysis},
20 | author={Giannakopoulos, Theodoros},
21 | journal={PloS one},
22 | volume={10},
23 | number={12},
24 | year={2015},
25 | publisher={Public Library of Science}
26 | }__
27 |
28 | __[3] @inproceedings{pyannote.metrics,
29 | author = {Herv\'e Bredin},
30 | title = {{pyannote.metrics: a toolkit for reproducible evaluation, diagnostic, and error analysis of speaker diarization systems}},
31 | booktitle = {{Interspeech 2017, 18th Annual Conference of the International Speech Communication Association}},
32 | year = {2017},
33 | month = {August},
34 | address = {Stockholm, Sweden},
35 | url = {http://pyannote.github.io/pyannote-metrics},
36 | }__
37 |
--------------------------------------------------------------------------------
/src/gmm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy.random import random
3 | from sklearn.mixture import GaussianMixture
4 | import pdb
5 |
class GMMComponents(object):
    """
    Holder for the per-component parameters (weights, means, covariances) of a GMM
    with M components over D features.
    """

    def __init__(self, M, D, weights = None, means = None, covars = None):
        self.M = M
        self.D = D

        # Missing parameters are filled with uniform random values. The draw
        # order (weights, then means, then covars) is fixed so that a seeded
        # numpy random state always yields the same initial values.
        if weights is None:
            weights = random(M)
        if means is None:
            means = random((M, D))
        if covars is None:
            covars = random((M, D))

        self.weights = weights
        self.means = means
        self.covars = covars
        # Uninitialized float32 scratch buffer, one slot per component.
        self.comp_probs = np.empty(M, dtype=np.float32)

    def init_random_weights(self):
        """Replace the weights with M fresh random values."""
        self.weights = random(self.M)

    def init_random_means(self):
        """Replace the means with an (M, D) array of fresh random values."""
        shape = (self.M, self.D)
        self.means = random(shape)

    def init_random_covars(self):
        """Replace the covariances with an (M, D, D) array of fresh random values."""
        shape = (self.M, self.D, self.D)
        self.covars = random(shape)

    def shrink_components(self, new_M):
        """
        Resize the parameter arrays for new_M components.

        The results are flat 1-D arrays of new_M, new_M*D and new_M*D*D
        elements respectively (numpy.resize repeats the data when it has to grow).
        """
        n_mean_values = new_M * self.D
        n_covar_values = n_mean_values * self.D
        self.weights = np.resize(self.weights, new_M)
        self.means = np.resize(self.means, n_mean_values)
        self.covars = np.resize(self.covars, n_covar_values)
31 |
class GMMEvalData(object):
    """
    The Python interface to the evaluation data generated by scoring a GMM.

    Attributes:
        N, M: the sizes passed to the constructor / resize()
        memberships: float32 array of shape (M, N), zero-initialized
        loglikelihoods: float32 array of length N, zero-initialized
        likelihood: scalar score of the last training run (0.0 initially)
    """
    def __init__(self, N, M):
        self.N = N
        self.M = M
        self.memberships = np.zeros((M,N), dtype=np.float32)
        self.loglikelihoods = np.zeros(N, dtype=np.float32)
        self.likelihood = 0.0

    @staticmethod
    def _resized(arr, shape):
        """
        Return a new C-contiguous array of the given shape that keeps the leading
        elements of arr (in flat order) and zero-fills the remainder.

        This mirrors what ndarray.resize does, but never fails because the array
        is referenced elsewhere or does not own its data.
        """
        out = np.zeros(shape, dtype=arr.dtype)
        n_keep = min(out.size, arr.size)
        flat_out = out.reshape(-1)  # view on the freshly allocated buffer
        flat_out[:n_keep] = np.ravel(arr)[:n_keep]
        return out

    def resize(self, N, M):
        """
        Resize the buffers to hold N observations and M components.

        Existing values are kept in flat order and new slots are zero-filled.
        """
        # In-place ndarray.resize raises ValueError as soon as a caller holds
        # another reference to the array, so fresh arrays are allocated instead.
        self.memberships = self._resized(self.memberships, (M, N))
        self.loglikelihoods = self._resized(self.loglikelihoods, (N,))
        self.M = M
        self.N = N
50 |
class GMM(object):
    """
    The specialized GMM abstraction.

    Wraps a sklearn GaussianMixture (stored in self.clf, created lazily on the
    first training call) together with the component parameters and the
    evaluation data of the last train/eval call.
    """
    cvtype_name_list = ['diag','full'] #Types of covariance matrix

    def __init__(self, M, D, means=None, covars=None, weights=None, cvtype='diag'):
        """
        cvtype must be one of 'diag' or 'full'. Uninitialized components will be seeded.

        ARGUMENTS:
            - M:        number of mixture components
            - D:        number of features per observation
            - means, covars, weights: optional initial component parameters
            - cvtype:   covariance type, one of GMM.cvtype_name_list
        RAISES:
            - RuntimeError if cvtype is not an allowed covariance type
        """
        self.M = M
        self.D = D
        if cvtype in GMM.cvtype_name_list:
            self.cvtype = cvtype
        else:
            raise RuntimeError("Specified cvtype is not allowed, try one of " + str(GMM.cvtype_name_list))

        self.components = GMMComponents(M, D, weights, means, covars)
        self.eval_data = GMMEvalData(1, M)
        self.clf = None # pure python mirror module

        # The model counts as seeded as soon as any parameter was supplied.
        self.components_seeded = not (means is None and covars is None and weights is None)

    def _warn_on_feature_mismatch(self, data):
        """Print the (historical, non-fatal) message when data does not have D columns."""
        if data.shape[1] != self.D:
            print("Error: Data has %d features, model expects %d features." % (data.shape[1], self.D))

    # Training and Evaluation of GMM
    def train_using_python(self, input_data, iters=10):
        """
        Fit the underlying GaussianMixture on input_data.

        The estimator is only constructed on the first call, so `iters` (and the
        initial weights/means of a seeded model) only take effect then; later
        calls refit the existing warm-started estimator.
        NOTE: this reseeds the global numpy random state with a fixed seed.

        RETURNS:
            - (means, covariances) of the fitted estimator
        """
        seed = 6
        np.random.seed(seed)
        if self.clf is None:
            params = dict(n_components=self.M, covariance_type=self.cvtype, warm_start=True,
                          max_iter=iters, random_state=seed, init_params='random', reg_covar=0.01)
            if self.components_seeded:
                # The stored covariances are deliberately not passed on
                # (precisions_init is left at its default).
                params.update(weights_init=self.components.weights,
                              means_init=self.components.means)
            self.clf = GaussianMixture(**params)

        self.clf.fit(input_data)
        return self.clf.means_, self.clf.covariances_

    def train(self, input_data, min_em_iters=1, max_em_iters=10):
        """
        Train the GMM on the data. Optionally specify max and min iterations.

        min_em_iters is accepted for interface compatibility but is not used.

        RETURNS:
            - the BIC of the fitted model on input_data; it is also stored in
              self.eval_data.likelihood (despite the attribute name)
        """
        self._warn_on_feature_mismatch(input_data)

        self.components.means,self.components.covars = self.train_using_python(input_data, iters=max_em_iters)
        self.components.weights = self.clf.weights_

        self.eval_data.likelihood = self.clf.bic(input_data)

        return self.eval_data.likelihood

    def eval(self, obs_data):
        """
        Score observations with the trained model.

        RETURNS:
            - logprob:    N log probabilities (score_samples)
            - posteriors: NxM posterior probabilities for each component (predict_proba)
        Both are also stored in self.eval_data.
        """
        self._warn_on_feature_mismatch(obs_data)

        self.eval_data.loglikelihoods = self.clf.score_samples(obs_data)
        self.eval_data.memberships = self.clf.predict_proba(obs_data)

        logprob = self.eval_data.loglikelihoods
        posteriors = self.eval_data.memberships
        return logprob, posteriors # N log probabilities, NxM posterior probabilities for each component

    def score(self, obs_data):
        """Return the N per-observation log probabilities."""
        logprob, posteriors = self.eval(obs_data)
        return logprob # N log probabilities

    def decode(self, obs_data):
        """Return the N log probabilities and the N indexes of the most likely components."""
        logprob, posteriors = self.eval(obs_data)
        # posteriors is NxM, so the most likely component is found along axis 1.
        return logprob, posteriors.argmax(axis=1) # N log probabilities, N indexes of most likely components

    def predict(self, obs_data):
        """Return the N indexes of the most likely components."""
        logprob, posteriors = self.eval(obs_data)
        # posteriors is NxM, so the most likely component is found along axis 1.
        return posteriors.argmax(axis=1) # N indexes of most likely components
132 |
133 | #Functions for calculating distance between two GMMs according to BIC scores.
def compute_distance_BIC(gmm1, gmm2, data, em_iters=10):
    """
    Score the merge of two GMMs.

    A temporary GMM with gmm1.M + gmm2.M components is seeded with the stacked
    parameters of both models (weights rescaled by each model's share of the
    components), trained on `data`, and compared against the two originals.

    RETURNS:
        - temp_GMM: the trained merged model
        - score:    (score stored for gmm1 + score stored for gmm2) - score of
                    the merged model, using the values each model's train()
                    call left in eval_data.likelihood
    """
    total_components = gmm1.M + gmm2.M
    share1 = float(gmm1.M) / float(total_components)
    share2 = float(gmm2.M) / float(total_components)

    first = gmm1.components
    second = gmm2.components

    # Weights are flattened and joined; means and covariances are stacked row-wise.
    merged_weights = np.ascontiguousarray(
        np.concatenate((np.ravel(share1 * first.weights), np.ravel(share2 * second.weights))))
    merged_means = np.ascontiguousarray(
        np.concatenate((first.means, second.means), axis=0))
    merged_covars = np.ascontiguousarray(
        np.concatenate((first.covars, second.covars), axis=0))

    temp_GMM = GMM(total_components, gmm1.D, weights=merged_weights, means=merged_means,
                   covars=merged_covars, cvtype=gmm1.cvtype)
    temp_GMM.train(data, max_em_iters=em_iters)

    separate_total = gmm1.eval_data.likelihood + gmm2.eval_data.likelihood
    score = separate_total - temp_GMM.eval_data.likelihood

    return temp_GMM, score
153 |
--------------------------------------------------------------------------------
/src/SAD.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Nov 28 18:42:21 2017
5 |
6 | @author: rehan
7 | """
8 | import numpy as np
9 | import numpy
10 | import sklearn
11 | import matplotlib.pylab as plt
12 | import librosa
13 |
def listOfFeatures2Matrix(features):
    '''
    listOfFeatures2Matrix(features)

    This function takes a list of feature matrices as argument and returns a single concatenated feature matrix and the respective class labels.

    ARGUMENTS:
        - features:        a list of feature matrices (one matrix per class)

    RETURNS:
        - X:            a concatenated matrix of features (empty array for an empty list)
        - Y:            a 1-D vector of class indices, one entry per row of X
    '''
    features = list(features)
    if not features:
        return (numpy.array([]), numpy.array([]))

    # A single matrix is passed through untouched; several are stacked row-wise.
    X = features[0] if len(features) == 1 else numpy.vstack(features)
    # Y is always 1-D, regardless of the number of classes.
    Y = numpy.concatenate([i * numpy.ones(len(f)) for i, f in enumerate(features)])
    return (X, Y)
38 |
def normalizeFeatures(features):
    '''
    This function normalizes a feature set to 0-mean and 1-std.
    Used in most classifier training cases.

    ARGUMENTS:
        - features:        list of feature matrices (each one of them is a 2-D numpy matrix)
    RETURNS:
        - featuresNorm:    list of NORMALIZED feature matrices
        - MEAN:            mean vector
        - STD:             std vector
    '''
    # Statistics are computed over the rows of all non-empty matrices. Empty
    # matrices are skipped wherever they appear in the list, including first.
    non_empty = [f for f in features if f.shape[0] > 0]
    X = np.vstack(non_empty) if non_empty else np.array([])

    # The small offset keeps the division below well-defined for constant features.
    MEAN = np.mean(X, axis=0) + 0.00000000000001
    STD = np.std(X, axis=0) + 0.00000000000001

    featuresNorm = []
    for f in features:
        if f.shape[0] == 0:
            featuresNorm.append(f.copy())
            continue
        ft = (f - MEAN) / STD
        if np.issubdtype(f.dtype, np.floating):
            # Keep the floating-point precision of the input matrix.
            ft = ft.astype(f.dtype, copy=False)
        featuresNorm.append(ft)
    return (featuresNorm, MEAN, STD)
71 |
def trainSVM(features, Cparam):
    '''
    Train a multi-class probabilistic SVM classifier.
    Note:     This function is simply a wrapper to the sklearn functionality for SVM training.
    ARGUMENTS:
        - features:         a list ([numOfClasses x 1]) whose elements contain numpy matrices of features
                            each matrix features[i] of class i is [numOfSamples x numOfDimensions]
        - Cparam:           SVM parameter C (cost of constraints violation)
    RETURNS:
        - svm:              the trained SVM variable

    NOTE:
        This function trains a linear-kernel SVM for a given C value. For a different kernel, other types of parameters should be provided.
    '''
    # "import sklearn" alone does not load the svm submodule, so it is imported
    # explicitly here instead of relying on another module having loaded it.
    from sklearn.svm import SVC

    [X, Y] = listOfFeatures2Matrix(features)
    svm = SVC(C = Cparam, kernel = 'linear', probability = True)
    # Labels are flattened because fit() expects a 1-D target vector.
    svm.fit(X, numpy.ravel(Y))

    return svm
93 |
def smoothMovingAvg(inputSignal, windowLen=11):
    '''
    Smooth a 1-D signal with a moving average of windowLen samples.

    ARGUMENTS:
        - inputSignal:    1-D numpy array
        - windowLen:      window length in samples (converted to int); windows
                          shorter than 3 samples return the input unchanged
    RETURNS:
        - the smoothed signal, with the same length as inputSignal
    RAISES:
        - ValueError if the input is not 1-D or is shorter than the window
    '''
    windowLen = int(windowLen)
    if inputSignal.ndim != 1:
        raise ValueError("smoothMovingAvg only accepts 1-dimensional arrays.")
    if inputSignal.size < windowLen:
        raise ValueError("Input vector needs to be bigger than window size.")
    if windowLen < 3:
        return inputSignal
    # Pad both ends with point-reflected copies of the signal so that the
    # averaging window is fully populated at the borders.
    s = numpy.r_[2*inputSignal[0] - inputSignal[windowLen-1::-1], inputSignal, 2*inputSignal[-1]-inputSignal[-1:-windowLen:-1]]
    w = numpy.ones(windowLen, 'd')
    y = numpy.convolve(w/w.sum(), s, mode='same')
    # Drop the padding again.
    return y[windowLen:-windowLen+1]
106 |
def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5, plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - Fs:               sampling freq
         - stWin, stStep:    window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - Weight:           (optional) weight factor (0 < Weight < 1) the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - segmentLimits:    list of segment limits in seconds (e.g [[0.1, 0.9], [1.4, 3.0]] means that
                             the resulting segments are (0.1 - 0.9) seconds and (1.4, 3.0) seconds
         - MaxIdx:           indices of the frames whose smoothed onset probability exceeds the threshold
    '''

    if Weight >= 1:
        Weight = 0.99
    if Weight <= 0:
        Weight = 0.01

    # Step 1: feature extraction (frame energy followed by 19 MFCCs)
    frameLength = int(Fs * stWin)
    hopLength = int(Fs * stStep)
    S = librosa.feature.melspectrogram(y=x, sr=Fs, n_fft=frameLength, hop_length=hopLength)
    fVects = librosa.feature.mfcc(y=x, S=librosa.power_to_db(S), sr=Fs, n_mfcc=19)
    # librosa renamed "rmse" to "rms" and later removed the old name; prefer
    # the new one and fall back to the old one for older installations.
    rmsFunc = getattr(librosa.feature, 'rms', None)
    if rmsFunc is None:
        rmsFunc = librosa.feature.rmse
    feaEnergy = rmsFunc(y=x, frame_length=frameLength, hop_length=hopLength)
    ShortTermFeatures = np.append(feaEnergy, fVects, axis=0)

    # Step 2: train binary SVM classifier of low vs high energy frames
    EnergySt = ShortTermFeatures[0, :]                  # keep only the energy short-term sequence (1st feature)
    E = numpy.sort(EnergySt)                            # sort the energy feature values:
    L1 = int(len(E) / 10)                               # number of 10% of the total short-term windows
    T1 = numpy.mean(E[0:L1]) + 0.000000000000001        # compute "lower" 10% energy threshold
    T2 = numpy.mean(E[-L1:-1]) + 0.000000000000001      # compute "higher" 10% energy threshold
    Class1 = ShortTermFeatures[:, numpy.where(EnergySt <= T1)[0]]   # get all features that correspond to low energy
    Class2 = ShortTermFeatures[:, numpy.where(EnergySt >= T2)[0]]   # get all features that correspond to high energy
    featuresSS = [Class1.T, Class2.T]                   # form the binary classification task and ...

    [featuresNormSS, MEANSS, STDSS] = normalizeFeatures(featuresSS)   # normalize and ...
    SVM = trainSVM(featuresNormSS, 1.0)                 # train the respective SVM probabilistic model (ONSET vs SILENCE)

    # Step 3: compute onset probability based on the trained SVM
    # (all frames are normalized and scored in one call; column 1 is the ONSET class)
    normalizedFrames = (ShortTermFeatures.T - MEANSS) / STDSS
    ProbOnset = numpy.array(SVM.predict_proba(normalizedFrames)[:, 1])
    ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep)   # smooth probability

    # Step 4A: detect onset frame indices:
    ProbOnsetSorted = numpy.sort(ProbOnset)             # find probability Threshold as a weighted average of top 10% and lower 10% of the values
    Nt = int(ProbOnsetSorted.shape[0] / 10)
    T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) + Weight * numpy.mean(ProbOnsetSorted[-Nt::]))
    MaxIdx = numpy.where(ProbOnset > T)[0]              # get the indices of the frames that satisfy the thresholding
    i = 0
    timeClusters = []
    segmentLimits = []

    # Step 4B: group frame indices to onset segments
    # (indices at most 2 frames apart end up in the same segment)
    while i < len(MaxIdx):                              # for each of the detected onset indices
        curCluster = [MaxIdx[i]]
        if i == len(MaxIdx)-1:
            break
        while MaxIdx[i+1] - curCluster[-1] <= 2:
            curCluster.append(MaxIdx[i+1])
            i += 1
            if i == len(MaxIdx)-1:
                break
        i += 1
        timeClusters.append(curCluster)
        segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep])

    # Step 5: Post process: remove very small segments:
    minDuration = 0.2
    segmentLimits = [s for s in segmentLimits if s[1] - s[0] > minDuration]

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('Signal')
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep), ProbOnset)
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('SVM Probability')
        plt.show()

    return segmentLimits, MaxIdx
206 |
--------------------------------------------------------------------------------
/src/ReDiarization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Parameters:
5 | 1. length of feature vectors
6 | 2. types of features
7 | 3. Gaussian mixtures
8 | 4. Segment length
9 | 5. Number of iterations of EM
10 | 6.
11 |
12 | TODO: Density estimation of GMM for accuracy of GMM for specific speaker.
13 |
14 | Created on Sun Nov 19 12:29:28 2017
15 |
16 | @author: rehan
17 |
18 | --- Optional: ---
19 | spnsp_file: spnsp file (all features used by default)
20 | em_iterations: Number of iterations for the standard
21 | segmentation loop training (3 by default)
22 | num_seg_iters_init: Number of majority vote iterations
23 | in the initialization phase (2 by default)
24 | num_seg_iters: Number of majority vote iterations
25 | in the main loop (3 by default)
26 | seg_length: Segment length for majority vote in frames
27 | (250 frames by default)
28 |
29 | """
30 | import time
31 | import scipy.stats.mstats as stats
32 | import numpy as np
33 | from gmm import *
34 | from SAD import silenceRemoval
35 | from pyannote.core import Segment, Timeline, Annotation
36 | from pyannote.metrics.diarization import DiarizationErrorRate
37 | from pyannote.metrics.diarization import DiarizationPurity
38 | from pyannote.metrics.detection import DetectionErrorRate
39 | import librosa
40 | import xml.etree.ElementTree as ET
41 | from copy import copy
42 | from sklearn.preprocessing import StandardScaler
43 |
class Diarizer(object):
    """
    Speaker diarization by agglomerative clustering of GMMs.

    Typical use: construct with the feature matrix, create the initial models
    with new_gmm_list(), run cluster(), then write the result with
    write_to_RTTM() / write_to_GMM().
    """

    def __init__(self, data, total_frames):
        """
        ARGUMENTS:
            - data:          feature data laid out as (features x observations);
                             it is converted to float32 and transposed into self.X
            - total_frames:  total number of frames, stored as self.total_num_frames
                             and used as the length of the RTTM timeline
        """
        self.X = np.array(data, dtype = np.float32).T

        self.N = self.X.shape[0]
        self.D = self.X.shape[1]
        self.total_num_frames = total_frames

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value in labels (the smallest one on ties)."""
        values, counts = np.unique(labels, return_counts=True)
        return values[counts.argmax()]

    def write_to_RTTM(self, rttm_file_name, sp_file_name,\
                      meeting_name, most_likely, num_gmms,\
                      seg_length):
        """
        Smooth the per-frame labels by majority vote and write them as RTTM lines.

        ARGUMENTS:
            - rttm_file_name:  path of the RTTM file to write
            - sp_file_name:    optional path of a space-delimited file whose first two
                               columns are read as start/end frame of each speech segment;
                               when falsy, most_likely is taken to cover the whole timeline
            - meeting_name:    name written into every RTTM line
            - most_likely:     array with one cluster label per (speech) frame
            - num_gmms:        unused, kept for interface compatibility
            - seg_length:      number of frames per majority-vote chunk
        """
        print("...Writing out RTTM file...")

        # Majority vote in chunks of seg_length frames; the last chunk may be shorter.
        n_labels = len(most_likely)
        pieces = []
        for chunk_start in range(0, n_labels, seg_length):
            chunk_end = min(chunk_start + seg_length, n_labels)
            winner = self._majority_label(most_likely[chunk_start:chunk_end])
            pieces.append(winner*np.ones(chunk_end - chunk_start))
        if pieces:
            most_likely = np.concatenate(pieces)
        else:
            most_likely = np.array([], dtype=np.float32)

        # -1 marks frames without speech.
        with_non_speech = -1*np.ones(self.total_num_frames)

        if sp_file_name:
            # ndmin=2 keeps a file with a single segment iterable as (start, end) pairs.
            speech_seg = np.loadtxt(sp_file_name, delimiter=' ',usecols=(0,1), ndmin=2)
            speech_seg_i = np.round(speech_seg).astype('int32')
            sizes = np.diff(speech_seg_i)

            sizes = sizes.reshape(sizes.size)
            offsets = np.cumsum(sizes)
            offsets = np.hstack((0, offsets[0:-1]))

            offsets = offsets + np.arange(len(offsets))

            #populate the array with speech clusters
            for counter, pair in enumerate(speech_seg_i):
                st = pair[0]
                en = pair[1]
                speech_index = offsets[counter]
                for idx, x in enumerate(range(st+1, en+1)):
                    with_non_speech[x] = most_likely[speech_index+idx]
        else:
            with_non_speech = most_likely

        def rttm_line(start_secs, dur_secs, cnum):
            return "SPEAKER " + meeting_name + " 1 " +\
                   str(start_secs) + " "+ str(dur_secs) +\
                   " speaker_" + str(cnum) + "\n"

        with open(rttm_file_name, 'w') as out_file:
            cnum = with_non_speech[0]
            cst = 0
            cen = 0
            # Frame indices are converted to seconds with a fixed factor of 0.01.
            for i in range(1,self.total_num_frames):
                if with_non_speech[i] != cnum:
                    if (cnum >= 0):
                        start_secs = ((cst)*0.01)
                        dur_secs = (cen - cst + 2)*0.01
                        out_file.write(rttm_line(start_secs, dur_secs, cnum))
                    cst = i
                    cen = i
                    cnum = with_non_speech[i]
                else:
                    cen+=1

            # Trailing run. NOTE(review): the offsets used here (+1 on the start and
            # on the duration) differ from the ones in the loop above; kept as found.
            if cst < cen:
                cnum = with_non_speech[self.total_num_frames-1]
                if(cnum >= 0):
                    start_secs = ((cst+1)*0.01)
                    dur_secs = (cen - cst + 1)*0.01
                    out_file.write(rttm_line(start_secs, dur_secs, cnum))

        print("DONE writing RTTM file")

    def write_to_GMM(self, gmmfile):
        """
        Write the parameters of every GMM in self.gmm_list to a text file:
        per cluster the number of Gaussians, and per Gaussian its weight and
        the mean and variance of every feature.
        """
        with open(gmmfile, 'w') as gmm_f:
            gmm_f.write("Number of clusters: " + str(len(self.gmm_list)) + "\n")

            #print parameters
            for cluster_count, gmm in enumerate(self.gmm_list):
                gmm_f.write("Cluster " + str(cluster_count) + "\n")
                means = gmm.components.means
                covars = gmm.components.covars
                weights = gmm.components.weights

                gmm_f.write("Number of Gaussians: "+ str(gmm.M) + "\n")

                for gmm_count in range(0, gmm.M):
                    g_means = means[gmm_count]
                    g_covar_full = covars[gmm_count]
                    # A diagonal covariance is already stored as a vector of variances;
                    # only a full covariance matrix needs its diagonal extracted.
                    if np.ndim(g_covar_full) == 1:
                        g_covar = g_covar_full
                    else:
                        g_covar = np.diag(g_covar_full)
                    g_weight = weights[gmm_count]

                    gmm_f.write("Gaussian: " + str(gmm_count) + "\n")
                    gmm_f.write("Weight: " + str(g_weight) + "\n")

                    for f in range(0, gmm.D):
                        gmm_f.write("Feature " + str(f) + " Mean " + str(g_means[f]) +\
                                    " Var " + str(g_covar[f]) + "\n")

        print("DONE writing GMM file")

    def new_gmm(self, M, cvtype):
        """Create a single unseeded GMM with M components in self.gmm."""
        self.M = M
        self.gmm = GMM(self.M, self.D, cvtype=cvtype)

    def new_gmm_list(self, M, K, cvtype):
        """Create K unseeded GMMs with M components each in self.gmm_list."""
        self.M = M
        self.init_num_clusters = K
        self.gmm_list = [GMM(self.M, self.D, cvtype=cvtype) for i in range(K)]

    def segment_majority_vote(self, interval_size, em_iters):
        """
        Reassign the data to clusters by majority vote and retrain the clusters.

        Every frame is assigned to the GMM that scores it highest; each chunk of
        interval_size consecutive frames (the last one may be shorter) then goes
        to the cluster that won most of its frames, and every cluster that
        received data is retrained on it.

        RETURNS:
            - iter_bic_dict: {index in self.gmm_list: training data of that cluster}
            - iter_bic_list: [(gmm, training data)] for the same clusters
            - most_likely:   per-frame index of the best scoring cluster
        """
        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        scores = [g.score(self.X) for g in self.gmm_list]
        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = np.column_stack(scores).argmax(axis=1)

        # Across each chunk of observations, vote on which cluster they should
        # be associated with. Iterating over chunk starts covers every frame
        # exactly once, also when N is an exact multiple of interval_size.
        iter_training = {}
        for chunk_start in range(0, self.N, interval_size):
            chunk_end = min(chunk_start + interval_size, self.N)
            max_gmm = int(self._majority_label(most_likely[chunk_start:chunk_end]))
            iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).\
                                     append(self.X[chunk_start:chunk_end, :])

        iter_bic_dict = {}
        iter_bic_list = []

        # for each gmm, append all the segments and retrain
        for (g, p), data_list in iter_training.items():
            cluster_data = np.concatenate(data_list)

            g.train(cluster_data, max_em_iters=em_iters)

            iter_bic_list.append((g,cluster_data))
            iter_bic_dict[p] = cluster_data

        return iter_bic_dict, iter_bic_list, most_likely

    def cluster(self, em_iters, KL_ntop, NUM_SEG_LOOPS_INIT, NUM_SEG_LOOPS, seg_length):
        """
        Run the agglomerative clustering.

        ARGUMENTS:
            - em_iters:            EM iterations passed to every GMM training call
            - KL_ntop:             when > 0, only the pairs returned by
                                   find_top_KL_pairs are scored; otherwise all pairs are
            - NUM_SEG_LOOPS_INIT:  majority-vote iterations before the main loop
            - NUM_SEG_LOOPS:       majority-vote iterations per main-loop pass
            - seg_length:          chunk length (in frames) for the majority vote
        RETURNS:
            - most_likely: per-frame labels of the last majority-vote pass
                           (None if no pass was run)
            - ml:          per-frame labels obtained by scoring the final clusters
        """
        print(" ====================== CLUSTERING ====================== ")
        main_start = time.time()

        # ----------- Uniform Initialization -----------
        # Get the events, divide them into an initial k clusters and train each GMM on a cluster
        per_cluster = int(self.N/self.init_num_clusters)
        init_training = list(zip(self.gmm_list,np.vsplit(self.X, list(range(per_cluster, self.N, per_cluster)))))

        for g, x in init_training:
            g.train(x, max_em_iters=em_iters)

        # Defined up front so they exist even when no segmentation pass runs.
        iter_bic_dict, iter_bic_list, most_likely = {}, [], None

        # ----------- First majority vote segmentation loop ---------
        for segment_iter in range(0,NUM_SEG_LOOPS_INIT):
            iter_bic_dict, iter_bic_list, most_likely = self.segment_majority_vote(seg_length, em_iters)

        # ----------- Main Clustering Loop using BIC ------------

        # Perform hierarchical agglomeration based on BIC scores
        best_BIC_score = 1.0
        while (best_BIC_score > 0 and len(self.gmm_list) > 1):

            for segment_iter in range(0,NUM_SEG_LOOPS):
                iter_bic_dict, iter_bic_list, most_likely = self.segment_majority_vote(seg_length, em_iters)

            # Score all pairs of GMMs using BIC
            best_merged_gmm = None
            best_BIC_score = 0.0
            merged_tuple = None

            # ------- KL distance to compute best pairs to merge -------
            if KL_ntop > 0:
                # NOTE(review): this relies on the GMM objects providing
                # find_top_KL_pairs - confirm that the GMM class in use defines it.
                top_K_gmm_pairs = self.gmm_list[0].find_top_KL_pairs(KL_ntop, self.gmm_list)
                for pair in top_K_gmm_pairs:
                    gmm1idx = pair[0]
                    gmm2idx = pair[1]
                    g1 = self.gmm_list[gmm1idx]
                    g2 = self.gmm_list[gmm2idx]

                    if gmm1idx in iter_bic_dict and gmm2idx in iter_bic_dict:
                        d1 = iter_bic_dict[gmm1idx]
                        d2 = iter_bic_dict[gmm2idx]
                        data = np.concatenate((d1,d2))
                    elif gmm1idx in iter_bic_dict:
                        data = iter_bic_dict[gmm1idx]
                    elif gmm2idx in iter_bic_dict:
                        data = iter_bic_dict[gmm2idx]
                    else:
                        continue

                    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)

                    if score > best_BIC_score:
                        best_merged_gmm = new_gmm
                        merged_tuple = (g1, g2)
                        best_BIC_score = score

            # ------- All-to-all comparison of gmms to merge -------
            else:
                l = len(iter_bic_list)

                for gmm1idx in range(l):
                    for gmm2idx in range(gmm1idx+1, l):
                        g1, d1 = iter_bic_list[gmm1idx]
                        g2, d2 = iter_bic_list[gmm2idx]

                        data = np.concatenate((d1,d2))
                        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)

                        if score > best_BIC_score:
                            best_merged_gmm = new_gmm
                            merged_tuple = (g1, g2)
                            best_BIC_score = score

            # Merge the winning candidate pair if it is desirable to do so
            if best_BIC_score > 0.0:
                ids_with_events = set(id(gp[0]) for gp in iter_bic_list)
                merged_ids = (id(merged_tuple[0]), id(merged_tuple[1]))

                # Drop the GMMs that received no data as well as the two merged
                # ones, then add the merged model. The list is rebuilt in one step
                # because removing items while iterating over it skips elements.
                self.gmm_list[:] = [g for g in self.gmm_list
                                    if id(g) in ids_with_events and id(g) not in merged_ids]
                self.gmm_list.append(best_merged_gmm)

            print(" size of each cluster:", [ g.M for g in self.gmm_list])

        print("=== Total clustering time: %.2f min" %((time.time()-main_start)/60))
        print("=== Final size of each cluster:", [ g.M for g in self.gmm_list])
        ################### Added later to find likelihood ####################
        lkhoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            lkhoods = np.column_stack((lkhoods, g.score(self.X)))
        if len(lkhoods.shape)==2:
            ml = lkhoods.argmax(axis=1)
        else:
            ml = np.zeros(len(self.X))
        #######################################################################
        return most_likely,ml
356 |
def DER(outfile, AudioDataSet,annotationlist, audioLength):
    """
    Compute and print the diarization error rate and cluster purity of an RTTM file.

    ARGUMENTS:
        - outfile:         path of the RTTM file holding the hypothesis; start time,
                           duration and label are read from the space-separated
                           fields 3, 4 and 5 of every line
        - AudioDataSet:    'DiaExample' selects the built-in reference segments;
                           any other value reads the reference from annotationlist
        - annotationlist:  paths of four XML files whose 'segment' elements
                           (attributes 'transcriber_start' / 'transcriber_end') are
                           labelled 'A', 'B', 'C' and 'D' in that order
        - audioLength:     end of the evaluated timeline, which starts at 0
    RETURNS:
        - metric, reference, hypothesis
    """
    reference = Annotation()

    if not AudioDataSet=='DiaExample':
        for idx, label in enumerate('ABCD'):
            root = ET.parse(annotationlist[idx]).getroot()
            for child in root.findall('segment'):
                start,end = float(child.get('transcriber_start')), float(child.get('transcriber_end'))
                reference[Segment(start, end)] = label
    else:
        example_segments = [(0.15, 3.41, 'A'),
                            (3.83, 5.82, 'A'),
                            (6.75, 11.10, 'B'),
                            (11.32, 15.8, 'C'),
                            (15.9, 18.8, 'B'),
                            (18.8, 27.8, 'C'),
                            (27.8, 34.4, 'B'),
                            (34.4, 42, 'D')]
        for start, end, label in example_segments:
            reference[Segment(start, end)] = label

    hypothesis = Annotation()
    with open(outfile,'r') as f:
        for line in f:
            fields = line.split(' ')
            if len(fields) < 6:
                # Blank or truncated line: nothing to parse.
                continue
            start = float(fields[3])
            end = start + float(fields[4])
            # strip() removes the line ending without cutting into the label.
            hypothesis[Segment(start, end)] = fields[5].strip()

    metric = DiarizationErrorRate()
    metricPurity = DiarizationPurity()
    uem = Timeline([Segment(0, audioLength)])

    print('DER: %.2f %%' %(metric(reference, hypothesis, uem=uem)*100))
    print('Cluster Purity: %.2f %%' %(metricPurity(reference, hypothesis, uem=uem)*100))

    return metric, reference, hypothesis
411 |
def SADError(segments, AudioDataSet, annotationlist, audioLength):
    """Print the speech-activity detection error rate and return the metric.

    The reference is built from the first four files of `annotationlist`:
    every `segment` element (attributes `transcriber_start` /
    `transcriber_end`) is added under the single label 'A', so all speakers
    collapse into one "speech" class. The hypothesis is built the same way
    from `segments`, whose items are indexed as (start, end).
    `AudioDataSet` is accepted but not used here. Scoring is restricted to
    Segment(0, audioLength).

    Returns (metric, reference, hypothesis).
    """
    reference = Annotation()
    for file_no in range(4):
        xml_root = ET.parse(annotationlist[file_no]).getroot()
        for node in xml_root.findall('segment'):
            t_begin = float(node.get('transcriber_start'))
            t_finish = float(node.get('transcriber_end'))
            reference[Segment(t_begin, t_finish)] = 'A'

    hypothesis = Annotation()
    for detected in segments:
        hypothesis[Segment(detected[0], detected[1])] = 'A'

    metric = DetectionErrorRate()
    uem = Timeline([Segment(0, audioLength)])
    error_rate = metric(reference, hypothesis, uem=uem)
    print('SAD Error Rate: %.2f %%' %(error_rate*100))

    return metric, reference, hypothesis
449 |
def SpeechOnlySamplesOptimal(X,Fs,AudioDataSet, annotationlist):
    """Return a copy of X in which everything outside annotated segments is zero.

    The first four entries of `annotationlist` are parsed as XML; for each
    `segment` element the `transcriber_start` / `transcriber_end` attributes
    are multiplied by `Fs` and rounded to obtain a slice of X that is copied
    into the result. The result is created with np.zeros(X.shape), so
    positions not covered by any segment stay 0. `AudioDataSet` is accepted
    but not used here.
    """
    XSpeech = np.zeros(X.shape)
    for file_no in range(4):
        segment_nodes = ET.parse(annotationlist[file_no]).getroot().findall('segment')
        for node in segment_nodes:
            t_begin = float(node.get('transcriber_start'))
            t_finish = float(node.get('transcriber_end'))
            first = int(np.round(t_begin*Fs))
            last = int(np.round(t_finish*Fs))
            # Slice assignment copies the values; X itself is never modified.
            XSpeech[first:last] = X[first:last]

    return XSpeech
478 |
if __name__ == '__main__':
    # For ExampleDiarization file use librosa, Weight=0.025.
    # And segment of 50, best DER achieved 9.9 %.
    tic = time.time()
    stWin = 0.03    # analysis window; Fs*stWin is used as n_fft below
    stStep = 0.01   # analysis step; Fs*stStep is used as hop_length below
    M = 5   # no of GMM components
    K = 16  # no of clusters or GMMs
    n_mfcc = 19

    AudioDataSet = 'IS1008d'    #DiaExample, IS1000a, IS1001a, IS1003a, IS1004a, IS1008a
    FlagFeatureNormalization = True
    UseSAD = True
    UseSparseFeatures = False
    spnp = None
    SpeechOnlyOptimal = True
    SparseFeatureEngineeringFlag = False

    # Single place to change when the data lives somewhere else.
    data_root = '/home/rehan/Diarization/ReDiarization/data/'

    if AudioDataSet=='DiaExample':
        fileName = data_root + 'diarizationExample.wav'
        outfile = data_root + 'diarizationExample.rttm'
        gmmfile = data_root + 'diarizationExample.gmm'
        if UseSAD: spnp = data_root + 'spnp.txt'
        meeting_name = 'diarizationExample'
        # The example recording has no XML annotations; DER() uses a
        # hard-coded reference for it instead.
        annotationlist = None
    else:
        ami_prefix = data_root + 'AMI/' + AudioDataSet + '/audio/' + AudioDataSet
        fileName = ami_prefix + '.Mix-Headset.wav'
        # One segments file per speaker A..D.
        annotationlist = [ami_prefix + '.' + spk + '.segments.xml' for spk in 'ABCD']
        outfile = ami_prefix + '.rttm'
        gmmfile = ami_prefix + '.gmm'
        if UseSAD: spnp = ami_prefix + '_spnp.txt'
        meeting_name = AudioDataSet

    x, Fs = librosa.load(fileName, sr=16000)
    audioLength = len(x)/(Fs)
    if SpeechOnlyOptimal:
        if annotationlist is not None:
            x = SpeechOnlySamplesOptimal(x,Fs,AudioDataSet, annotationlist)
        else:
            # Oracle masking needs the XML annotations; without them the
            # call would fail on annotationlist[0].
            print("No annotation files for '%s'; skipping SpeechOnlyOptimal." % AudioDataSet)

    S = librosa.feature.melspectrogram(y=x, sr=Fs, n_fft=int(Fs*stWin), hop_length=int(Fs*stStep))
    fVects = librosa.feature.mfcc(y=x, S=librosa.power_to_db(S), sr=Fs, n_mfcc = n_mfcc)

    if UseSAD:
        ###################### Speech Activity Detection ##########################
        segments, idx = silenceRemoval(x, Fs, stWin, stStep, smoothWindow=.0005, Weight=0.0001, plot=False)
        if annotationlist is not None:
            # Names chosen so the stdlib module name 're' is not shadowed.
            sad_metric, sad_ref, sad_hyp = SADError(segments, AudioDataSet, annotationlist, audioLength)
        ###########################################################################

        # Creating a speech/non-speech text File which contains speech only
        # features indices. 'idx' contains speech only features indices.
        if idx.size == 0:
            raise ValueError('silenceRemoval returned no speech frames; nothing to diarize')

        # Collapse runs of consecutive indices into [first, last] pairs.
        seglist = []
        run_start = prev = idx[0]
        for cur in idx[1:]:
            if cur != prev + 1:
                seglist.append([run_start, prev])
                run_start = cur
            prev = cur
        # Close the final run (also covers the single-element case).
        seglist.append([run_start, prev])
        segarray = np.array(seglist)
        np.savetxt(spnp, segarray, fmt='%d', delimiter=' ')
        #######################################################################
        # Take Speech only frames....
        fVectsSpeech = copy(fVects[:,idx])
        #######################################################################
    else:
        fVectsSpeech = fVects

    if FlagFeatureNormalization:
        ss = StandardScaler()
        # Transposed in and out so the scaler standardizes each row of
        # fVectsSpeech (one row per MFCC coefficient).
        fVectsSpeech = ss.fit_transform(fVectsSpeech.T).T
        print("Feature Normalization Done...")

    ###########################################################################
    diarizer = Diarizer(fVectsSpeech,fVects.shape[1])
    # Create the GMM list
    num_comps = M
    num_gmms = K
    diarizer.new_gmm_list(num_comps, num_gmms, 'diag')

    # Cluster
    kl_ntop = 0
    num_em_iters = 100
    num_seg_iters_init = 2 #2
    num_seg_iters = 3 #3
    seg_length = int(150)
    if AudioDataSet=='DiaExample': seg_length = 50

    most_likely,_ = diarizer.cluster(num_em_iters, kl_ntop, num_seg_iters_init, num_seg_iters, seg_length)

    # Write out RTTM and GMM parameter files
    diarizer.write_to_RTTM(outfile, spnp, meeting_name, most_likely, num_gmms, seg_length)
    metric, ref, hyp = DER(outfile, AudioDataSet,annotationlist, audioLength)
    #diarizer.write_to_GMM(gmmfile)
    print('=== Total Time Taken: %.2f min' %((time.time()-tic)/60.0))
--------------------------------------------------------------------------------