├── .gitignore
├── AdaBoostClassifier.py
├── README.md
├── RandomForestClassifier.py
├── classification_report.py
├── classifier_test.py
├── estimate_liblinear_svm_parameters.py
├── estimate_svm_parameters.py
├── format_convertor.py
├── input_and_target_dataset_generator.py
├── input_and_target_dataset_mfcc_26D.py
├── speech_noise_ir_audio_mixing_script.py
└── svm_modelling.py

/.gitignore:
--------------------------------------------------------------------------------
A1.py
beta_script.py
--------------------------------------------------------------------------------
/AdaBoostClassifier.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_weighted_vector.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open('class_label_weighted_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]
#TargetClassLabel = TargetClassLabel[:200000]
n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
#InputDataSpeechTemp = InputDataSpeechTemp[:200000]
# merging the label and mfcc feature
data_zipped = zip(TargetClassLabel, InputDataSpeechTemp)

# permutation of whole dataset
#data_zipped = np.random.permutation(data_zipped)

# separating the ones and zeros
ones_index = [i for i, j in enumerate(TargetClassLabel) if j == 1]
zeros_index = [i for i, j in enumerate(TargetClassLabel) if j == 0]
ones = [data_zipped[i] for i in ones_index]
zeros = [data_zipped[i] for i in zeros_index]
num_zero = len(zeros)
original_ratio = float(len(ones))/num_zero
# whole number of zeros: 145082
# whole number of ones: 2057376

one_zero_ratio = 1
train_test_ratio = 30

# reconstruct the test set according to original_ratio
test_zero = zeros[:int(num_zero/train_test_ratio)]
print 'test of zeros: ', len(test_zero)
test_one = ones[:int(len(test_zero)*original_ratio)]
#test_one = ones[:int(len(test_zero)*1)]
print 'test of ones: ', len(test_one)
test = test_zero + test_one
np.random.shuffle(test)
print 'test: ', len(test)
test_label = []
test_feature = []
for i in test:
    test_label.append(i[0])
for i in test:
    test_feature.append(i[1])

# reconstruct the training set according to one_zero_ratio
# the ratio of ones to zeros on training is 1
train_zero = zeros[len(test_zero)+1:]
print 'train_zero: ', len(train_zero)
train_one = ones[len(test_one)+1:len(test_one)+1+int(len(train_zero)*one_zero_ratio)]
print 'train_one ', len(train_one)
train = train_zero + train_one
np.random.shuffle(train)
print 'train: ', len(train)
train_label = []
train_feature = []
for i in train:
    train_label.append(i[0])
#print 'train_label', len(train_label)
for i in train:
    train_feature.append(i[1])
#print 'train_feature', len(train_feature)
clf = RandomForestClassifier()
clf_model = AdaBoostClassifier(base_estimator=clf,n_estimators=500).fit(train_feature, train_label)
label_predicted = clf_model.predict(test_feature)

target_names = ['0', '1']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(test_label, label_predicted, target_names=target_names)))
cm = metrics.confusion_matrix(test_label, label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Audio Based Machine Learning in Python: Assignment 1

*Goals*
* Mix speech corpora with noise sounds; fold with impulse responses
* Extract MFCC features
* Voice Activity Detection using out-of-the-box classifiers (Random Forests, SVM, Neural Networks etc.) with cross validation
* Present your results

*Details and constraints*

In this first, introductory assignment you will create a dataset that simulates speech in everyday scenarios. You will train a classifier on this dataset to distinguish voiced from non-voiced sections, a task called voice activity detection, VAD for short. This, of course, requires a ground truth in terms of VAD annotations.

### format_convertor.py

Converts the noise files to 16-bit integer (int16) format. We convert all the noise files to int16 and save them in a new folder, which is used further in the generation of the dataset.

```
Usage: python format_convertor.py
```

### speech_noise_ir_audio_mixing_script.py

Generates the convolved speech files in a given "output" directory, with customized command line parameters.
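
In outline, every speech file runs through the chain sketched below (a minimal sketch, assuming 16 kHz mono WAVs, a noise file longer than the speech file, and placeholder file names "speech.wav", "noise.wav", "ir.wav"; the ReplayGain normalization of the speech against "ref_pink.wav", done with audiotools in the full script, is omitted here):

```
import numpy, random, pydub
import scipy.io.wavfile as wav

speech = pydub.AudioSegment.from_wav("speech.wav")
noise = pydub.AudioSegment.from_wav("noise.wav")

# cut a random noise segment of speech length (pydub slices in milliseconds), fade it in and out
start = random.randrange(0, int(noise.duration_seconds - speech.duration_seconds) * 1000)
segment = noise[start:start + int(speech.duration_seconds * 1000)].fade_in(1000).fade_out(1000)

# enforce a random SNR by shifting the speech level relative to the noise level, then mix
snr = random.randint(-2, 20)
noisy = speech.apply_gain(snr - (speech.dBFS - segment.dBFS)).overlay(segment)
noisy.export("noisy_speech.wav", format="wav")

# convolve with a peak-normalized impulse response and cut back to the original length
(ir_rate, ir) = wav.read("ir.wav")
ir = ir.astype(numpy.float64) / numpy.abs(ir).max()
(rate, data) = wav.read("noisy_speech.wav")
wav.write("output.wav", rate, numpy.convolve(ir, data)[:data.size])
```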
```
Usage: python speech_noise_ir_audio_mixing_script.py <speech_dir> <reference_file> <noise_dir> <ir_noise_dir> <output_dir>

Command line arguments:
<speech_dir>: Root directory where all the VAD speaker speech data is located
<reference_file>: Reference file for the calculation of ReplayGain, named "ref_pink.wav"
<noise_dir>: Root directory where all the noise data is located
<ir_noise_dir>: Root directory where all the impulse response noise samples are located
<output_dir>: Root directory for storing all the output files in the same directory hierarchy as the input speech directory
```

Sample Run and Output:
```
mlteam3@pcschlichter4:~/A1$ python speech_noise_ir_audio_mixing_script.py ../data/sample ref_pink.wav ../data/noise_sample/ /mnt/tatooine/data/impulse_responses/16kHz/wavs16b ../newout1
Noise File Path: ../data/noise_sample/super_market_mall2_copy.wav
IR-Noise File Path: /mnt/tatooine/data/impulse_responses/16kHz/wavs16b/lg_leather_bag.wav
Final output file path: ../newout1/sample/SA2_632.wav
Operations Finished!
```

### input_and_target_dataset_generator.py

Generates the input and target label dataset from the given noise-mixed speech files and corresponding annotation files, stores them in scipy.sparse.coo_matrix representation and saves the resulting dataset vectors to disk.

```
Usage: python input_and_target_dataset_generator.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>
```

Sample run:

```
mlteam3@pcschlichter4:~/A1$ python input_and_target_dataset_generator.py ../data/vad_dirsample/ ../output_all/ mfcc_full_vector.dat class_label_full_vector.dat
```

### svm_modelling.py

This script employs scikit-learn's LinearSVC (the LibLinear implementation of linear SVM) and computes cross-validation accuracy scores.

```
Usage: python svm_modelling.py <speech_vector_file> <class_label_file>
```
_CROSS Validation: 3_
```
mlteam3@pcschlichter4:~/A1$ python svm_modelling.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
Final Accuracy Score: 0.93 (+/- 0.00)
Total execution time in minutes :: >>
1.27803570032
Task is Finished!
```

_CROSS Validation: 10_
```
mlteam3@pcschlichter4:~/A1$ python svm_modelling.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
Final Accuracy Score: 0.93413 (+/- 0.00)
Total execution time in minutes :: >>
5.50397278468
Task is Finished!

```
### classification_report.py

Generates a classification report for the given model.
```
Usage: python classification_report.py <speech_vector_file> <class_label_file>
```

Sample Run:

```
mlteam3@pcschlichter4:~/A1$ python classification_report.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:958: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Classification report for classifier LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0):
                    precision    recall  f1-score   support

No Speech Detected       0.00      0.00      0.00     60627
   Speech Detected       0.94      1.00      0.97   1040602

       avg / total       0.89      0.94      0.92   1101229


Confusion matrix:
[[      0   60627]
 [      0 1040602]]
Total execution time in minutes :: >>
0.405680398146
Task is Finished!

```

### Accuracy Scores

Support Vector Machines (10 Fold Cross Validation): 0.93413

### RandomForestClassifier.py

Separates the speech frames and non-speech frames, then rearranges and shuffles the training set according to an adjustable speech to non-speech frame ratio before training a RandomForestClassifier.
```
Usage: python RandomForestClassifier.py
```

Sample Run:

```
mlteam3@pcschlichter4:~/A1$ python RandomForestClassifier.py
test of zeros:  4836
test of ones:  68578
test:  73414
train_zero:  140245
train_one  140245
train:  280490
Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False):
             precision    recall  f1-score   support

          0       0.07      0.99      0.12      4836
          1       0.96      0.02      0.04     68578

avg / total       0.90      0.08      0.04     73414


Confusion matrix, without normalization
[[ 4779    57]
 [67331  1247]]
Normalized confusion matrix
[[ 0.9882134   0.0117866 ]
 [ 0.98181633  0.01818367]]
Task is Finished!


```
### estimate_svm_parameters.py and estimate_liblinear_svm_parameters.py

These scripts determine the best parameters for SVM and LinearSVC (the LibLinear SVM implementation) using grid search cross-validation over five folds, as sketched below.
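
At their core, both scripts run the standard scikit-learn grid-search loop (a minimal sketch of the LinearSVC variant, condensed from the script itself; `X_train`/`y_train` stand for the first half of the dataset):

```
from sklearn.grid_search import GridSearchCV
from sklearn import svm

tuned_parameters = [{'C': [1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge']}]
for score in ('precision', 'recall'):
    # one search per optimization target: 5-fold CV, weighted averaging over classes
    clf = GridSearchCV(svm.LinearSVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)
    print(clf.best_params_)
```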

```
Usage: python estimate_svm_parameters.py <speech_vector_file> <class_label_file>

Usage: python estimate_liblinear_svm_parameters.py <speech_vector_file> <class_label_file>

```


The following parameters were experimented on, to find the most suitable configuration:

```
For SVM:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

For LinearSVC:

tuned_parameters = [{'C': [1, 10, 100, 1000],'loss':['hinge' , 'squared_hinge'] }]

```
Results:

```
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'loss': 'hinge', 'C': 1000}

Grid scores on development set:

0.852 (+/-0.000) for {'loss': 'hinge', 'C': 1}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 1}
0.852 (+/-0.000) for {'loss': 'hinge', 'C': 10}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 10}
0.852 (+/-0.000) for {'loss': 'hinge', 'C': 100}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 100}
0.858 (+/-0.021) for {'loss': 'hinge', 'C': 1000}
0.854 (+/-0.007) for {'loss': 'squared_hinge', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.00      0.00      0.00     60627
          1       0.94      1.00      0.97   1040602

avg / total       0.89      0.94      0.92   1101229


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'loss': 'hinge', 'C': 1}

Grid scores on development set:

0.923 (+/-0.000) for {'loss': 'hinge', 'C': 1}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 1}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 10}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 10}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 100}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 100}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 1000}
0.763 (+/-0.640) for {'loss': 'squared_hinge', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.00      0.00      0.00     60627
          1       0.94      1.00      0.97   1040602

avg / total       0.89      0.94      0.92   1101229


Total execution time in minutes :: >>
174.473954884
Task is Finished!
```

--------------------------------------------------------------------------------
/RandomForestClassifier.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_weighted_vector_39D.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open('class_label_weighted_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]
#TargetClassLabel = TargetClassLabel[:200000]
n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
#InputDataSpeechTemp = InputDataSpeechTemp[:200000]
# merging the label and mfcc feature
data_zipped = zip(TargetClassLabel, InputDataSpeechTemp)

# permutation of whole dataset
#data_zipped = np.random.permutation(data_zipped)

# separating the ones and zeros
ones_index = [i for i, j in enumerate(TargetClassLabel) if j == 1]
zeros_index = [i for i, j in enumerate(TargetClassLabel) if j == 0]
ones = [data_zipped[i] for i in ones_index]
zeros = [data_zipped[i] for i in zeros_index]
num_zero = len(zeros)
original_ratio = float(len(ones))/num_zero
# whole number of zeros: 145082
# whole number of ones: 2057376

one_zero_ratio = 1
train_test_ratio = 30

# reconstruct the test set according to original_ratio
test_zero = zeros[:int(num_zero/train_test_ratio)]
print 'test of zeros: ', len(test_zero)
test_one = ones[:int(len(test_zero)*original_ratio)]
#test_one = ones[:int(len(test_zero)*1)]
print 'test of ones: ', len(test_one)
test = test_zero + test_one
np.random.shuffle(test)
print 'test: ', len(test)
test_label = []
test_feature = []
for i in test:
    test_label.append(i[0])
for i in test:
    test_feature.append(i[1])

# reconstruct the training set according to one_zero_ratio
# the ratio of ones to zeros on training is 1
train_zero = zeros[len(test_zero)+1:]
print 'train_zero: ', len(train_zero)
train_one = ones[len(test_one)+1:len(test_one)+1+int(len(train_zero)*one_zero_ratio)]
print 'train_one ', len(train_one)
train = train_zero + train_one
np.random.shuffle(train)
print 'train: ', len(train)
train_label = []
train_feature = []
for i in train:
    train_label.append(i[0])
#print 'train_label', len(train_label)
for i in train:
    train_feature.append(i[1])
#print 'train_feature', len(train_feature)

clf_model = RandomForestClassifier(n_estimators=100).fit(train_feature, train_label)
label_predicted = clf_model.predict(test_feature)

target_names = ['0', '1']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(test_label, label_predicted, target_names=target_names)))
cm = metrics.confusion_matrix(test_label, label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'
--------------------------------------------------------------------------------
/classification_report.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print '\nUsage: python classification_report.py <speech_vector_file> <class_label_file>'
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

#Choosing SVM as our machine learning model.
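# NOTE (added): LinearSVC wraps the liblinear solver, whose training time grows
# roughly linearly with the number of samples; with ~2.2 million MFCC frames here,
# a kernel SVC (libsvm, closer to quadratic in the sample count) would be impractical.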
#clf_model = svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', multi_class='ovr', penalty='l2',random_state=None, tol=0.0001, verbose=0)
clf_model = svm.LinearSVC()

print 'Starting Cross Validation'

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
first_half_input_vector = InputDataSpeechTemp[:n_samples/2,:13]
second_half_input_vector = InputDataSpeechTemp[n_samples/2:,:13]
#print "Predicted: "+str(second_half_input_vector.shape)

clf_model.fit(first_half_input_vector, TargetClassLabel[:n_samples / 2])

# Now predict the labels of the second half:
expected = TargetClassLabel[n_samples / 2:]
predicted = clf_model.predict(second_half_input_vector)
#predicted = predicted1.tolist()
#predicted[0] = 0

target_names = ['No Speech Detected', 'Speech Detected']

#Generate Classification Report
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(expected, predicted,target_names=target_names)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

#Recording the end time.
end = time.time()
print "Total execution time in minutes :: >>"
print (end - start)/60
print 'Task is Finished!'

# print "Expected: "+str([len(list(group)) for key, group in groupby(expected)])
# print "Predicted: "+str([len(list(group)) for key, group in groupby(predicted)])

#print "Expected: "+str(expected)
#print "Predicted: "+str(predicted)
--------------------------------------------------------------------------------
/classifier_test.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_full_vector.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
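# NOTE (added): preprocessing.normalize(..., norm='l2') used below operates row-wise,
# and each label row holds a single 0/1 entry, so the labels pass through unchanged;
# the int16 cast afterwards therefore recovers the original class labels exactly.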
with open('class_label_full_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()

sub_feature = InputDataSpeechTemp[:n_samples/5,:13]
sub_label = TargetClassLabel[:n_samples / 5]
test_feature = InputDataSpeechTemp[99*n_samples/100:,:13]
test_label = TargetClassLabel[99*n_samples/100:]

# rearranging training set
ones_index = [i for i, j in enumerate(sub_label) if j == 1]
zeros_index = [i for i, j in enumerate(sub_label) if j == 0]
ones_index = np.random.permutation(ones_index)
zeros_index = np.random.permutation(zeros_index)
sub_zero = [sub_feature[i] for i in zeros_index]
sub_one = [sub_feature[i] for i in ones_index]
print str(len(sub_zero)) + ' 0'
print str(len(sub_one)) + ' 1'
one_cutted = sub_one[:len(sub_zero)]
feature_balanced = np.concatenate((sub_zero, one_cutted), axis=0)
label_balanced = np.append(np.zeros(len(zeros_index)),np.ones(len(one_cutted)))

# rearranging test set FOR TEST
ones_index_test = [i for i, j in enumerate(test_label) if j == 1]
zeros_index_test = [i for i, j in enumerate(test_label) if j == 0]
# shuffle the index lists themselves so the balanced test subset is drawn at random
ones_index_test = np.random.permutation(ones_index_test)
zeros_index_test = np.random.permutation(zeros_index_test)
test_zero = [test_feature[i] for i in zeros_index_test]
test_one = [test_feature[i] for i in ones_index_test]
print str(len(test_zero)) + ' 0'
print str(len(test_one)) + ' 1'
one_cutted_test = test_one[:len(test_zero)]
feature_balanced_test = np.concatenate((test_zero, one_cutted_test), axis=0)
label_balanced_test = np.append(np.zeros(len(test_zero)),np.ones(len(one_cutted_test)))

clf_model = KNeighborsClassifier().fit(feature_balanced, label_balanced)
label_predicted = clf_model.predict(feature_balanced_test)

target_names = ['No Speech Detected', 'Speech Detected']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(label_balanced_test, label_predicted,target_names=target_names)))
cm = metrics.confusion_matrix(label_balanced_test, label_predicted)
print label_balanced_test
print label_predicted
print len(label_balanced_test)
print len(label_predicted)
print sum(label_balanced_test)
print sum(label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'

--------------------------------------------------------------------------------
/estimate_liblinear_svm_parameters.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
#from sklearn.svm import SVC

from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print('\nUsage: python estimate_liblinear_svm_parameters.py <speech_vector_file> <class_label_file>')
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

# Loading the Digits dataset
#digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
#n_samples = len(digits.images)
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

# Split the dataset in two equal parts
#X_train, X_test, y_train, y_test = train_test_split(
#    X, y, test_size=0.5, random_state=0)

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
X_train = InputDataSpeechTemp[:n_samples/2,:13]
X_test = InputDataSpeechTemp[n_samples/2:,:13]
y_train = TargetClassLabel[:n_samples / 2]
y_test = TargetClassLabel[n_samples / 2:]

# Set the parameters by cross-validation
tuned_parameters = [{'C': [1, 10, 100, 1000],'loss':['hinge' , 'squared_hinge'] }]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.LinearSVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

#Recording the end time.
end = time.time()
print("Total execution time in minutes :: >>")
print((end - start)/60)
print('Task is Finished!')
--------------------------------------------------------------------------------
/estimate_svm_parameters.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

#from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print('\nUsage: python estimate_svm_parameters.py <speech_vector_file> <class_label_file>')
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

# Loading the Digits dataset
#digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
#n_samples = len(digits.images)
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

# Split the dataset in two equal parts
#X_train, X_test, y_train, y_test = train_test_split(
#    X, y, test_size=0.5, random_state=0)

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
X_train = InputDataSpeechTemp[:n_samples/2,:13]
X_test = InputDataSpeechTemp[n_samples/2:,:13]
y_train = TargetClassLabel[:n_samples / 2]
y_test = TargetClassLabel[n_samples / 2:]

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

#Recording the end time.
end = time.time()
print("Total execution time in minutes :: >>")
print((end - start)/60)
print('Task is Finished!')

--------------------------------------------------------------------------------
/format_convertor.py:
--------------------------------------------------------------------------------
import os
import numpy
import scipy.io.wavfile as wav

NoiseFileDir = "/mnt/tatooine/data/noise/noise_equal_concat/train"
outputDir = "/mnt/alderaan/mlteam3/data/noise_int16"
# make sure the output directory exists before writing into it
if not os.path.exists(outputDir):
    os.makedirs(outputDir)
for root, dirs, files in os.walk(NoiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            #ir_file_list.append(file)
            # read from the full path; the bare file name alone would only
            # resolve when running from inside the directory being walked
            (rate_IR, data_IR) = wav.read(os.path.join(root, file))
            data_IR = data_IR.astype(numpy.int16)
            outputfilename = os.path.basename(file)
            outputfilepath = os.path.join(outputDir,outputfilename)
            wav.write(outputfilepath, rate_IR, data_IR)
--------------------------------------------------------------------------------
/input_and_target_dataset_generator.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import numpy as np
import math
from features import mfcc
import sys
import os
import cPickle
import scipy.sparse

if len(sys.argv)!=5:
    print '\nUsage: python input_and_target_dataset_generator.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>'
    sys.exit()

annotation_dir = sys.argv[1]
audio_files_dir = sys.argv[2]
mfcc_vector_output_file = sys.argv[3]
class_label_vector_output_file = sys.argv[4]




def getframelevelanno(noise_mix_speech_ano_file):
    #print "#: "+str(noise_mix_speech_ano_file)
    #MFCC Parameters
    samplerate = 16000
    winlen = 0.025
    winstep = 0.01

    annotation = [line.strip() for line in open(noise_mix_speech_ano_file, 'r')]

    window = []
    frames_ano = []

    for bv in annotation:
        window.append(float(bv))
        if len(window)==winlen*samplerate:
            frames_ano.append(math.ceil(sum(window)/len(window)))
            window = window[int(winstep*samplerate):] #moving window

    frames_ano.append(math.ceil(float(sum(window))/len(window))) #the remaining samples for the last frame
    #frames_ano_array = np.array(frames_ano).reshape(len(frames_ano),1)
    #return frames_ano_array
    return frames_ano



def getMfccVector(noise_mix_speech_file):
    (rate, signal) = wav.read(noise_mix_speech_file)
    mfcc_vec = mfcc(signal,rate)
    return mfcc_vec

def getDataset(annotation_dir, audio_files_dir):
    speech_vector_final = np.zeros((1,13))
    speech_vector_final = np.delete(speech_vector_final, (0), axis=0)
    class_label_vector_final = []
    directoryCount = 1
    #failedFilesCount = 1
    for root, dirs, files in os.walk(annotation_dir):
        print "Directory Count: "+str(directoryCount)
        path = root.split('/')
        for file in files:
            if (file.lower().endswith('.wav')):
                speechFilePath = os.path.join(root,str(file))
                tmp = os.path.dirname(speechFilePath)
                root_dir_name = os.path.basename(tmp)

                annotationfilename = str(os.path.splitext(file)[0])+'.ano'
                annotation_file_fullpath = os.path.join(annotation_dir,root_dir_name,annotationfilename)
                audio_file_fullpath = os.path.join(audio_files_dir,root_dir_name,file)
                print "Annotation File: "+ annotation_file_fullpath
                print "Audio file:"+ audio_file_fullpath
                mfcc_vector = getMfccVector(audio_file_fullpath)
                #print mfcc_vector
                class_label_vector = getframelevelanno(annotation_file_fullpath)
                (x,y) = mfcc_vector.shape
                z = len(class_label_vector)

                #if (x!=z):
                    #failedFilesCount = failedFilesCount + 1
                    #print "mfcc - x: "+ str(x)
                    #print "class - z: "+ str(z)
                    # if (w != 1):
                    #     print "Failed"
                    #     print class_label_vector.shape

                if (x==z):
                    print "MFCC: " + str(x)
                    print "LABEL: " + str(len(class_label_vector))
                    speech_vector_final = np.vstack((speech_vector_final,mfcc_vector))
                    class_label_vector_final.extend(class_label_vector)
        directoryCount = directoryCount+1
    #print "Total Number of Failed Files: "+ str(failedFilesCount)
    return (speech_vector_final,class_label_vector_final)




#Main Routine
print "Start of the Program ....."
#speech_vector_final = np.empty((1,13))
#class_label_vector_final = []
(speech_vector_final,class_label_vector_final) = getDataset(annotation_dir, audio_files_dir)
#print "Class Labels:"+str(len(class_label_vector_final))


# Generate the various output files
#MFCC Speech
mfcc_vector_file = open(mfcc_vector_output_file, 'wb')
temp1 = scipy.sparse.coo_matrix(speech_vector_final)
cPickle.dump(temp1,mfcc_vector_file,-1)
mfcc_vector_file.close()

#Class Labels
class_label_vector_file = open(class_label_vector_output_file, 'wb')
class_label_vector_final_array = np.array(class_label_vector_final).reshape(len(class_label_vector_final),1)
temp2 = scipy.sparse.coo_matrix(class_label_vector_final_array)
cPickle.dump(temp2,class_label_vector_file,-1)
class_label_vector_file.close()

print "Final Shapes:"
print "Speech Vector:"+str(speech_vector_final.shape)
print "Class Labels:"+str(class_label_vector_final_array.shape)

print "Program completed Successfully"


--------------------------------------------------------------------------------
/input_and_target_dataset_mfcc_26D.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import numpy as np
import math
from features import mfcc
import sys
import os
import cPickle
import scipy.sparse
from scipy import signal

if len(sys.argv)!=5:
    print '\nUsage: python input_and_target_dataset_mfcc_26D.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>'
    sys.exit()

annotation_dir = sys.argv[1]
audio_files_dir = sys.argv[2]
mfcc_vector_output_file = sys.argv[3]
class_label_vector_output_file = sys.argv[4]

def getframelevelanno(noise_mix_speech_ano_file):
    #MFCC Parameters
    samplerate = 16000
    winlen = 0.025
    winstep = 0.01
    winweight = signal.hamming(int(winlen*samplerate))
    winweight = winweight/sum(winweight)
    annotation = [line.strip() for line in open(noise_mix_speech_ano_file, 'r')]

    window = []
    frames_ano = []

    for bv in annotation:
        window.append(float(bv))
        if len(window)==winlen*samplerate:
            frames_ano.append(sum(np.multiply(window,winweight)))
            window = window[int(winstep*samplerate):] #moving window
    frames_ano.append(sum(np.multiply(window,winweight[:len(window)]))) #the remaining samples for the last frame
    return frames_ano



def getMfccVector(noise_mix_speech_file):
    (rate, signal) = wav.read(noise_mix_speech_file)
    mfcc_vec = mfcc(signal,rate,numcep=26)
    return mfcc_vec

def getDataset(annotation_dir, audio_files_dir):
    speech_vector_final = np.zeros((1,26))
    speech_vector_final = np.delete(speech_vector_final, (0), axis=0)
    print "Speech Vector Final: "+ str(speech_vector_final.shape)

    class_label_vector_final = []
    directoryCount = 1

    for root, dirs, files in os.walk(annotation_dir):
        path = root.split('/')
        for file in files:
            if (file.lower().endswith('.wav')):
                speechFilePath = os.path.join(root,str(file))
                tmp = os.path.dirname(speechFilePath)
                root_dir_name = os.path.basename(tmp)

                annotationfilename = str(os.path.splitext(file)[0])+'.ano'
                annotation_file_fullpath = os.path.join(annotation_dir,root_dir_name,annotationfilename)
                audio_file_fullpath = os.path.join(audio_files_dir,root_dir_name,file)

                print "Annotation File: "+ annotation_file_fullpath
                print "Audio file:"+ audio_file_fullpath

                mfcc_vector = getMfccVector(audio_file_fullpath)
                class_label_vector = getframelevelanno(annotation_file_fullpath)
                (x,y) = mfcc_vector.shape
                z = len(class_label_vector)

                if (x==z):
                    print "MFCC: " + str(x)
                    print "LABEL: " + str(len(class_label_vector))
                    print "speech_v: " + str(speech_vector_final.shape)
                    print "mfcc_v: " + str(mfcc_vector.shape)
                    speech_vector_final = np.vstack((speech_vector_final,mfcc_vector))
                    class_label_vector_final.extend(class_label_vector)
        print "Directory Count: "+str(directoryCount)
        directoryCount = directoryCount+1

    return (speech_vector_final,class_label_vector_final)


#Main Routine
print "Start of the Program ....."
(speech_vector_final,class_label_vector_final) = getDataset(annotation_dir, audio_files_dir)



# Generate the various output files
#MFCC Speech
mfcc_vector_file = open(mfcc_vector_output_file, 'wb')
temp1 = scipy.sparse.coo_matrix(speech_vector_final)
cPickle.dump(temp1,mfcc_vector_file,-1)
mfcc_vector_file.close()

#Class Labels
class_label_vector_file = open(class_label_vector_output_file, 'wb')
class_label_vector_final_array = np.array(class_label_vector_final).reshape(len(class_label_vector_final),1)
temp2 = scipy.sparse.coo_matrix(class_label_vector_final_array)
cPickle.dump(temp2,class_label_vector_file,-1)
class_label_vector_file.close()

print "Final Shapes:"
print "Speech Vector:"+str(speech_vector_final.shape)
print "Class Labels:"+str(class_label_vector_final_array.shape)

print "Program completed Successfully"


--------------------------------------------------------------------------------
/speech_noise_ir_audio_mixing_script.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import scipy.io as sio
import audiotools
import numpy
import math
import pydub
import random
from features import mfcc
from features import logfbank
import os
import sys

if len(sys.argv)!=6:
    print '\nUsage: python speech_noise_ir_audio_mixing_script.py <speech_dir> <reference_file> <noise_dir> <ir_noise_dir> <output_dir>'
    print '\nCommand line arguments:'
    print '<speech_dir>: Root directory where all the VAD speaker speech data is located'
    print '<reference_file>: Reference file for the calculation of ReplayGain, named "ref_pink.wav"'
    print '<noise_dir>: Root directory where all the noise data is located'
    print '<ir_noise_dir>: Root directory where all the impulse response noise samples are located'
    print '<output_dir>: Root directory for storing all the output files in the same directory hierarchy as the input speech directory'
    print '\n'
    sys.exit()

#Command Line Arguments
speakerSpeechDir = sys.argv[1]
referenceFile = sys.argv[2]
noiseFileDir = sys.argv[3]
irNoiseFileDir = sys.argv[4]
output_root_directory = sys.argv[5]

#Create the output directory if it's not already present in the filesystem.
if not os.path.exists(output_root_directory):
    os.makedirs(output_root_directory)

def audioeval(speechFile, referenceFile, noiseFile, root_dir_name, output_root_directory, ir_noise_file):
    "This function evaluates a single audio file."
    print 'Noise File Path: '+str(noiseFile)
    print 'IR-Noise File Path: '+str(ir_noise_file)
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # ReplayGain calculation of reference
    ref = audiotools.open(referenceFile)
    ref_replay_gain = audiotools.calculate_replay_gain([ref])
    ref_track_gain = list(list(ref_replay_gain)[0])[1]
    #print ref_track_gain

    # ReplayGain calculation of example speech file
    speech = audiotools.open(speechFile)
    speech_replay_gain = audiotools.calculate_replay_gain([speech])
    speech_track_gain = list(list(speech_replay_gain)[0])[1]
    #print speech_track_gain

    # Normalization of example speech file
    (rate_speech, data_speech) = wav.read(speechFile)
    gain = ref_track_gain-speech_track_gain
    data_normalized = numpy.asarray(data_speech*math.pow(10, (-(gain)/20)), dtype=numpy.int16)
    normalizedFile = "speech_normalized.wav"
    wav.write(normalizedFile, rate_speech, data_normalized)

    # Loudness test of normalized example speech
    test = audiotools.open(normalizedFile)
    test_replay_gain = audiotools.calculate_replay_gain([test])
    test_track_gain = list(list(test_replay_gain)[0])[1]
    #print test_track_gain

    # Randomly choosing one noise file from the pool
    # here I just fix one waiting for implementation later

    # Using the pydub API to calculate the length of the normalized speech file and the noise file
    speech_normalized = pydub.AudioSegment.from_wav(normalizedFile)

    # We have converted all the noise files to 16 bit int format and then passed the directory location to randomly choose noise files, which is different for each speech file.
    noise = pydub.AudioSegment.from_wav(noiseFile)
    speech_normalized_length = speech_normalized.duration_seconds
    noise_length = noise.duration_seconds

    # Selecting a random start point in the noise file to get a segment of the required length
    start = random.randrange(0,int(noise_length-speech_normalized_length)*1000)
    # pydub does things in milliseconds
    noise_segmented = noise[start:int(start+speech_normalized_length*1000)]
    noise_segmented.export("noise_segmented.wav",format="wav")

    # Linear fading of the sharply segmented noise segment
    # 1 sec fade in, 1 sec fade out
    noise_faded = noise_segmented.fade_in(1000).fade_out(1000)
    noise_faded.export("noise_faded.wav",format="wav")

    # how long is good? 1 sec?
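    # NOTE (added): the SNR adjustment below works entirely in dB. For a target SNR S
    # and measured levels speech_dB and noise_dB, applying
    #     gain = S - (speech_dB - noise_dB)
    # to the speech gives (speech_dB + gain) - noise_dB == S. For example, speech at
    # -20 dBFS, noise at -30 dBFS and a target SNR of 5 dB yield gain = 5 - 10 = -5 dB.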

    # Picking a random signal to noise ratio (SNR)
    SNR_ratio = random.randint(-2, 20)
    #print "SNR_ratio: " + str(SNR_ratio)

    # loudness in dBFS (Decibels relative to full scale)
    # (all peak measurements will be negative numbers)
    speech_dB = speech_normalized.dBFS
    noise_dB = noise_segmented.dBFS
    #print "loudness of speech: " + str(speech_dB)
    #print "loudness of noise: " + str(noise_dB)

    # Change the amplitude (generally, loudness) of the speech by the SNR ratio relative to the noise.
    # Gain is specified in dB.
    gain = SNR_ratio-(speech_dB-noise_dB)
    #print "gain: " + str(gain)
    speech_SNRed = speech_normalized.apply_gain(gain)
    #print "loudness of adjusted speech: " + str(speech_SNRed.dBFS)
    # check SNR
    #print "check SNR: " + str(speech_SNRed.dBFS - noise_dB)

    # mix the two tracks by adding the respective samples
    # (If the overlaid AudioSegment is longer than this one, the result will be truncated)
    noisy_speech = speech_SNRed.overlay(noise_segmented)
    noisy_speech.export("noisy_speech.wav",format="wav")
    # Since the sample values have increased through the summation, it is possible that they exceed the maximum imposed by the data type. How does this API deal with that problem?


    # draw an impulse response from the pool
    # ...waiting to implement

    # peak-normalize it to 0dB (=1) by dividing the IR vector through its maximum value.
    (rate_IR, data_IR) = wav.read(ir_noise_file)

    # data_IR.dtype is int16, change it into float64
    data_IR = data_IR.astype(numpy.float64)/65536.0
    data_IR = data_IR / data_IR.max()

    # convolve speech with the normalized IR
    (rate_noisy_speech, data_noisy_speech) = wav.read("noisy_speech.wav")
    speech_convolved = numpy.convolve(data_IR, data_noisy_speech)

    #print "Root Directory Name: "+str(root_dir_name)
    output_directory = os.path.join(output_root_directory, root_dir_name)
    #print output_directory

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    #speech_convolved_file = output_directory+'/'+str(os.path.splitext(speechFile)[0])+"_convolved.wav"
    speech_convolved_file_name = os.path.basename(speechFile)
    #print "Speech File Name: "+str(speech_convolved_file_name)
    speech_convolved_file = os.path.join(output_directory, speech_convolved_file_name)
    print "Final output file path: "+str(speech_convolved_file)

    # cut the convolved track to its original length if prolonged and store the resulting track
    wav.write(speech_convolved_file, rate_noisy_speech, speech_convolved[:data_noisy_speech.size])

    # MFCC CODE *********** COMMENTED OUT **************
    # MFCC Feature extraction
    # Do the default parameters (frame size etc.) work for you?
    #(rate,sig) = wav.read(speech_convolved_file)
    #mfcc_feat = mfcc(sig,rate)
    #print "MFCC Shape:"
    #print mfcc_feat.shape
    #print mfcc_feat
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    ## Cleanup code which deletes the intermediate files which get generated.
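    # (added, hypothetical sketch) the cleanup itself is not implemented; deleting the
    # intermediate files written above would look like this:
    # for tmp in ("speech_normalized.wav", "noise_segmented.wav",
    #             "noise_faded.wav", "noisy_speech.wav"):
    #     if os.path.exists(tmp):
    #         os.remove(tmp)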
    return

noise_file_list = []
ir_file_list = []

for root, dirs, files in os.walk(noiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            noise_file_list.append(file)


for root, dirs, files in os.walk(irNoiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            ir_file_list.append(file)

# traverse root directory, and list directories as dirs and files as files
for root, dirs, files in os.walk(speakerSpeechDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            #print 'Current File: ', file
            speechFilePath = os.path.join(root,str(file))
            tmp = os.path.dirname(speechFilePath)
            #print "Root: "+str(os.path.basename(tmp))
            #print os.path.basename(speechFilePath)
            root_dir_name = os.path.basename(tmp)
            #print "Main Root Directory Name: "+str(os.path.basename(root))

            #Choose a random noise and IR file from the directories specified in the command line arguments.
            noiseFile = os.path.join(noiseFileDir,random.choice(noise_file_list))
            ir_noise_file = os.path.join(irNoiseFileDir,random.choice(ir_file_list))

            audioeval(speechFilePath,referenceFile,noiseFile,root_dir_name,output_root_directory,ir_noise_file)
print "Operations Finished!"

--------------------------------------------------------------------------------
/svm_modelling.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse


if len(sys.argv)!=3:
    print '\nUsage: python svm_modelling.py <speech_vector_file> <class_label_file>'
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

print InputDataSpeech.shape
print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.float64).toarray()).tolist()
TargetClassLabel = map(str, TargetClassLabel)
#Recording the start time.
start = time.time()

#Choosing SVM as our machine learning model.
#clf_model = svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', multi_class='ovr', penalty='l2',random_state=None, tol=0.0001, verbose=0)
clf_model = svm.LinearSVC()

print 'Starting Cross Validation'

# fit() is not required when cross validating.
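# NOTE (added): cross_val_score clones the estimator and fits one fresh copy per fold
# internally, so an explicit fit() beforehand would be redundant work.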
#clf = clf_model.fit(InputDataSpeech,TargetClassLabel)

scores = cross_validation.cross_val_score(clf_model, InputDataSpeech, TargetClassLabel, cv=10)
print "\nFinal Accuracy Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

#Recording the end time.
end = time.time()


print "Total execution time in minutes :: >>"
print (end - start)/60

print 'Task is Finished!'
--------------------------------------------------------------------------------