├── .gitignore
├── AdaBoostClassifier.py
├── README.md
├── RandomForestClassifier.py
├── classification_report.py
├── classifier_test.py
├── estimate_liblinear_svm_parameters.py
├── estimate_svm_parameters.py
├── format_convertor.py
├── input_and_target_dataset_generator.py
├── input_and_target_dataset_mfcc_26D.py
├── speech_noise_ir_audio_mixing_script.py
└── svm_modelling.py

/.gitignore:
--------------------------------------------------------------------------------
A1.py
beta_script.py
--------------------------------------------------------------------------------
/AdaBoostClassifier.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_weighted_vector.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open('class_label_weighted_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]
#TargetClassLabel = TargetClassLabel[:200000]
n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
#InputDataSpeechTemp = InputDataSpeechTemp[:200000]
# merging the label and mfcc feature
data_zipped = zip(TargetClassLabel, InputDataSpeechTemp)

# permutation of whole dataset
#data_zipped = np.random.permutation(data_zipped)

# separating the ones and zeros
ones_index = [i for i, j in enumerate(TargetClassLabel) if j == 1]
zeros_index = [i for i, j in enumerate(TargetClassLabel) if j == 0]
ones = [data_zipped[i] for i in ones_index]
zeros = [data_zipped[i] for i in zeros_index]
num_zero = len(zeros)
original_ratio = float(len(ones))/num_zero
# whole number of zeros: 145082
# whole number of ones: 2057376

one_zero_ratio = 1
train_test_ratio = 30

# reconstruct the test set according to original_ratio
test_zero = zeros[:int(num_zero/train_test_ratio)]
print 'test of zeros: ', len(test_zero)
test_one = ones[:int(len(test_zero)*original_ratio)]
#test_one = ones[:int(len(test_zero)*1)]
print 'test of ones: ', len(test_one)
test = test_zero + test_one
np.random.shuffle(test)
print 'test: ', len(test)
test_label = []
test_feature = []
for i in test:
    test_label.append(i[0])
for i in test:
    test_feature.append(i[1])

# reconstruct the training set according to one_zero_ratio
# the ratio of ones to zeros on training is 1
train_zero = zeros[len(test_zero)+1:]
print 'train_zero: ', len(train_zero)
train_one = ones[len(test_one)+1:len(test_one)+1+int(len(train_zero)*one_zero_ratio)]
print 'train_one ', len(train_one)
train = train_zero + train_one
np.random.shuffle(train)
print 'train: ', len(train)
train_label = []
train_feature = []
for i in train:
    train_label.append(i[0])
#print 'train_label', len(train_label)
for i in train:
    train_feature.append(i[1])
#print 'train_feature', len(train_feature)
clf = RandomForestClassifier()
clf_model = AdaBoostClassifier(base_estimator=clf,n_estimators=500).fit(train_feature, train_label)
label_predicted = clf_model.predict(test_feature)

target_names = ['0', '1']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(test_label, label_predicted, target_names=target_names)))
cm = metrics.confusion_matrix(test_label, label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Audio Based Machine Learning in Python: Assignment 1

*Goals*
* Mix speech corpora with noise sounds; fold with impulse responses
* Extract MFCC features
* Voice Activity Detection using out-of-the-box classifiers (Random Forests, SVM, Neural Networks etc.) with cross validation
* Present your results

*Details and constraints*

In this first, introductory assignment you will create a dataset that simulates speech in everyday scenarios. You will train a classifier on this dataset to distinguish voiced from non-voiced sections, a task called voice activity detection, VAD for short. This, of course, requires a ground truth in terms of VAD annotations.

### format_convertor.py

Converts the noise files to 16-bit integer (int16) format. We convert all the noise files to int16 and save them in a new folder, which is used further in the generation of the dataset.

```
Usage: python format_convertor.py
```

### speech_noise_ir_audio_mixing_script.py

Generates the convolved speech files in a given "output" directory, with customized command line parameters.
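
In outline, every speech file runs through the chain sketched below (a minimal sketch, assuming 16 kHz mono WAVs, a noise file longer than the speech file, and placeholder file names "speech.wav", "noise.wav", "ir.wav"; the ReplayGain normalization of the speech against "ref_pink.wav", done with audiotools in the full script, is omitted here):

```
import numpy, random, pydub
import scipy.io.wavfile as wav

speech = pydub.AudioSegment.from_wav("speech.wav")
noise = pydub.AudioSegment.from_wav("noise.wav")

# cut a random noise segment of speech length (pydub slices in milliseconds), fade it in and out
start = random.randrange(0, int(noise.duration_seconds - speech.duration_seconds) * 1000)
segment = noise[start:start + int(speech.duration_seconds * 1000)].fade_in(1000).fade_out(1000)

# enforce a random SNR by shifting the speech level relative to the noise level, then mix
snr = random.randint(-2, 20)
noisy = speech.apply_gain(snr - (speech.dBFS - segment.dBFS)).overlay(segment)
noisy.export("noisy_speech.wav", format="wav")

# convolve with a peak-normalized impulse response and cut back to the original length
(ir_rate, ir) = wav.read("ir.wav")
ir = ir.astype(numpy.float64) / numpy.abs(ir).max()
(rate, data) = wav.read("noisy_speech.wav")
wav.write("output.wav", rate, numpy.convolve(ir, data)[:data.size])
```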
```
Usage: python speech_noise_ir_audio_mixing_script.py <speech_dir> <reference_file> <noise_dir> <ir_noise_dir> <output_dir>

Command line arguments:
<speech_dir>: Root directory where all the VAD speaker speech data is located
<reference_file>: Reference file for the calculation of ReplayGain, named "ref_pink.wav"
<noise_dir>: Root directory where all the noise data is located
<ir_noise_dir>: Root directory where all the impulse response noise samples are located
<output_dir>: Root directory for storing all the output files in the same directory hierarchy as the input speech directory
```

Sample Run and Output:
```
mlteam3@pcschlichter4:~/A1$ python speech_noise_ir_audio_mixing_script.py ../data/sample ref_pink.wav ../data/noise_sample/ /mnt/tatooine/data/impulse_responses/16kHz/wavs16b ../newout1
Noise File Path: ../data/noise_sample/super_market_mall2_copy.wav
IR-Noise File Path: /mnt/tatooine/data/impulse_responses/16kHz/wavs16b/lg_leather_bag.wav
Final output file path: ../newout1/sample/SA2_632.wav
Operations Finished!
```

### input_and_target_dataset_generator.py

Generates the input and target label dataset from the given noise-mixed speech files and corresponding annotation files, stores them in scipy.sparse.coo_matrix representation and saves the resulting dataset vectors to disk.

```
Usage: python input_and_target_dataset_generator.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>
```

Sample run:

```
mlteam3@pcschlichter4:~/A1$ python input_and_target_dataset_generator.py ../data/vad_dirsample/ ../output_all/ mfcc_full_vector.dat class_label_full_vector.dat
```

### svm_modelling.py

This script employs scikit-learn's LinearSVC (the LibLinear implementation of linear SVM) and computes cross-validation accuracy scores.

```
Usage: python svm_modelling.py <speech_vector_file> <class_label_file>
```
_CROSS Validation: 3_
```
mlteam3@pcschlichter4:~/A1$ python svm_modelling.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
Final Accuracy Score: 0.93 (+/- 0.00)
Total execution time in minutes :: >>
1.27803570032
Task is Finished!
```

_CROSS Validation: 10_
```
mlteam3@pcschlichter4:~/A1$ python svm_modelling.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
Final Accuracy Score: 0.93413 (+/- 0.00)
Total execution time in minutes :: >>
5.50397278468
Task is Finished!

```
### classification_report.py

Generates a classification report for the given model.
```
Usage: python classification_report.py <speech_vector_file> <class_label_file>
```

Sample Run:

```
mlteam3@pcschlichter4:~/A1$ python classification_report.py mfcc_full_vector.dat class_label_full_vector.dat
(2202458, 13)
(2202458, 1)
Starting Cross Validation
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:958: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Classification report for classifier LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0):
                    precision    recall  f1-score   support

No Speech Detected       0.00      0.00      0.00     60627
   Speech Detected       0.94      1.00      0.97   1040602

       avg / total       0.89      0.94      0.92   1101229


Confusion matrix:
[[      0   60627]
 [      0 1040602]]
Total execution time in minutes :: >>
0.405680398146
Task is Finished!

```

### Accuracy Scores

Support Vector Machines (10 Fold Cross Validation): 0.93413

### RandomForestClassifier.py

Separates the speech frames and non-speech frames, then rearranges and shuffles the training set according to an adjustable speech to non-speech frame ratio before training a RandomForestClassifier.
```
Usage: python RandomForestClassifier.py
```

Sample Run:

```
mlteam3@pcschlichter4:~/A1$ python RandomForestClassifier.py
test of zeros:  4836
test of ones:  68578
test:  73414
train_zero:  140245
train_one  140245
train:  280490
Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False):
             precision    recall  f1-score   support

          0       0.07      0.99      0.12      4836
          1       0.96      0.02      0.04     68578

avg / total       0.90      0.08      0.04     73414


Confusion matrix, without normalization
[[ 4779    57]
 [67331  1247]]
Normalized confusion matrix
[[ 0.9882134   0.0117866 ]
 [ 0.98181633  0.01818367]]
Task is Finished!


```
### estimate_svm_parameters.py and estimate_liblinear_svm_parameters.py

These scripts determine the best parameters for SVM and LinearSVC (the LibLinear SVM implementation) using grid search cross-validation over five folds, as sketched below.
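
At their core, both scripts run the standard scikit-learn grid-search loop (a minimal sketch of the LinearSVC variant, condensed from the script itself; `X_train`/`y_train` stand for the first half of the dataset):

```
from sklearn.grid_search import GridSearchCV
from sklearn import svm

tuned_parameters = [{'C': [1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge']}]
for score in ('precision', 'recall'):
    # one search per optimization target: 5-fold CV, weighted averaging over classes
    clf = GridSearchCV(svm.LinearSVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)
    print(clf.best_params_)
```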

```
Usage: python estimate_svm_parameters.py <speech_vector_file> <class_label_file>

Usage: python estimate_liblinear_svm_parameters.py <speech_vector_file> <class_label_file>

```


The following parameters were experimented on, to find the most suitable configuration:

```
For SVM:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

For LinearSVC:

tuned_parameters = [{'C': [1, 10, 100, 1000],'loss':['hinge' , 'squared_hinge'] }]

```
Results:

```
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'loss': 'hinge', 'C': 1000}

Grid scores on development set:

0.852 (+/-0.000) for {'loss': 'hinge', 'C': 1}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 1}
0.852 (+/-0.000) for {'loss': 'hinge', 'C': 10}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 10}
0.852 (+/-0.000) for {'loss': 'hinge', 'C': 100}
0.852 (+/-0.000) for {'loss': 'squared_hinge', 'C': 100}
0.858 (+/-0.021) for {'loss': 'hinge', 'C': 1000}
0.854 (+/-0.007) for {'loss': 'squared_hinge', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.00      0.00      0.00     60627
          1       0.94      1.00      0.97   1040602

avg / total       0.89      0.94      0.92   1101229


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'loss': 'hinge', 'C': 1}

Grid scores on development set:

0.923 (+/-0.000) for {'loss': 'hinge', 'C': 1}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 1}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 10}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 10}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 100}
0.923 (+/-0.000) for {'loss': 'squared_hinge', 'C': 100}
0.923 (+/-0.000) for {'loss': 'hinge', 'C': 1000}
0.763 (+/-0.640) for {'loss': 'squared_hinge', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.00      0.00      0.00     60627
          1       0.94      1.00      0.97   1040602

avg / total       0.89      0.94      0.92   1101229


Total execution time in minutes :: >>
174.473954884
Task is Finished!
```

--------------------------------------------------------------------------------
/RandomForestClassifier.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_weighted_vector_39D.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open('class_label_weighted_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]
#TargetClassLabel = TargetClassLabel[:200000]
n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
#InputDataSpeechTemp = InputDataSpeechTemp[:200000]
# merging the label and mfcc feature
data_zipped = zip(TargetClassLabel, InputDataSpeechTemp)

# permutation of whole dataset
#data_zipped = np.random.permutation(data_zipped)

# separating the ones and zeros
ones_index = [i for i, j in enumerate(TargetClassLabel) if j == 1]
zeros_index = [i for i, j in enumerate(TargetClassLabel) if j == 0]
ones = [data_zipped[i] for i in ones_index]
zeros = [data_zipped[i] for i in zeros_index]
num_zero = len(zeros)
original_ratio = float(len(ones))/num_zero
# whole number of zeros: 145082
# whole number of ones: 2057376

one_zero_ratio = 1
train_test_ratio = 30

# reconstruct the test set according to original_ratio
test_zero = zeros[:int(num_zero/train_test_ratio)]
print 'test of zeros: ', len(test_zero)
test_one = ones[:int(len(test_zero)*original_ratio)]
#test_one = ones[:int(len(test_zero)*1)]
print 'test of ones: ', len(test_one)
test = test_zero + test_one
np.random.shuffle(test)
print 'test: ', len(test)
test_label = []
test_feature = []
for i in test:
    test_label.append(i[0])
for i in test:
    test_feature.append(i[1])

# reconstruct the training set according to one_zero_ratio
# the ratio of ones to zeros on training is 1
train_zero = zeros[len(test_zero)+1:]
print 'train_zero: ', len(train_zero)
train_one = ones[len(test_one)+1:len(test_one)+1+int(len(train_zero)*one_zero_ratio)]
print 'train_one ', len(train_one)
train = train_zero + train_one
np.random.shuffle(train)
print 'train: ', len(train)
train_label = []
train_feature = []
for i in train:
    train_label.append(i[0])
#print 'train_label', len(train_label)
for i in train:
    train_feature.append(i[1])
#print 'train_feature', len(train_feature)

clf_model = RandomForestClassifier(n_estimators=100).fit(train_feature, train_label)
label_predicted = clf_model.predict(test_feature)

target_names = ['0', '1']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(test_label, label_predicted, target_names=target_names)))
cm = metrics.confusion_matrix(test_label, label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'
--------------------------------------------------------------------------------
/classification_report.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print '\nUsage: python classification_report.py <speech_vector_file> <class_label_file>'
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

#Choosing SVM as our machine learning model.
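# NOTE (added): LinearSVC wraps the liblinear solver, whose training time grows
# roughly linearly with the number of samples; with ~2.2 million MFCC frames here,
# a kernel SVC (libsvm, closer to quadratic in the sample count) would be impractical.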
#clf_model = svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', multi_class='ovr', penalty='l2',random_state=None, tol=0.0001, verbose=0)
clf_model = svm.LinearSVC()

print 'Starting Cross Validation'

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
first_half_input_vector = InputDataSpeechTemp[:n_samples/2,:13]
second_half_input_vector = InputDataSpeechTemp[n_samples/2:,:13]
#print "Predicted: "+str(second_half_input_vector.shape)

clf_model.fit(first_half_input_vector, TargetClassLabel[:n_samples / 2])

# Now predict the labels of the second half:
expected = TargetClassLabel[n_samples / 2:]
predicted = clf_model.predict(second_half_input_vector)
#predicted = predicted1.tolist()
#predicted[0] = 0

target_names = ['No Speech Detected', 'Speech Detected']

#Generate Classification Report
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(expected, predicted,target_names=target_names)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

#Recording the end time.
end = time.time()
print "Total execution time in minutes :: >>"
print (end - start)/60
print 'Task is Finished!'

# print "Expected: "+str([len(list(group)) for key, group in groupby(expected)])
# print "Predicted: "+str([len(list(group)) for key, group in groupby(predicted)])

#print "Expected: "+str(expected)
#print "Predicted: "+str(predicted)
--------------------------------------------------------------------------------
/classifier_test.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open('mfcc_full_vector.dat', 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
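# NOTE (added): preprocessing.normalize(..., norm='l2') used below operates row-wise,
# and each label row holds a single 0/1 entry, so the labels pass through unchanged;
# the int16 cast afterwards therefore recovers the original class labels exactly.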
with open('class_label_full_vector.dat', 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()

sub_feature = InputDataSpeechTemp[:n_samples/5,:13]
sub_label = TargetClassLabel[:n_samples / 5]
test_feature = InputDataSpeechTemp[99*n_samples/100:,:13]
test_label = TargetClassLabel[99*n_samples/100:]

# rearranging training set
ones_index = [i for i, j in enumerate(sub_label) if j == 1]
zeros_index = [i for i, j in enumerate(sub_label) if j == 0]
ones_index = np.random.permutation(ones_index)
zeros_index = np.random.permutation(zeros_index)
sub_zero = [sub_feature[i] for i in zeros_index]
sub_one = [sub_feature[i] for i in ones_index]
print str(len(sub_zero)) + ' 0'
print str(len(sub_one)) + ' 1'
one_cutted = sub_one[:len(sub_zero)]
feature_balanced = np.concatenate((sub_zero, one_cutted), axis=0)
label_balanced = np.append(np.zeros(len(zeros_index)),np.ones(len(one_cutted)))

# rearranging test set FOR TEST
ones_index_test = [i for i, j in enumerate(test_label) if j == 1]
zeros_index_test = [i for i, j in enumerate(test_label) if j == 0]
# shuffle the index lists themselves so the balanced test subset is drawn at random
ones_index_test = np.random.permutation(ones_index_test)
zeros_index_test = np.random.permutation(zeros_index_test)
test_zero = [test_feature[i] for i in zeros_index_test]
test_one = [test_feature[i] for i in ones_index_test]
print str(len(test_zero)) + ' 0'
print str(len(test_one)) + ' 1'
one_cutted_test = test_one[:len(test_zero)]
feature_balanced_test = np.concatenate((test_zero, one_cutted_test), axis=0)
label_balanced_test = np.append(np.zeros(len(test_zero)),np.ones(len(one_cutted_test)))

clf_model = KNeighborsClassifier().fit(feature_balanced, label_balanced)
label_predicted = clf_model.predict(feature_balanced_test)

target_names = ['No Speech Detected', 'Speech Detected']
print("Classification report for classifier %s:\n%s\n"
      % (clf_model, metrics.classification_report(label_balanced_test, label_predicted,target_names=target_names)))
cm = metrics.confusion_matrix(label_balanced_test, label_predicted)
print label_balanced_test
print label_predicted
print len(label_balanced_test)
print len(label_predicted)
print sum(label_balanced_test)
print sum(label_predicted)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
print 'Task is Finished!'

--------------------------------------------------------------------------------
/estimate_liblinear_svm_parameters.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
#from sklearn.svm import SVC

from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print('\nUsage: python estimate_liblinear_svm_parameters.py <speech_vector_file> <class_label_file>')
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

# Loading the Digits dataset
#digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
#n_samples = len(digits.images)
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

# Split the dataset in two equal parts
#X_train, X_test, y_train, y_test = train_test_split(
#    X, y, test_size=0.5, random_state=0)

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
X_train = InputDataSpeechTemp[:n_samples/2,:13]
X_test = InputDataSpeechTemp[n_samples/2:,:13]
y_train = TargetClassLabel[:n_samples / 2]
y_test = TargetClassLabel[n_samples / 2:]

# Set the parameters by cross-validation
tuned_parameters = [{'C': [1, 10, 100, 1000],'loss':['hinge' , 'squared_hinge'] }]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.LinearSVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

#Recording the end time.
end = time.time()
print("Total execution time in minutes :: >>")
print((end - start)/60)
print('Task is Finished!')
--------------------------------------------------------------------------------
/estimate_svm_parameters.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

#from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse
from itertools import groupby


if len(sys.argv)!=3:
    print('\nUsage: python estimate_svm_parameters.py <speech_vector_file> <class_label_file>')
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

#print InputDataSpeech.shape
#print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.int16).toarray()).tolist()
TargetClassLabel1 = map(str, TargetClassLabel)
#TargetClassLabel = map(int, TargetClassLabel)
TargetClassLabel = [int(i.strip('[').strip(']')) for i in TargetClassLabel1]

#Recording the start time.
start = time.time()

# Loading the Digits dataset
#digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
#n_samples = len(digits.images)
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

# Split the dataset in two equal parts
#X_train, X_test, y_train, y_test = train_test_split(
#    X, y, test_size=0.5, random_state=0)

n_samples = len(TargetClassLabel)
InputDataSpeechTemp = scipy.sparse.coo_matrix((InputDataSpeech),dtype=np.float64).toarray()
X_train = InputDataSpeechTemp[:n_samples/2,:13]
X_test = InputDataSpeechTemp[n_samples/2:,:13]
y_train = TargetClassLabel[:n_samples / 2]
y_test = TargetClassLabel[n_samples / 2:]

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

#Recording the end time.
end = time.time()
print("Total execution time in minutes :: >>")
print((end - start)/60)
print('Task is Finished!')

--------------------------------------------------------------------------------
/format_convertor.py:
--------------------------------------------------------------------------------
import os
import numpy
import scipy.io.wavfile as wav

NoiseFileDir = "/mnt/tatooine/data/noise/noise_equal_concat/train"
outputDir = "/mnt/alderaan/mlteam3/data/noise_int16"
# make sure the output directory exists before writing into it
if not os.path.exists(outputDir):
    os.makedirs(outputDir)
for root, dirs, files in os.walk(NoiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            #ir_file_list.append(file)
            # read from the full path; the bare file name alone would only
            # resolve when running from inside the directory being walked
            (rate_IR, data_IR) = wav.read(os.path.join(root, file))
            data_IR = data_IR.astype(numpy.int16)
            outputfilename = os.path.basename(file)
            outputfilepath = os.path.join(outputDir,outputfilename)
            wav.write(outputfilepath, rate_IR, data_IR)
--------------------------------------------------------------------------------
/input_and_target_dataset_generator.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import numpy as np
import math
from features import mfcc
import sys
import os
import cPickle
import scipy.sparse

if len(sys.argv)!=5:
    print '\nUsage: python input_and_target_dataset_generator.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>'
    sys.exit()

annotation_dir = sys.argv[1]
audio_files_dir = sys.argv[2]
mfcc_vector_output_file = sys.argv[3]
class_label_vector_output_file = sys.argv[4]




def getframelevelanno(noise_mix_speech_ano_file):
    #print "#: "+str(noise_mix_speech_ano_file)
    #MFCC Parameters
    samplerate = 16000
    winlen = 0.025
    winstep = 0.01

    annotation = [line.strip() for line in open(noise_mix_speech_ano_file, 'r')]

    window = []
    frames_ano = []

    for bv in annotation:
        window.append(float(bv))
        if len(window)==winlen*samplerate:
            frames_ano.append(math.ceil(sum(window)/len(window)))
            window = window[int(winstep*samplerate):] #moving window

    frames_ano.append(math.ceil(float(sum(window))/len(window))) #the remaining samples for the last frame
    #frames_ano_array = np.array(frames_ano).reshape(len(frames_ano),1)
    #return frames_ano_array
    return frames_ano



def getMfccVector(noise_mix_speech_file):
    (rate, signal) = wav.read(noise_mix_speech_file)
    mfcc_vec = mfcc(signal,rate)
    return mfcc_vec

def getDataset(annotation_dir, audio_files_dir):
    speech_vector_final = np.zeros((1,13))
    speech_vector_final = np.delete(speech_vector_final, (0), axis=0)
    class_label_vector_final = []
    directoryCount = 1
    #failedFilesCount = 1
    for root, dirs, files in os.walk(annotation_dir):
        print "Directory Count: "+str(directoryCount)
        path = root.split('/')
        for file in files:
            if (file.lower().endswith('.wav')):
                speechFilePath = os.path.join(root,str(file))
                tmp = os.path.dirname(speechFilePath)
                root_dir_name = os.path.basename(tmp)

                annotationfilename = str(os.path.splitext(file)[0])+'.ano'
                annotation_file_fullpath = os.path.join(annotation_dir,root_dir_name,annotationfilename)
                audio_file_fullpath = os.path.join(audio_files_dir,root_dir_name,file)
                print "Annotation File: "+ annotation_file_fullpath
                print "Audio file:"+ audio_file_fullpath
                mfcc_vector = getMfccVector(audio_file_fullpath)
                #print mfcc_vector
                class_label_vector = getframelevelanno(annotation_file_fullpath)
                (x,y) = mfcc_vector.shape
                z = len(class_label_vector)

                #if (x!=z):
                    #failedFilesCount = failedFilesCount + 1
                    #print "mfcc - x: "+ str(x)
                    #print "class - z: "+ str(z)
                    # if (w != 1):
                    #     print "Failed"
                    #     print class_label_vector.shape

                if (x==z):
                    print "MFCC: " + str(x)
                    print "LABEL: " + str(len(class_label_vector))
                    speech_vector_final = np.vstack((speech_vector_final,mfcc_vector))
                    class_label_vector_final.extend(class_label_vector)
        directoryCount = directoryCount+1
    #print "Total Number of Failed Files: "+ str(failedFilesCount)
    return (speech_vector_final,class_label_vector_final)




#Main Routine
print "Start of the Program ....."
#speech_vector_final = np.empty((1,13))
#class_label_vector_final = []
(speech_vector_final,class_label_vector_final) = getDataset(annotation_dir, audio_files_dir)
#print "Class Labels:"+str(len(class_label_vector_final))


# Generate the various output files
#MFCC Speech
mfcc_vector_file = open(mfcc_vector_output_file, 'wb')
temp1 = scipy.sparse.coo_matrix(speech_vector_final)
cPickle.dump(temp1,mfcc_vector_file,-1)
mfcc_vector_file.close()

#Class Labels
class_label_vector_file = open(class_label_vector_output_file, 'wb')
class_label_vector_final_array = np.array(class_label_vector_final).reshape(len(class_label_vector_final),1)
temp2 = scipy.sparse.coo_matrix(class_label_vector_final_array)
cPickle.dump(temp2,class_label_vector_file,-1)
class_label_vector_file.close()

print "Final Shapes:"
print "Speech Vector:"+str(speech_vector_final.shape)
print "Class Labels:"+str(class_label_vector_final_array.shape)

print "Program completed Successfully"


--------------------------------------------------------------------------------
/input_and_target_dataset_mfcc_26D.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import numpy as np
import math
from features import mfcc
import sys
import os
import cPickle
import scipy.sparse
from scipy import signal

if len(sys.argv)!=5:
    print '\nUsage: python input_and_target_dataset_mfcc_26D.py <annotation_dir> <audio_files_dir> <mfcc_vector_output_file> <class_label_vector_output_file>'
    sys.exit()

annotation_dir = sys.argv[1]
audio_files_dir = sys.argv[2]
mfcc_vector_output_file = sys.argv[3]
class_label_vector_output_file = sys.argv[4]

def getframelevelanno(noise_mix_speech_ano_file):
    #MFCC Parameters
    samplerate = 16000
    winlen = 0.025
    winstep = 0.01
    winweight = signal.hamming(int(winlen*samplerate))
    winweight = winweight/sum(winweight)
    annotation = [line.strip() for line in open(noise_mix_speech_ano_file, 'r')]

    window = []
    frames_ano = []

    for bv in annotation:
        window.append(float(bv))
        if len(window)==winlen*samplerate:
            frames_ano.append(sum(np.multiply(window,winweight)))
            window = window[int(winstep*samplerate):] #moving window
    frames_ano.append(sum(np.multiply(window,winweight[:len(window)]))) #the remaining samples for the last frame
    return frames_ano



def getMfccVector(noise_mix_speech_file):
    (rate, signal) = wav.read(noise_mix_speech_file)
    mfcc_vec = mfcc(signal,rate,numcep=26)
    return mfcc_vec

def getDataset(annotation_dir, audio_files_dir):
    speech_vector_final = np.zeros((1,26))
    speech_vector_final = np.delete(speech_vector_final, (0), axis=0)
    print "Speech Vector Final: "+ str(speech_vector_final.shape)

    class_label_vector_final = []
    directoryCount = 1

    for root, dirs, files in os.walk(annotation_dir):
        path = root.split('/')
        for file in files:
            if (file.lower().endswith('.wav')):
                speechFilePath = os.path.join(root,str(file))
                tmp = os.path.dirname(speechFilePath)
                root_dir_name = os.path.basename(tmp)

                annotationfilename = str(os.path.splitext(file)[0])+'.ano'
                annotation_file_fullpath = os.path.join(annotation_dir,root_dir_name,annotationfilename)
                audio_file_fullpath = os.path.join(audio_files_dir,root_dir_name,file)

                print "Annotation File: "+ annotation_file_fullpath
                print "Audio file:"+ audio_file_fullpath

                mfcc_vector = getMfccVector(audio_file_fullpath)
                class_label_vector = getframelevelanno(annotation_file_fullpath)
                (x,y) = mfcc_vector.shape
                z = len(class_label_vector)

                if (x==z):
                    print "MFCC: " + str(x)
                    print "LABEL: " + str(len(class_label_vector))
                    print "speech_v: " + str(speech_vector_final.shape)
                    print "mfcc_v: " + str(mfcc_vector.shape)
                    speech_vector_final = np.vstack((speech_vector_final,mfcc_vector))
                    class_label_vector_final.extend(class_label_vector)
        print "Directory Count: "+str(directoryCount)
        directoryCount = directoryCount+1

    return (speech_vector_final,class_label_vector_final)


#Main Routine
print "Start of the Program ....."
(speech_vector_final,class_label_vector_final) = getDataset(annotation_dir, audio_files_dir)



# Generate the various output files
#MFCC Speech
mfcc_vector_file = open(mfcc_vector_output_file, 'wb')
temp1 = scipy.sparse.coo_matrix(speech_vector_final)
cPickle.dump(temp1,mfcc_vector_file,-1)
mfcc_vector_file.close()

#Class Labels
class_label_vector_file = open(class_label_vector_output_file, 'wb')
class_label_vector_final_array = np.array(class_label_vector_final).reshape(len(class_label_vector_final),1)
temp2 = scipy.sparse.coo_matrix(class_label_vector_final_array)
cPickle.dump(temp2,class_label_vector_file,-1)
class_label_vector_file.close()

print "Final Shapes:"
print "Speech Vector:"+str(speech_vector_final.shape)
print "Class Labels:"+str(class_label_vector_final_array.shape)

print "Program completed Successfully"


--------------------------------------------------------------------------------
/speech_noise_ir_audio_mixing_script.py:
--------------------------------------------------------------------------------
import scipy.io.wavfile as wav
import scipy.io as sio
import audiotools
import numpy
import math
import pydub
import random
from features import mfcc
from features import logfbank
import os
import sys

if len(sys.argv)!=6:
    print '\nUsage: python speech_noise_ir_audio_mixing_script.py <speech_dir> <reference_file> <noise_dir> <ir_noise_dir> <output_dir>'
    print '\nCommand line arguments:'
    print '<speech_dir>: Root directory where all the VAD speaker speech data is located'
    print '<reference_file>: Reference file for the calculation of ReplayGain, named "ref_pink.wav"'
    print '<noise_dir>: Root directory where all the noise data is located'
    print '<ir_noise_dir>: Root directory where all the impulse response noise samples are located'
    print '<output_dir>: Root directory for storing all the output files in the same directory hierarchy as the input speech directory'
    print '\n'
    sys.exit()

#Command Line Arguments
speakerSpeechDir = sys.argv[1]
referenceFile = sys.argv[2]
noiseFileDir = sys.argv[3]
irNoiseFileDir = sys.argv[4]
output_root_directory = sys.argv[5]

#Create the output directory if it's not already present in the filesystem.
if not os.path.exists(output_root_directory):
    os.makedirs(output_root_directory)

def audioeval(speechFile, referenceFile, noiseFile, root_dir_name, output_root_directory, ir_noise_file):
    "This function evaluates a single audio file."
    print 'Noise File Path: '+str(noiseFile)
    print 'IR-Noise File Path: '+str(ir_noise_file)
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # ReplayGain calculation of reference
    ref = audiotools.open(referenceFile)
    ref_replay_gain = audiotools.calculate_replay_gain([ref])
    ref_track_gain = list(list(ref_replay_gain)[0])[1]
    #print ref_track_gain

    # ReplayGain calculation of example speech file
    speech = audiotools.open(speechFile)
    speech_replay_gain = audiotools.calculate_replay_gain([speech])
    speech_track_gain = list(list(speech_replay_gain)[0])[1]
    #print speech_track_gain

    # Normalization of example speech file
    (rate_speech, data_speech) = wav.read(speechFile)
    gain = ref_track_gain-speech_track_gain
    data_normalized = numpy.asarray(data_speech*math.pow(10, (-(gain)/20)), dtype=numpy.int16)
    normalizedFile = "speech_normalized.wav"
    wav.write(normalizedFile, rate_speech, data_normalized)

    # Loudness test of normalized example speech
    test = audiotools.open(normalizedFile)
    test_replay_gain = audiotools.calculate_replay_gain([test])
    test_track_gain = list(list(test_replay_gain)[0])[1]
    #print test_track_gain

    # Randomly choosing one noise file from the pool
    # here I just fix one waiting for implementation later

    # Using the pydub API to calculate the length of the normalized speech file and the noise file
    speech_normalized = pydub.AudioSegment.from_wav(normalizedFile)

    # We have converted all the noise files to 16 bit int format and then passed the directory location to randomly choose noise files, which is different for each speech file.
    noise = pydub.AudioSegment.from_wav(noiseFile)
    speech_normalized_length = speech_normalized.duration_seconds
    noise_length = noise.duration_seconds

    # Selecting a random start point in the noise file to get a segment of the required length
    start = random.randrange(0,int(noise_length-speech_normalized_length)*1000)
    # pydub does things in milliseconds
    noise_segmented = noise[start:int(start+speech_normalized_length*1000)]
    noise_segmented.export("noise_segmented.wav",format="wav")

    # Linear fading of the sharply segmented noise segment
    # 1 sec fade in, 1 sec fade out
    noise_faded = noise_segmented.fade_in(1000).fade_out(1000)
    noise_faded.export("noise_faded.wav",format="wav")

    # how long is good? 1 sec?
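    # NOTE (added): the SNR adjustment below works entirely in dB. For a target SNR S
    # and measured levels speech_dB and noise_dB, applying
    #     gain = S - (speech_dB - noise_dB)
    # to the speech gives (speech_dB + gain) - noise_dB == S. For example, speech at
    # -20 dBFS, noise at -30 dBFS and a target SNR of 5 dB yield gain = 5 - 10 = -5 dB.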

    # Picking a random signal to noise ratio (SNR)
    SNR_ratio = random.randint(-2, 20)
    #print "SNR_ratio: " + str(SNR_ratio)

    # loudness in dBFS (Decibels relative to full scale)
    # (all peak measurements will be negative numbers)
    speech_dB = speech_normalized.dBFS
    noise_dB = noise_segmented.dBFS
    #print "loudness of speech: " + str(speech_dB)
    #print "loudness of noise: " + str(noise_dB)

    # Change the amplitude (generally, loudness) of the speech by the SNR ratio relative to the noise.
    # Gain is specified in dB.
    gain = SNR_ratio-(speech_dB-noise_dB)
    #print "gain: " + str(gain)
    speech_SNRed = speech_normalized.apply_gain(gain)
    #print "loudness of adjusted speech: " + str(speech_SNRed.dBFS)
    # check SNR
    #print "check SNR: " + str(speech_SNRed.dBFS - noise_dB)

    # mix the two tracks by adding the respective samples
    # (If the overlaid AudioSegment is longer than this one, the result will be truncated)
    noisy_speech = speech_SNRed.overlay(noise_segmented)
    noisy_speech.export("noisy_speech.wav",format="wav")
    # Since the sample values have increased through the summation, it is possible that they exceed the maximum imposed by the data type. How does this API deal with that problem?


    # draw an impulse response from the pool
    # ...waiting to implement

    # peak-normalize it to 0dB (=1) by dividing the IR vector through its maximum value.
    (rate_IR, data_IR) = wav.read(ir_noise_file)

    # data_IR.dtype is int16, change it into float64
    data_IR = data_IR.astype(numpy.float64)/65536.0
    data_IR = data_IR / data_IR.max()

    # convolve speech with the normalized IR
    (rate_noisy_speech, data_noisy_speech) = wav.read("noisy_speech.wav")
    speech_convolved = numpy.convolve(data_IR, data_noisy_speech)

    #print "Root Directory Name: "+str(root_dir_name)
    output_directory = os.path.join(output_root_directory, root_dir_name)
    #print output_directory

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    #speech_convolved_file = output_directory+'/'+str(os.path.splitext(speechFile)[0])+"_convolved.wav"
    speech_convolved_file_name = os.path.basename(speechFile)
    #print "Speech File Name: "+str(speech_convolved_file_name)
    speech_convolved_file = os.path.join(output_directory, speech_convolved_file_name)
    print "Final output file path: "+str(speech_convolved_file)

    # cut the convolved track to its original length if prolonged and store the resulting track
    wav.write(speech_convolved_file, rate_noisy_speech, speech_convolved[:data_noisy_speech.size])

    # MFCC CODE *********** COMMENTED OUT **************
    # MFCC Feature extraction
    # Do the default parameters (frame size etc.) work for you?
    #(rate,sig) = wav.read(speech_convolved_file)
    #mfcc_feat = mfcc(sig,rate)
    #print "MFCC Shape:"
    #print mfcc_feat.shape
    #print mfcc_feat
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    ## Cleanup code which deletes the intermediate files which get generated.
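    # (added, hypothetical sketch) the cleanup itself is not implemented; deleting the
    # intermediate files written above would look like this:
    # for tmp in ("speech_normalized.wav", "noise_segmented.wav",
    #             "noise_faded.wav", "noisy_speech.wav"):
    #     if os.path.exists(tmp):
    #         os.remove(tmp)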
    return

noise_file_list = []
ir_file_list = []

for root, dirs, files in os.walk(noiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            noise_file_list.append(file)


for root, dirs, files in os.walk(irNoiseFileDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            ir_file_list.append(file)

# traverse root directory, and list directories as dirs and files as files
for root, dirs, files in os.walk(speakerSpeechDir):
    path = root.split('/')
    #print (len(path) - 1) *'---' , os.path.basename(root)
    for file in files:
        if (file.lower().endswith('.wav')):
            #print 'Current File: ', file
            speechFilePath = os.path.join(root,str(file))
            tmp = os.path.dirname(speechFilePath)
            #print "Root: "+str(os.path.basename(tmp))
            #print os.path.basename(speechFilePath)
            root_dir_name = os.path.basename(tmp)
            #print "Main Root Directory Name: "+str(os.path.basename(root))

            #Choose a random noise and IR file from the directories specified in the command line arguments.
            noiseFile = os.path.join(noiseFileDir,random.choice(noise_file_list))
            ir_noise_file = os.path.join(irNoiseFileDir,random.choice(ir_file_list))

            audioeval(speechFilePath,referenceFile,noiseFile,root_dir_name,output_root_directory,ir_noise_file)
print "Operations Finished!"

--------------------------------------------------------------------------------
/svm_modelling.py:
--------------------------------------------------------------------------------
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
import numpy as np
import time
import sys
import cPickle as pickle
import scipy.sparse


if len(sys.argv)!=3:
    print '\nUsage: python svm_modelling.py <speech_vector_file> <class_label_file>'
    sys.exit()

speech_vector_file = sys.argv[1]
class_label_file = sys.argv[2]
#cross_validation_folds_number = sys.argv[3]

#We have chosen l2 normalization to normalize the mfcc speech vector over the entire set of frames.
#Training Data -- Speech Vector File
with open(speech_vector_file, 'rb') as infile1:
    InputData = pickle.load(infile1)
    InputDataSpeech = preprocessing.normalize(InputData,norm='l2')
infile1.close()

# Target Values -- Class Label Files.
with open(class_label_file, 'rb') as infile2:
    TargetData = pickle.load(infile2)
    TargetClassLabelTemp = preprocessing.normalize(TargetData,norm='l2')
infile2.close()

print InputDataSpeech.shape
print TargetClassLabelTemp.shape

TargetClassLabel = np.array(scipy.sparse.coo_matrix((TargetClassLabelTemp),dtype=np.float64).toarray()).tolist()
TargetClassLabel = map(str, TargetClassLabel)
#Recording the start time.
start = time.time()

#Choosing SVM as our machine learning model.
#clf_model = svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', multi_class='ovr', penalty='l2',random_state=None, tol=0.0001, verbose=0)
clf_model = svm.LinearSVC()

print 'Starting Cross Validation'

# fit() is not required when cross validating.
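# NOTE (added): cross_val_score clones the estimator and fits one fresh copy per fold
# internally, so an explicit fit() beforehand would be redundant work.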
#clf = clf_model.fit(InputDataSpeech,TargetClassLabel)

scores = cross_validation.cross_val_score(clf_model, InputDataSpeech, TargetClassLabel, cv=10)
print "\nFinal Accuracy Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

#Recording the end time.
end = time.time()


print "Total execution time in minutes :: >>"
print (end - start)/60

print 'Task is Finished!'
--------------------------------------------------------------------------------