├── to_speech.py ├── README.md ├── calculate_features.py ├── models.py ├── code_yan_lstm.py ├── pitch.py ├── code.py └── cf.py /to_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gtts import gTTS 3 | import numpy as np 4 | 5 | def to_speech(text): 6 | tts = gTTS(text=text, lang = 'en') 7 | return tts 8 | 9 | def save(tts, filename): 10 | tts.save(filename + '.mp3') 11 | 12 | data = {'A':158, 'E':9307, 'M':1318, 'R':576, 'T':637, 'N':5707} 13 | labels = ['A','E','M','R','T','N'] 14 | values = [158, 9307, 1318, 576, 637, 5707] 15 | 16 | bounds = [np.sum(values[:i]) for i in xrange(0, 7)] 17 | print bounds 18 | 19 | n = np.sum(values) 20 | 21 | print np.sum(values) 22 | print values/np.sum(values, dtype=float) 23 | p_labels = [] 24 | t_labels = [] 25 | 26 | 27 | for i in xrange(n): 28 | a = int(n * np.random.random()) 29 | for j in xrange(6): 30 | if bounds[j] <= a and a < bounds[j+1]: 31 | p_labels.append(labels[j]) 32 | if bounds[j] <= i and i < bounds[j+1]: 33 | t_labels.append(labels[j]) 34 | 35 | p_labels = np.array(p_labels) 36 | t_labels = np.array(t_labels) 37 | 38 | print p_labels[p_labels == t_labels].shape[0] / float(t_labels.shape[0]) 39 | 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Audio emotion recognition 2 | 3 | ## Description 4 | 5 | The goal of this project is to provide scripts that verify our emotion recognition approach. 6 | 7 | ### Requirements 8 | 9 | * numpy 10 | * sklearn 11 | * librosa (for pitch estimation, optional) 12 | * keras 13 | 14 | ### Project structure 15 | The code consists of three main .py files. 16 | 17 | * models.py - contains Keras implementations of LSTM and multilayer perceptron neural networks, plus the sklearn models 18 | 19 | Main methods to use in emotion recognition: 20 | * train_mpc(train_x, train_y, test_x, test_y) - trains a multilayer perceptron. 21 | * train_lstm(train_x, train_y, test_x, test_y) - trains an LSTM NN 22 | * train_rfc(train_x, train_y, options) - trains a Random Forest classifier 23 | 24 | All methods return a model object and also report precision and loss on the test sample for inline validation. 25 | 26 | * calculate_features.py - script for feature estimation 27 | 28 | The method `calculate_features(signal, freq, options)` returns the feature set for all 0.2 sec frames in the signal. `freq` is the signal framerate. If the `use_derivatives` flag is true, the method also includes the 1st and 2nd time deltas of the features. 29 | To calculate features, the cf.py code is used. It is based on https://github.com/tyiannak/pyAudioAnalysis/blob/master/audioFeatureExtraction.py with minor improvements. 30 | 31 | By default it returns 32 MFCC, spectral and chromagram features. 32 | 33 | * code_yan_lstm.py - implementation of an LSTM NN for emotion recognition. 34 | 35 | The code can be divided into 3 blocks: 36 | * Data reading 37 | * Data preprocessing 38 | * The main part, which includes model building and validation 39 | 40 | To read the data successfully, choose a regime from ['aibo4', 'aibo5', 'iemocap'] and correct the paths to wavs, labels, etc. The aibo4 and aibo5 regimes correspond to different label sets over the AIBO database. 41 | 42 | Data preprocessing consists of normalization, padding sequences, transformation of labels from categorical to vector form, balancing and resampling. 43 | 44 | The main part is a 5-fold cross-validation procedure over the loaded database. It splits the sample into train and test parts, trains models on the train part and validates on the test part, as sketched below.
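Below is a minimal, illustrative sketch of that split (not a verbatim excerpt from code_yan_lstm.py; the `ids` array here is a placeholder for the identifiers the script builds with `get_field(data, 'id')`):

```python
import numpy as np

# Placeholder ids; in code_yan_lstm.py they come from get_field(data, 'id').
ids = np.array(['utt_%03d' % i for i in range(100)])

parts = 5
permuted_ids = ids[np.random.permutation(len(ids))]
step = len(permuted_ids) // parts

for part in range(parts):
    i0, i1 = step * part, step * (part + 1)
    test_idx = permuted_ids[i0:i1]                               # held-out fold
    train_idx = np.append(permuted_ids[:i0], permuted_ids[i1:])  # remaining folds
    # Per-id feature files are then loaded from path_to_samples, padded to a
    # fixed number of frames, labels are one-hot encoded, and models.train_lstm
    # is trained on the training fold and scored on the held-out fold.
```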
After the cross-validation procedure finishes, it plots a confusion matrix comparing the predictions with the expected outputs. 45 | 46 | ## Running 47 | 48 | To run the code, enter `python code_yan_lstm.py` in a command line or IDE. In addition to correcting paths and settings, create a folder for the feature samples in the working directory. 49 | -------------------------------------------------------------------------------- /calculate_features.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import math 4 | import os 5 | import pickle 6 | import cf 7 | 8 | 9 | def weighting(x, weighting_type=None): 10 | if weighting_type == 'cos': 11 | return x * np.sin(np.pi * np.arange(len(x)) / (len(x) - 1)) 12 | elif weighting_type == 'hamming': 13 | return x * (0.54 - 0.46 * np.cos(2. * np.pi * np.arange(len(x)) / (len(x) - 1))) 14 | else: 15 | return x 16 | 17 | def calculate_features(frames, freq, options): 18 | n = len(frames) 19 | window_sec = 0.2 20 | window_n = int(freq * window_sec) 21 | use_derivatives = False 22 | 23 | st_f = cf.stFeatureExtraction(frames, freq, window_n, window_n / 2) 24 | if st_f.shape[1] > 2: 25 | i0 = 1 26 | i1 = st_f.shape[1] - 1 27 | if i1 - i0 < 1: 28 | i1 = i0 + 1 29 | if use_derivatives: 30 | deriv_st_f = np.zeros((st_f.shape[0]*3, i1 - i0), dtype=float) 31 | else: 32 | deriv_st_f = np.zeros((st_f.shape[0], i1 - i0), dtype=float) 33 | for i in xrange(i0, i1): 34 | i_left = i - 1 35 | i_right = i + 1 36 | deriv_st_f[:st_f.shape[0], i - i0] = st_f[:, i] 37 | if use_derivatives: 38 | if st_f.shape[1] >= 2: 39 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, i - i0] = (st_f[:, i_right] - st_f[:, i_left]) / 2. 40 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, i - i0] = st_f[:, i] - 0.5*(st_f[:, i_left] + st_f[:, i_right]) 41 | return deriv_st_f 42 | elif st_f.shape[1] == 2: 43 | deriv_st_f = np.zeros((st_f.shape[0], 1), dtype=float) 44 | deriv_st_f[:st_f.shape[0], 0] = st_f[:, 0] 45 | if use_derivatives: 46 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, 0] = st_f[:, 1] - st_f[:, 0] 47 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, 0] = np.zeros(st_f.shape[0]) 48 | return deriv_st_f 49 | else: 50 | deriv_st_f = np.zeros((st_f.shape[0], 1), dtype=float) 51 | deriv_st_f[:st_f.shape[0], 0] = st_f[:, 0] 52 | if use_derivatives: 53 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, 0] = np.zeros(st_f.shape[0]) 54 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, 0] = np.zeros(st_f.shape[0]) 55 | return deriv_st_f 56 | 57 | #mt_f, st_f2 = cf.mtFeatureExtraction(frames, 16000, 2*window_n, 2*window_n, window_n, window_n) 58 | #print st_f.shape, n / window_n 59 | # print st_f2.shape, n / window_n 60 | # print mt_f.shape, n / window_n 61 | # print mt_f 62 | # for i in xrange(k0, k1): 63 | # if overlapping_type in ['l', 'lr']: 64 | # i0 = (i - 1) * window_n 65 | # else: 66 | # i0 = i * window_n 67 | # if overlapping_type in ['r', 'lr']: 68 | # i1 = (i + 2) * window_n 69 | # else: 70 | # i1 = (i + 1) * window_n 71 | 72 | # x = frames[i0:i1] 73 | # print len(x), i0, i1, len(frames), n / window_n 74 | 75 | # # x = weighting(x) 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import math 4 | import os 5 | import pickle 6 | from keras.preprocessing import
sequence 7 | from sklearn.ensemble import RandomForestRegressor as RFR 8 | 9 | from keras.datasets import mnist 10 | from keras.models import Sequential 11 | from keras.layers.core import Dense, Dropout, Activation, Merge, TimeDistributedDense 12 | from keras.layers import LSTM 13 | from keras.optimizers import SGD, Adam, RMSprop 14 | from keras.utils import np_utils 15 | from keras.regularizers import l2 16 | from keras.optimizers import SGD 17 | 18 | import h5py 19 | 20 | def train_rfr(x, y, tx, ty): 21 | rfr = RFR(n_estimators=50) 22 | model = rfr.fit(x, y) 23 | 24 | return model 25 | 26 | 27 | 28 | def train(x, y, options): 29 | classes = np.unique(y) 30 | print classes 31 | 32 | 33 | classifiers = [] 34 | for c in classes: 35 | out = np.array([int(i) for i in y==c]) 36 | print c, len(out[out == 1]), len(out[out == 0]), len(out[out == 1]) + len(out[out == 0]) 37 | 38 | 39 | 40 | classifier = RFC(n_estimators=10) 41 | classifier.fit(x, out) 42 | classifiers.append(classifier) 43 | return classifiers 44 | 45 | def train_rfc(x, y, options): 46 | classifier = RFC(n_estimators=100) 47 | 48 | return classifier.fit(x, y) 49 | 50 | def fork (model, n=2): 51 | forks = [] 52 | for i in range(n): 53 | f = Sequential() 54 | f.add (model) 55 | forks.append(f) 56 | return forks 57 | 58 | 59 | def make_sample_lstm(x, n, y=None, use_y=False): 60 | if use_y: 61 | xt = np.zeros((x.shape[0], n, x.shape[1] + 1), dtype=float) 62 | t = np.zeros((x.shape[0], x.shape[1] + 1), dtype=float) 63 | for i in xrange(x.shape[0]): 64 | if i == 0: 65 | t[i, :-1] = x[i, :] 66 | t[i, -1] = 0 67 | else: 68 | t[i, :-1] = x[i, :] 69 | t[i, -1] = y[i-1] 70 | 71 | for i in xrange(x.shape[0]): 72 | if i < n: 73 | i0 = n - i 74 | xt[i, :i0, :] = np.zeros((i0, x.shape[1] + 1), dtype=float) 75 | if i > 0: 76 | xt[i, i0:, :] = t[:i, :] 77 | else: 78 | xt[i, :, :] = t[i-n:i, :] 79 | return xt 80 | else: 81 | xt = np.zeros((x.shape[0], n, x.shape[1]), dtype=float) 82 | for i in xrange(x.shape[0]): 83 | if i < n: 84 | i0 = n - i 85 | xt[i, :i0, :] = np.zeros((i0, x.shape[1]), dtype=float) 86 | if i > 0: 87 | xt[i, i0:, :] = x[:i, :] 88 | else: 89 | xt[i, :, :] = x[i-n:i, :] 90 | return xt 91 | 92 | 93 | class modelLSTM: 94 | def __init__(self, model, length, use_y): 95 | self.model = model 96 | self.n = length 97 | self.use_y = use_y 98 | def predict(self, x): 99 | if self.use_y: 100 | result = np.zeros((x.shape[0], 1), dtype=float) 101 | for i in xrange(x.shape[0]): 102 | t = np.zeros((self.n, x.shape[1] + 1), dtype=float) 103 | 104 | 105 | else: 106 | xt = make_sample_lstm(x, self.n) 107 | return self.model.predict(xt) 108 | def save(self, name_json, name_weights): 109 | json_string = self.model.to_json() 110 | open(name_json, 'w').write(json_string) 111 | self.model.save_weights(name_weights) 112 | 113 | 114 | def train_lstm_avec(x, y, xt, yt): 115 | length = 25 116 | use_y = False 117 | 118 | x_series_train = make_sample_lstm(x, length, y, use_y) 119 | print x_series_train.shape 120 | x_series_test = make_sample_lstm(xt, length, yt, use_y) 121 | print x_series_test.shape 122 | 123 | print y[:100, 0] 124 | model = Sequential() 125 | model.add(LSTM(256, return_sequences=True, input_shape=(x_series_train.shape[1], x_series_train.shape[2]))) 126 | model.add(Dropout(0.2)) 127 | model.add(Activation('tanh')) 128 | model.add(LSTM(128)) 129 | model.add(Dropout(0.2)) 130 | model.add(Activation('tanh')) 131 | # model.add(Dense(128)) 132 | # model.add(Activation('tanh')) 133 | model.add(Dense(1)) 134 | #model.add(Activation('softmax')) 135 
| model.summary() 136 | model.compile(loss='mean_absolute_error', optimizer='rmsprop') 137 | model.fit(x_series_train, y, batch_size=512, nb_epoch=50, 138 | verbose=2, validation_data=(x_series_test, yt)) 139 | 140 | return modelLSTM(model, length, use_y) 141 | 142 | 143 | 144 | 145 | 146 | def train_lstm(x, y, xt, yt): 147 | batch_size = 320 148 | nb_classes = 10 149 | nb_epoch = 25 150 | ts = x[0].shape[0] 151 | model = Sequential() 152 | model.add(LSTM(512, return_sequences=True, input_shape=(ts, x[0].shape[1]))) 153 | model.add(Activation('tanh')) 154 | # model.add(LSTM(512, return_sequences=True)) 155 | # model.add(Activation('tanh')) 156 | model.add(LSTM(256, return_sequences=False)) 157 | model.add(Activation('tanh')) 158 | model.add(Dense(512)) 159 | model.add(Activation('tanh')) 160 | model.add(Dense(y.shape[1])) 161 | model.add(Activation('softmax')) 162 | 163 | model.summary() 164 | 165 | model.compile(loss='categorical_crossentropy', optimizer=RMSprop()) 166 | 167 | 168 | 169 | # for epoch in xrange(nb_epoch): 170 | model.fit(x, y, batch_size=batch_size, nb_epoch=nb_epoch, 171 | verbose=2, validation_data=(xt, yt), show_accuracy=True) 172 | 173 | return model 174 | 175 | def train_mpc(x, y, tx, ty): 176 | 177 | batch_size = 256 178 | nb_classes = 10 179 | nb_epoch = 20 180 | 181 | model = Sequential() 182 | model.add(Dense(512, input_shape=(x.shape[1],))) 183 | model.add(Activation('relu')) 184 | model.add(Dropout(0.2)) 185 | model.add(Dense(1024)) 186 | model.add(Activation('relu')) 187 | model.add(Dense(1024)) 188 | model.add(Activation('relu')) 189 | model.add(Dropout(0.2)) 190 | model.add(Dense(512)) 191 | model.add(Activation('relu')) 192 | model.add(Dense(1024)) 193 | model.add(Activation('relu')) 194 | model.add(Dense(512)) 195 | model.add(Activation('relu')) 196 | model.add(Dense(y.shape[1])) 197 | #model.add(Activation('softmax')) 198 | 199 | #model.summary() 200 | 201 | model.compile(loss='mean_absolute_error', 202 | optimizer=RMSprop()) 203 | 204 | history = model.fit(x, y, 205 | batch_size=batch_size, nb_epoch=nb_epoch, 206 | verbose=0, validation_data=(tx, ty), show_accuracy=True) 207 | 208 | return model 209 | 210 | def validate1(classifier, test_x, test_y): 211 | predictions = classifier.predict(test_x) 212 | 213 | total_acc = predictions[predictions == test_y].shape[0] / float(predictions.shape[0]) 214 | print 'Total acc ', total_acc 215 | 216 | classes = np.unique(test_y) 217 | print test_y 218 | print predictions 219 | ans = [] 220 | for ci, c in enumerate(classes): 221 | print ci 222 | 223 | idx = np.array([ii for ii, i in enumerate(test_y==c)]) 224 | out = test_y[idx] 225 | pred = predictions[idx] 226 | 227 | tt = pred[(pred == c) * (out == c)].shape[0] / float(out[out == c].shape[0]) 228 | tf = pred[(pred == c) * (out != c)].shape[0] / float(out[out != c].shape[0]) 229 | ft = pred[(pred != c) * (out == c)].shape[0] / float(out[out == c].shape[0]) 230 | ff = pred[(pred != c) * (out != c)].shape[0] / float(out[out != c].shape[0]) 231 | print tt, tf, ft, ff, '\t', out[out == c].shape[0] / float(out.shape[0]), out[out != c].shape[0] / float(out.shape[0]) 232 | tt_tf_ft_ff = [tt, tf, ft, ff] 233 | ans.append(tt_tf_ft_ff) 234 | 235 | ans_matrix = np.zeros((len(classes), len(classes)), dtype=float) 236 | for ci, c in enumerate(classes): 237 | for ci2, c2 in enumerate(classes): 238 | ans_matrix[ci2, ci] = pred[(pred == c) * (test_y == c2)].shape[0] / float(test_y[test_y == c2].shape[0]) 239 | 240 | 241 | return np.array(ans, dtype=float), ans_matrix 242 | 243 | 244 
| class Random: 245 | def __init__(self): 246 | 1 247 | def fit(self, x, y): 248 | 1 249 | def predict(self, x): 250 | return np.random.normal(0, 0.1, size=x.shape[0]) 251 | 252 | 253 | def train_rnd(x, y, tx, ty): 254 | return Random() 255 | 256 | 257 | def validate(classifiers, test_x, test_y): 258 | print len(classifiers) 259 | classes = np.unique(test_y) 260 | ans = [] 261 | for ci, c in enumerate(classes): 262 | print ci 263 | out = np.array([int(i) for i in test_y==c]) 264 | predictions = classifiers[ci].predict(test_x) 265 | tt = predictions[(predictions == 1) * (out == 1)].shape[0] / float(out[out == 1].shape[0]) 266 | tf = predictions[(predictions == 1) * (out == 0)].shape[0] / float(out[out == 0].shape[0]) 267 | ft = predictions[(predictions == 0) * (out == 1)].shape[0] / float(out[out == 1].shape[0]) 268 | ff = predictions[(predictions == 0) * (out == 0)].shape[0] / float(out[out == 0].shape[0]) 269 | print tt, tf, ft, ff, '\t', out[out == 1].shape[0] / float(out.shape[0]), out[out == 0].shape[0] / float(out.shape[0]) 270 | tt_tf_ft_ff = [tt, tf, ft, ff] 271 | ans.append(tt_tf_ft_ff) 272 | 273 | return np.array(ans, dtype=float) 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /code_yan_lstm.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as ticker 6 | import wave 7 | import numpy as np 8 | import math 9 | import os 10 | import pickle 11 | import models 12 | import csv 13 | 14 | import calculate_features as cf 15 | import models 16 | 17 | np.random.seed(200) 18 | 19 | regime = 'aibo5' 20 | 21 | def get_params(regime): 22 | if regime == 'iemocap': 23 | #available_emotions = ['ang', 'exc', 'fru', 'neu', 'sad'] 24 | available_emotions = ['ang', 'exc', 'neu', 'sad'] 25 | path_to_samples = 'iem_samples/' 26 | conf_matrix_prefix = 'iemocap' 27 | framerate = 44100 28 | return available_emotions, '', '', '', path_to_samples, conf_matrix_prefix, framerate, '', 0 29 | elif regime == 'aibo4': 30 | available_emotions = ['N', 'M', 'E', 'A'] 31 | 32 | path_to_wav = 'aibo_data/wav/' 33 | path_to_transcription = 'aibo_data/transliteration/' 34 | path_to_labels = 'aibo_data/labels/CEICES/' 35 | path_to_samples = 'yan_samples_lstm4/' 36 | conf_matrix_prefix = 'aibo4' 37 | labels_file = 'word_labels_4cl_aibo_word_set.txt' 38 | label_pos = 2 39 | framerate = 16000 40 | return available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos 41 | else: 42 | available_emotions = ['N', 'R', 'E', 'A', 'P'] 43 | 44 | path_to_wav = 'aibo_data/wav/' 45 | path_to_transcription = 'aibo_data/transliteration/' 46 | path_to_labels = 'aibo_data/labels/IS2009EmotionChallenge/' 47 | path_to_samples = 'yan_samples_lstm5/' 48 | conf_matrix_prefix = 'aibo5' 49 | labels_file = 'chunk_labels_5cl_corpus.txt' 50 | framerate = 16000 51 | label_pos = 1 52 | return available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos 53 | 54 | available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos = get_params(regime) 55 | 56 | segmentation = 'by_phrase' 57 | 58 | types = {1: np.int8, 2: np.int16, 4: np.int32} 59 | 60 | def 
open_wav(path_to_wav, filename): 61 | wav = wave.open(path_to_wav + filename, mode="r") 62 | (nchannels, sampwidth, framerate, nframes, comptype, compname) = wav.getparams() 63 | content = wav.readframes(nframes) 64 | 65 | samples = np.fromstring(content, dtype=types[sampwidth]) 66 | return (nchannels, sampwidth, framerate, nframes, comptype, compname), samples 67 | 68 | def read_lines(f): 69 | lines = open(f, 'r').read() 70 | return np.array(lines.split('\n')) 71 | 72 | 73 | def get_needed_lines(lines, i, subline): 74 | result = [] 75 | j = i 76 | condition = True 77 | while condition: 78 | if lines[j][:len(subline)] == subline: 79 | result.append(lines[j]) 80 | j += 1 81 | else: 82 | condition = False 83 | return result, j 84 | 85 | def get_label(expert_grades): 86 | u = np.unique(expert_grades) 87 | grade = u[0] 88 | count = expert_grades[expert_grades == grade].shape[0] 89 | for ui in u: 90 | current_count = expert_grades[expert_grades == ui].shape[0] 91 | if current_count > count: 92 | grade = ui 93 | count = current_count 94 | return grade 95 | 96 | 97 | 98 | def read_aibo_data(): 99 | data = [] 100 | files = np.sort([f[:-4] + '.wav' for f in os.listdir(path_to_wav)]) 101 | transliterations = read_lines(path_to_transcription + 'transliteration.txt') 102 | labels = read_lines(path_to_labels + labels_file) 103 | index_t = 0 104 | index_l = 0 105 | 106 | c = 0 107 | for fi, f in enumerate(files): 108 | d = {} 109 | if fi % 1000 == 0: 110 | print f, fi, ' out of ', len(files) 111 | w = open_wav(path_to_wav, f) 112 | signal = w[1] 113 | length = w[0][3] / float(w[0][2]) 114 | t, index_t = get_needed_lines(transliterations, index_t, f[:-4]) 115 | l, index_l = get_needed_lines(labels, index_l, f[:-4]) 116 | if l != []: 117 | grades = [] 118 | 119 | em = l[0].split(' ')[label_pos][0] 120 | d['id'] = f[:-4] 121 | d['emotion'] = em 122 | d['signal'] = signal 123 | d['transcription'] = t[0][14:-2] 124 | d['length'] = length 125 | if (d['emotion'] in available_emotions) and (d['length'] > 0.8): 126 | data.append(d) 127 | else: 128 | c += 1 129 | 130 | print 'missed: ', c 131 | 132 | return data 133 | 134 | 135 | sessions = ['Session1', 'Session2', 'Session3', 'Session4', 'Session5'] 136 | 137 | def get_transcriptions(path_to_transcriptions, filename): 138 | f = open(path_to_transcriptions + filename, 'r').read() 139 | f = np.array(f.split('\n')) 140 | transcription = {} 141 | 142 | for i in xrange(len(f) - 1): 143 | g = f[i] 144 | i1 = g.find(': ') 145 | i0 = g.find(' [') 146 | ind_id = g[:i0] 147 | ind_ts = g[i1+2:] 148 | transcription[ind_id] = ind_ts 149 | return transcription 150 | 151 | def split_wav(wav, emotions): 152 | (nchannels, sampwidth, framerate, nframes, comptype, compname), samples = wav 153 | duration = nframes / framerate 154 | 155 | left = samples[0::nchannels] 156 | right = samples[1::nchannels] 157 | 158 | frames = [] 159 | for ie, e in enumerate(emotions): 160 | start = e['start'] 161 | end = e['end'] 162 | 163 | e['right'] = right[int(start * framerate):int(end * framerate)] 164 | e['left'] = left[int(start * framerate):int(end * framerate)] 165 | 166 | frames.append({'left':e['left'], 'right': e['right']}) 167 | return frames 168 | 169 | def get_emotions(path_to_emotions, filename): 170 | f = open(path_to_emotions + filename, 'r').read() 171 | # print f 172 | # print f.split('\n') 173 | f = np.array(f.split('\n'))#np.append(np.array(['']), np.array(f.split('\n'))) 174 | c = 0 175 | idx = f == '' 176 | idx_n = np.arange(len(f))[idx] 177 | emotion = [] 178 | for i in 
xrange(len(idx_n) - 2): 179 | g = f[idx_n[i]+1:idx_n[i+1]] 180 | head = g[0] 181 | i0 = head.find(' - ') 182 | start_time = float(head[head.find('[') + 1:head.find(' - ')]) 183 | end_time = float(head[head.find(' - ') + 3:head.find(']')]) 184 | actor_id = head[head.find(filename[:-4]) + len(filename[:-4]) + 1:head.find(filename[:-4]) + len(filename[:-4]) + 5] 185 | emo = head[head.find('\t[') - 3:head.find('\t[')] 186 | vad = head[head.find('\t[') + 1:] 187 | 188 | v = float(vad[1:7]) 189 | a = float(vad[9:15]) 190 | d = float(vad[17:23]) 191 | 192 | emotion.append({'start':start_time, 193 | 'end':end_time, 194 | 'id':filename[:-4] + '_' + actor_id, 195 | 'v':v, 196 | 'a':a, 197 | 'd':d, 198 | 'emotion':emo}) 199 | return emotion 200 | 201 | def read_iemocap_data(): 202 | data = [] 203 | for session in sessions: 204 | print session 205 | path_to_wav = 'iemocap_data/' + session + '/dialog/wav/' 206 | path_to_emotions = 'iemocap_data/' + session + '/dialog/EmoEvaluation/' 207 | path_to_transcriptions = 'iemocap_data/' + session + '/dialog/transcriptions/' 208 | 209 | files = os.listdir(path_to_wav) 210 | files = [f[:-4] for f in files] 211 | print len(files) 212 | print files 213 | for f in files: 214 | emotions = get_emotions(path_to_emotions, f + '.txt') 215 | wav = open_wav(path_to_wav, f + '.wav') 216 | sample = split_wav(wav, emotions) 217 | 218 | transcriptions = get_transcriptions(path_to_transcriptions, f + '.txt') 219 | for ie, e in enumerate(emotions): 220 | if e['emotion'] in available_emotions: 221 | e['signal'] = sample[ie]['left'] 222 | #e['right'] = sample[ie]['right'] 223 | e['transcription'] = transcriptions[e['id']] 224 | data.append(e) 225 | return data 226 | 227 | def read_data(): 228 | if regime == 'aibo4' or regime == 'aibo5': 229 | return read_aibo_data() 230 | else: 231 | return read_iemocap_data() 232 | 233 | def check_all_finite(X): 234 | X = np.asanyarray(X) 235 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 236 | and not np.isfinite(X).all()): 237 | return True 238 | else: 239 | return False 240 | 241 | def to_categorical(y): 242 | y_cat = np.zeros((len(y), len(available_emotions)), dtype=int) 243 | for i in xrange(len(y)): 244 | y_cat[i, :] = np.array(np.array(available_emotions) == y[i], dtype=int) 245 | 246 | return y_cat 247 | 248 | def save_sample(x, y, name): 249 | with open(name, 'w') as csvfile: 250 | w = csv.writer(csvfile, delimiter=',') 251 | for i in xrange(x.shape[0]): 252 | row = x[i, :].tolist() 253 | row.append(y[i]) 254 | w.writerow(row) 255 | 256 | def load(name): 257 | with open(name, 'r') as csvfile: 258 | r = csv.reader(csvfile, delimiter=',') 259 | x = [] 260 | y = [] 261 | for row in r: 262 | x.append(row[:-1]) 263 | y.append(row[-1]) 264 | return np.array(x, dtype=float), np.array(y) 265 | 266 | def get_features(data, save=True, path=path_to_samples): 267 | failed_samples = [] 268 | for di, d in enumerate(data): 269 | if di%1000 == 0: 270 | print di, ' out of ', len(data) 271 | st_features = cf.calculate_features(d['signal'], framerate, None).T 272 | x = [] 273 | y = [] 274 | for f in st_features: 275 | if f[1] > 1.e-4: 276 | x.append(f) 277 | y.append(d['emotion']) 278 | x = np.array(x, dtype=float) 279 | y = np.array(y) 280 | 281 | if save: 282 | save_sample(x, y, path + d['id'] + '.csv') 283 | return x, y 284 | 285 | def get_field(data, key): 286 | return np.array([e[key] for e in data]) 287 | 288 | 289 | def balance_sample(x, y, size): 290 | labels = np.unique(y) 291 | xc = {} 292 | yc = {} 293 | for l in 
labels: 294 | xc[l] = x[y == l] 295 | yc[l] = y[y == l] 296 | 297 | 298 | s = size / len(labels) 299 | 300 | tx = np.zeros((s*len(labels), x.shape[1]), dtype=float) 301 | ty = np.zeros(s*len(labels), dtype=str) 302 | for i in xrange(len(labels)): 303 | j = i*s 304 | n = xc[labels[i]].shape[0] 305 | 306 | idx = np.random.randint(low=0, high=n, size=s) 307 | tx[j:j+s, :] = xc[labels[i]][idx, :] 308 | ty[j:j+s] = yc[labels[i]][idx] 309 | 310 | return tx, ty 311 | 312 | def get_sample(idx, path): 313 | tx = [] 314 | ty = [] 315 | for i in idx: 316 | x, y = load(path + '/' + i + '.csv') 317 | if len(x) < 40: 318 | tx.append(np.array(x, dtype=float)) 319 | #tx.append(np.array(x, dtype=float)) 320 | ty.append(y[0]) 321 | 322 | tx = np.array(tx) 323 | ty = np.array(ty) 324 | return tx, ty 325 | 326 | def normalize(x): 327 | gminx = np.zeros(x[0].shape[1]) + 1.e5 328 | gmaxx = np.zeros(x[0].shape[1]) - 1.e5 329 | for i in xrange(x.shape[0]): 330 | q = x[i] 331 | minx = np.min(q, axis=0) 332 | maxx = np.max(q, axis=0) 333 | 334 | for s in xrange(x[0].shape[1]): 335 | if gminx[s] > minx[s]: 336 | gminx[s] = minx[s] 337 | if gmaxx[s] < maxx[s]: 338 | gmaxx[s] = maxx[s] 339 | 340 | for i in xrange(x.shape[0]): 341 | for s in xrange(x[0].shape[1]): 342 | x[i][:, s] = (x[i][:, s] - gminx[s]) / float(gmaxx[s] - gminx[s]) 343 | 344 | 345 | return x 346 | 347 | def grow_sample(x, y, n=10000): 348 | xg = [] 349 | yg = [] 350 | eps = 5.*1.e-2 351 | for i in xrange(n): 352 | j = np.random.randint(x.shape[0]) 353 | x0 = x[j] 354 | x0 += eps * np.random.normal(0, 1, size=x0.shape) 355 | y0 = y[j] 356 | xg.append(x0) 357 | yg.append(y0) 358 | return np.array(xg), np.array(yg) 359 | 360 | def reshape_for_dense(x, y): 361 | j = 0 362 | xr = [] 363 | yr = [] 364 | for i in xrange(x.shape[0]): 365 | for k in xrange(x[i].shape[0]): 366 | xr.append(x[i][k, :]) 367 | yr.append(y[i]) 368 | return np.array(xr), np.array(yr) 369 | 370 | 371 | def pad_sequence(x, ts): 372 | xp = [] 373 | for i in xrange(x.shape[0]): 374 | x0 = np.zeros((ts, x[i].shape[1]), dtype=float) 375 | if ts > x[i].shape[0]: 376 | x0[ts - x[i].shape[0]:, :] = x[i] 377 | else: 378 | maxe = np.sum(x[i][0:ts, 1]) 379 | for j in xrange(x[i].shape[0] - ts): 380 | if np.sum(x[i][j:j + ts, 1]) > maxe: 381 | x0 = x[i][j:j + ts, :] 382 | maxe = np.sum(x[i][j:j + ts, 1]) 383 | xp.append(x0) 384 | return np.array(xp) 385 | 386 | 387 | 388 | data = np.array(read_data()) 389 | print data 390 | print len(data) 391 | 392 | ids = get_field(data, 'id') 393 | emotions = get_field(data, 'emotion') 394 | print np.unique(emotions) 395 | 396 | for i in xrange(len(available_emotions)): 397 | print available_emotions[i], emotions[emotions == available_emotions[i]].shape[0] 398 | 399 | parts = 5 400 | permutation = np.random.permutation(len(data)) 401 | permuted_ids = ids[permutation] 402 | 403 | step = len(data) / parts 404 | 405 | preds = [] 406 | trues = [] 407 | 408 | get_features(data) 409 | 410 | for part in xrange(parts): 411 | i0 = step * part 412 | i1 = step * (part + 1) 413 | 414 | train_idx = np.append(permuted_ids[:i0], permuted_ids[i1:]) 415 | test_idx = permuted_ids[i0:i1] 416 | 417 | train_x, train_y = get_sample(train_idx, path_to_samples) 418 | 419 | 420 | 421 | # energies = [] 422 | # for i in xrange(train_x.shape[0]): 423 | # energies.append(np.mean(train_x[i][:, 1])) 424 | 425 | # quantile = 0.5 426 | # largest_energy_idx = np.argsort(energies)[:int(quantile*len(energies))] 427 | 428 | # train_x = train_x[largest_energy_idx] 429 | # train_y = 
train_y[largest_energy_idx] 430 | 431 | test_x, test_y = get_sample(test_idx, path_to_samples) 432 | 433 | # train_x = normalize(train_x) 434 | # test_x = normalize(test_x) 435 | 436 | #train_x, train_y = grow_sample(train_x, train_y, 10000) 437 | #train_x, train_y = reshape_for_dense(train_x, train_y) 438 | print train_x.shape 439 | print train_y.shape 440 | 441 | # lengths = {} 442 | # for i in xrange(train_x.shape[0]): 443 | # if train_x[i].shape[0] in lengths.keys(): 444 | # lengths[train_x[i].shape[0]] += 1 445 | # else: 446 | # lengths[train_x[i].shape[0]] = 1 447 | 448 | # for k, v in lengths.items(): 449 | # print k, v 450 | 451 | ts = 32 452 | 453 | train_x = pad_sequence(train_x, ts) 454 | test_x = pad_sequence(test_x, ts) 455 | 456 | train_y_cat = to_categorical(train_y) 457 | test_y_cat = to_categorical(test_y) 458 | model = models.train_lstm(train_x, train_y_cat, test_x, test_y_cat) 459 | 460 | scores = model.predict(test_x) 461 | prediction = np.array([available_emotions[np.argmax(t)] for t in scores]) 462 | print prediction[prediction == test_y].shape[0] / float(prediction.shape[0]) 463 | #test_x, test_y = get_sample(train_idx, 'yan_samples_lstm4') 464 | 465 | for i in xrange(len(prediction)): 466 | preds.append(prediction[i]) 467 | trues.append(test_y[i]) 468 | 469 | class_to_class_precs = np.zeros((len(available_emotions), len(available_emotions)), dtype=float) 470 | 471 | preds = np.array(preds) 472 | trues = np.array(trues) 473 | 474 | print 'Total accuracy: ', preds[preds == trues].shape[0] / float(preds.shape[0]) 475 | 476 | for cpi, cp in enumerate(available_emotions): 477 | for cti, ct in enumerate(available_emotions): 478 | #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 479 | if trues[trues == ct].shape[0] > 0: 480 | class_to_class_precs[cti, cpi] = preds[(preds == cp) * (trues == ct)].shape[0] / float(trues[trues == ct].shape[0]) 481 | else: 482 | class_to_class_precs[cti, cpi] = 0. 
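# class_to_class_precs[cti, cpi] holds the fraction of samples whose true label is
# available_emotions[cti] that were predicted as available_emotions[cpi]: rows index the
# true class, columns the predicted class, and each row sums to 1 whenever that true class
# occurs in `trues`. The plotting code below draws the transposed matrix as a heatmap and
# writes one truncated value per cell as a text annotation.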
483 | 484 | 485 | fig, ax = plt.subplots() 486 | heatmap = ax.pcolor(class_to_class_precs.T, cmap=plt.cm.Blues) 487 | 488 | ax.set_xticklabels(available_emotions, minor=False) 489 | ax.set_yticklabels(available_emotions, minor=False) 490 | 491 | ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 492 | ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 493 | 494 | for i in xrange(len(available_emotions)): 495 | for j in xrange(len(available_emotions)): 496 | plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 497 | 498 | plt.savefig(conf_matrix_prefix + 'lstm.png') 499 | plt.close() 500 | 501 | -------------------------------------------------------------------------------- /pitch.py: -------------------------------------------------------------------------------- 1 | """this is the module version""" 2 | 3 | import librosa 4 | import numpy as np 5 | import math 6 | 7 | # --------------------------------------------------------------------------- # 8 | # utilities 9 | # --------------------------------------------------------------------------- # 10 | 11 | def ratio_to_cents(r): 12 | return 1200.0 * np.log2(r) 13 | 14 | def ratio_to_cents_protected(f1, f2): 15 | """avoid divide by zero in here, but expects np.arrays""" 16 | out = np.zeros_like(f1) 17 | key = (f1!=0.0) * (f2!=0.0) 18 | out[key] = 1200.0 * np.log2(f1[key]/f2[key]) 19 | out[f1==0.0] = -np.inf 20 | out[f2==0.0] = np.inf 21 | out[(f1==0.0) * (f2==0.0)] = 0.0 22 | return out 23 | 24 | def cents_to_ratio(c): 25 | return np.power(2, c/1200.0) 26 | 27 | def freq_to_midi(f): 28 | return 69.0 + 12.0 * np.log2(f/440.0) 29 | 30 | def midi_to_freq(m): 31 | return np.power(2, (m-69.0)/12.0) * 440.0 if m!= 0.0 else 0.0 32 | 33 | def bin_to_freq(b, sr, n_fft): 34 | return b * float(sr) / float(n_fft) 35 | 36 | def freq_to_bin(f, sr, n_fft): 37 | return np.round(f/(float(sr)/float(n_fft))).astype('int') 38 | 39 | # --------------------------------------------------------------------------- # 40 | # ppitch 41 | # --------------------------------------------------------------------------- # 42 | 43 | def ppitch(y, sr=44100, n_fft=8820, win_length=1024, hop_length=2048, 44 | num_peaks=20, num_pitches=3, min_peak=2, max_peak=743, min_fund=55.0, 45 | max_fund=1000.0, harm_offset=0, harm_rolloff=0.75, ml_width=25, 46 | bounce_width=0, bounce_hist=0, bounce_ratios=None, max_harm=32767, 47 | npartial=7): 48 | 49 | """Polyphonic pitch estimation. 
50 | parameters: 51 | - y: signal to analyze 52 | - sr: sample rate 53 | - n_fft: fft size 54 | - win_length: fft window size 55 | - hop_length: stft hop 56 | - num_peaks: number of peaks to find 57 | - num_pitches: number of pitches ot find 58 | - min_peak: min peak frequency (Hz) <- NOT IMPLEMENTED 59 | - max_peak: max peak frequency (Hz) 60 | - min_fund: min fundamental freq 61 | - max_fund: max fundamental freq 62 | - harm_offset: deprecated 63 | - harm_rolloff: deprecated (use if npartial is None) 64 | - ml_width: cents range to include peak in maximum likliehood 65 | - bounce_width: bounce freqs w/in this +/- this range from bounce_ratios 66 | - bounce_hist: bounce freqs from previous frames as well 67 | - bounce_ratios: bounce freqs w/in +/- bounce_width of these ratios 68 | - max_harm: basically deprecated, don't use harms above this one 69 | - npartial: partial that is half weight 70 | returns: 71 | - pitches 72 | - STFT 73 | - peaks 74 | todo: 75 | -> stats for how much bounced 76 | notes 77 | - changing to npartial instead of harm_rolloff, 78 | but if npartial=None, use harm_rolloff 79 | - previously was nearest_multiple+1 # to avoid divide by 0 80 | """ 81 | 82 | # other params FUCKING: (for now) 83 | 84 | # ------------------------------------------------------------------------- # 85 | # go to time-freq 86 | # ------------------------------------------------------------------------- # 87 | 88 | if_gram, D = librosa.core.ifgram(y, sr=sr, n_fft=n_fft, 89 | win_length=win_length,hop_length=hop_length) 90 | 91 | # ------------------------------------------------------------------------- # 92 | # find peaks 93 | # ------------------------------------------------------------------------- # 94 | 95 | peak_thresh = 1e-3 96 | 97 | # bins (freq range) to search 98 | min_bin = 2 99 | max_bin = freq_to_bin(float(max_peak), sr, n_fft) 100 | # //--> [make sure we have room either side for peak picking] 101 | 102 | # npartial 103 | if npartial is not None: 104 | harm_rolloff = math.log(2, float(npartial)) 105 | 106 | # things we know 107 | num_bins, num_frames = if_gram.shape 108 | 109 | # store pitches here 110 | pitches = np.zeros([num_frames, num_pitches]) # pitch bins 111 | peaks = np.zeros([num_frames, num_peaks]) # top20 peaks 112 | fundamentals = np.zeros([num_frames, num_pitches]) # funds (least squares) 113 | confidences = np.zeros([num_frames, num_pitches]) # confidence scores 114 | 115 | # loop through frames 116 | for i in range(num_frames): 117 | 118 | # grab frequency, magnitudes, total power 119 | frqs = if_gram[:,i] 120 | mags = np.abs(D[:,i]) 121 | total_power = mags.sum() 122 | max_amp = np.max(mags) 123 | 124 | # neighbor bins 125 | lower = mags[(min_bin-1):(max_bin)] 126 | middle = mags[(min_bin) :(max_bin+1)] 127 | upper = mags[(min_bin+1):(max_bin+2)] 128 | 129 | # all local peaks 130 | peaks_mask_all = (middle > lower) & (middle > upper) 131 | 132 | # resize mask to dimensions of im_gram 133 | zeros_left = np.zeros(min_bin) 134 | zeros_right = np.zeros(num_bins - min_bin - max_bin + 1) 135 | peaks_mask_all = np.concatenate((zeros_left, peaks_mask_all, zeros_right)) 136 | 137 | # find the first (at most) 138 | peaks_mags_all = peaks_mask_all * mags 139 | top20 = np.argsort(peaks_mags_all)[::-1][0:num_peaks] 140 | 141 | # extract just the peaks (freqs and normed mags) 142 | peaks_frqs = frqs[top20] 143 | peaks_mags = mags[top20] 144 | 145 | # note number of peaks found 146 | num_peaks_found = top20.shape[0] 147 | 148 | # 
----------------------------------------------------------------------- # 149 | # maximum liklihood 150 | # ----------------------------------------------------------------------- # 151 | 152 | # range we'll consider for fundamentals 153 | min_freq = float(min_fund) 154 | max_freq = float(max_fund) 155 | 156 | # from min_bin to max_bin in 48ths of an octave (1/4 tones) 157 | 158 | # [1] bin index to frequency min_freq to max_freq in 48ths of an octave 159 | def b2f(index): return min_freq * np.power(np.power(2, 1.0/48.0), index) 160 | 161 | # [2] max_histo_bin is the bin at max_freq (FUCKING: rounds down?) 162 | max_histo_bin = int(math.log(max_freq / min_freq, math.pow(2, 1.0/48.0))) 163 | 164 | # now, generate them 165 | histo = np.fromfunction(lambda x,y: b2f(y), (num_peaks_found, max_histo_bin)) 166 | 167 | frqs_tile = np.tile(peaks_frqs, (max_histo_bin,1)).transpose() 168 | mags_tile = np.tile(peaks_mags, (max_histo_bin,1)).transpose() 169 | 170 | # likelihood function for each bin frequency 171 | def ml_a(amp): 172 | """a factor depending on the amplitude of the ith peak""" 173 | return np.sqrt(np.sqrt(amp/max_amp)) 174 | # return np.sqrt(np.sqrt(amp)) 175 | # return np.ones_like(amp) 176 | # return amp 177 | 178 | def ml_t(r1, r2): 179 | """how closely the ith peak is tuned to a multiple of f""" 180 | max_dist = ml_width # cents 181 | cents = np.abs(ratio_to_cents_protected(r1,r2)) 182 | dist = np.clip(1.0 - (cents / max_dist), 0, 1) 183 | return dist 184 | 185 | def ml_i(nearest_multiple): 186 | """whether the peak is closest to a high or low multiple of f""" 187 | out = np.zeros_like(nearest_multiple) 188 | out[nearest_multiple.nonzero()] = 1/np.power(nearest_multiple[nearest_multiple.nonzero()], harm_rolloff) 189 | return out 190 | # return 1/np.power(np.clip(nearest_multiple + harm_offset, 1, 32767), harm_rolloff) * (nearest_multiple <= max_harm) 191 | # return 1/np.power(nearest_multiple+1, 2) 192 | # return np.ones_like(nearest_multiple) 193 | 194 | ml = (ml_a(mags_tile) * \ 195 | ml_t((frqs_tile/histo), (frqs_tile/histo).round()) * \ 196 | ml_i((frqs_tile/histo).round())).sum(axis=0) 197 | 198 | # for debugging 199 | # ml_hat = (ml_a(mags_tile) * \ 200 | # ml_t((frqs_tile/histo), (frqs_tile/histo).round()) * \ 201 | # ml_i((frqs_tile/histo).round())) 202 | 203 | 204 | """super rough hack but work! 
205 | bring this in as a control 206 | only tests in one direction (new is higher), adjust so ratio goes both ways 207 | what about which harms we're testing for should that be variable too 208 | """ 209 | 210 | num_found = 0 211 | maybe = 0 212 | found_so_far = [] 213 | bounce_list = list(np.ravel(pitches[i-bounce_hist:i])) 214 | # bounce_ratios = [1,2,3,4,5,6] 215 | bounce_ratios = [1.0, 2.0, 3.0/2.0, 3.0, 4.0] if bounce_ratios is None else bounce_ratios 216 | prev_frame = list(pitches[i-1]) if i>0 else [] 217 | indices = ml.argsort()[::-1] 218 | while num_found < num_pitches and maybe <= ml.shape[0]: 219 | this_one = b2f(indices[maybe]) 220 | # check bounce with this frame 221 | bounce1 = any([any([abs(ratio_to_cents( 222 | this_one/(other_one*harm))) < bounce_width 223 | for harm in bounce_ratios]) 224 | for other_one in bounce_list]) 225 | bounce2 = any([any([abs(ratio_to_cents 226 | (other_one/(this_one*harm))) < bounce_width 227 | for harm in bounce_ratios]) 228 | for other_one in bounce_list]) 229 | # if any bounces 230 | bounce = bounce1 or bounce2 231 | if not bounce: 232 | #print 'notbounce' 233 | found_so_far += [this_one] 234 | bounce_list += [this_one] 235 | num_found += 1 236 | # if bounce: print 'bounce!!!!' 237 | maybe += 1 238 | 239 | indices = ml.argsort()[::-1][0:num_pitches] 240 | pitches[i] = np.array(found_so_far) 241 | confidences[i] = ml[indices] 242 | peaks[i] = peaks_frqs 243 | 244 | 245 | # indices = ml.argsort()[::-1][0:num_pitches] 246 | # pitches[i] = b2f(indices) 247 | # confidences[i] = ml[indices] 248 | # peaks[i] = peaks_frqs 249 | 250 | # ----------------------------------------------------------------------- # 251 | # estimate fundamental (least squares) 252 | # ----------------------------------------------------------------------- # 253 | 254 | """estimate fundamental (least squares) 255 | here we solve a least squares approximation to give a more precise 256 | estimate of the fundamental within a histogram bin. 257 | 258 | least squares solution :: WAx ~ Wb 259 | 260 | A = matrix of harmonic integers (num_peaks x 1) 261 | b = matrix of actual frequencies (num_peaks x 1) 262 | W = matrix of weights (on digaonal, num_peaks x num_peaks) 263 | x = fundamental (singletone matrix) 264 | 265 | """ 266 | 267 | ml_peaks = pitches[i] # regions strong ml 268 | width = 25 # count peaks w/in 25 cents 269 | 270 | frame_fundamentals = [] 271 | for bin_frq in ml_peaks: 272 | 273 | # vector of nearest harmonics 274 | nearest_harmonic = (peaks_frqs/bin_frq).round() 275 | 276 | # mask in range? 
vector of bools 277 | mask = np.abs(ratio_to_cents_protected( 278 | (peaks_frqs/bin_frq), (peaks_frqs/bin_frq).round())) <= width 279 | 280 | # weight (same harmonic weight as above) 281 | weights = ml_i( (peaks_frqs/bin_frq).round() ) 282 | 283 | # build matrices 284 | A = np.matrix(nearest_harmonic).T 285 | b = np.matrix(peaks_frqs).T 286 | W = np.matrix(np.diag(mask * weights)) 287 | 288 | # do least squares 289 | fund = np.linalg.lstsq(W*A, W*b)[0][0].item() 290 | 291 | # append 292 | frame_fundamentals += [fund] 293 | 294 | fundamentals[i] = np.array(frame_fundamentals) 295 | 296 | # ------------------------------------------------------------------------- # 297 | # pitch tracks 298 | # ------------------------------------------------------------------------- # 299 | 300 | tracks = np.copy(pitches) 301 | 302 | return fundamentals, pitches, D, peaks, confidences, tracks 303 | 304 | 305 | # --------------------------------------------------------------------------- # 306 | # extras 307 | # --------------------------------------------------------------------------- # 308 | 309 | def voice_tracks(pitches, confidences): 310 | 311 | # alloc output data 312 | out = np.zeros_like(pitches) 313 | out[0] = pitches[0] 314 | 315 | out_confidences = np.zeros_like(confidences) 316 | out_confidences[0] = confidences[0] 317 | 318 | num_frames, num_voices = pitches.shape 319 | 320 | # loop through frames 321 | for i in range(1,num_frames): 322 | 323 | # setup 324 | prev_frame = out[i-1] 325 | next_frame = pitches[i] 326 | 327 | # pairwise distances indexed [from, to] 328 | delta = np.abs(ratio_to_cents(np.atleast_2d(prev_frame).T/np.repeat( 329 | np.atleast_2d(next_frame),num_voices,axis=0))) 330 | # delta = np.abs(ratio_to_cents_protected(np.atleast_2d(prev_frame).T, 331 | # np.repeat(np.atleast_2d(next_frame),num_voices,axis=0))) 332 | 333 | # indices of sorted delta [from, to] 334 | delta_sorted = np.unravel_index(np.argsort(delta.ravel()), delta.shape) 335 | 336 | # step through and reject if either prev or next is already found 337 | num_found = 0; found_prevs = []; found_nexts = []; index = 0 338 | 339 | while num_found < num_voices: 340 | prev_voice,next_voice = delta_sorted[0][index], delta_sorted[1][index] 341 | if prev_voice not in found_prevs and next_voice not in found_nexts: 342 | out[i][prev_voice] = pitches[i][next_voice] 343 | out_confidences[i][prev_voice] = confidences[i][next_voice] 344 | found_prevs += [prev_voice] 345 | found_nexts += [next_voice] 346 | num_found += 1 347 | index += 1 348 | 349 | return out, out_confidences 350 | 351 | # --------------------------------------------------------------------------- # 352 | 353 | """pitchsets. 354 | this is slightly ineffient b/c we filter twice 355 | 1. first for change (vibrato depth) 356 | 2. then for sustain (vibrato length) 357 | 3. whatever is left are ampsets 358 | - it could all be done in one pass, but this makes it easier to swap in 359 | change measures (absolute vs relative grid) and sustain measures 360 | * currently pitchsets are detected by voice but reported by frame to be 361 | consistent with ampsets. 362 | 363 | -> this should change. 364 | """ 365 | 366 | def pitchsets(pitches, win=3): 367 | 368 | # 1. filter for change on an asbolute 1/2-tone grid (depth) 369 | pitches_change = change_filter(pitches) 370 | 371 | # 2. filter for sustain (length) 372 | pitches_sustain = sustain_filter(pitches_change, win=win) 373 | 374 | # 3. 
what is left are pitchsets 375 | pitchsets = np.sort(np.array(list(set(np.argwhere(np.nan_to_num(pitches_sustain))[:,0])))) 376 | 377 | return pitchsets, np.where(np.nan_to_num(pitches_sustain)) 378 | 379 | # --------------------------------------------------------------------------- # 380 | 381 | """filter for change. 382 | * filter for change on grid rounded to nearest 1/2 tone 383 | * non-change pitches are changed to 0's 384 | -> maybe this should be np.nan's instead 385 | """ 386 | 387 | def change_filter(pitches): 388 | 389 | # copy working data 390 | wpitches = np.copy(pitches) 391 | 392 | # alloc output data 393 | out = np.zeros_like(wpitches) 394 | out[out==0] = np.nan 395 | 396 | # convert to midi pitch and round 397 | wpitches = freq_to_midi(wpitches).round() 398 | 399 | # indices of pitch change 400 | indices = np.diff(wpitches, axis=0).nonzero() 401 | 402 | # copy pitch values from original data where change 403 | out[0] = pitches[0] 404 | out[1:][indices] = pitches[1:][indices] 405 | 406 | return out 407 | 408 | # --------------------------------------------------------------------------- # 409 | 410 | """filter for sustain. 411 | * say something here: wish could be numpy... 412 | """ 413 | 414 | def sustain_filter(pitches, win=3): 415 | 416 | # alloc output data 417 | out = np.zeros_like(pitches) 418 | 419 | # loop through frames and pitches 420 | for i in range(pitches.shape[0]-win): 421 | for j in range(pitches.shape[1]): 422 | 423 | # keep pitch if win after are nan 424 | out[i,j] = pitches[i,j] * np.isnan(pitches[i+1:i+win,j]).all() 425 | 426 | # convert 0 to nan 427 | out[out==0] = np.nan 428 | 429 | return out 430 | 431 | # --------------------------------------------------------------------------- # 432 | 433 | """pitch tracks 434 | here we parse the raw frequencies into pitch tracks. currently, we do this 435 | by minimizing the abs of the frame to frame difference, but we could use 436 | any feature, such as distance from the running mean, variance, etc. 437 | todo: --> implement more pitch track options 438 | 439 | """ 440 | 441 | # move through the frames 442 | # have pitch tracks ready to go 443 | # for each num_pitches put in track with nearest frame to frame difference 444 | # note: what if the pitch confidence is low, might want to put track to sleep... 445 | # or if it's too far from anything just for a sec 446 | # or more sophisticated for like minimizing total error... 
447 | # here we do it dumb way from strong to weak closest match (pref to first) 448 | # tracks = np.zeros([num_frames, num_pitches]) # confidence scores 449 | 450 | # tracks[0] = fundamentals[0] 451 | 452 | # tracks[0] = fundamentals[0] 453 | # for i in range(1, num_frames): 454 | # prev_tracks = list(tracks[i-1]) 455 | # new_funds = list(fundamentals[i]) 456 | # for j in range(num_pitches): 457 | # wewant = min(new_funds, key=lambda x: abs(x-prev_tracks[j])) 458 | # index = new_funds.index(wewant) 459 | # new_funds.pop(index) 460 | # tracks[i,j]= wewant 461 | def pitch_onsets(funds, win=2, depth=25): 462 | 463 | onsets = np.zeros_like(funds) 464 | for i in range(funds.shape[0]): 465 | for j in range(funds.shape[1]): 466 | onsets[i,j] = (np.abs(pyfid.ratio_to_cents(funds[i-win:i,:] / funds[i,j])) <= thresh).any(axis=1).all() 467 | 468 | return onsets -------------------------------------------------------------------------------- /code.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.ticker as ticker 5 | import math 6 | import os 7 | import pickle 8 | import models 9 | import csv 10 | 11 | import calculate_features as cf 12 | import models 13 | 14 | np.random.seed(200) 15 | 16 | 17 | def format_time(x, pos=None): 18 | global duration, nframes, k 19 | progress = int(x / float(nframes) * duration * k) 20 | mins, secs = divmod(progress, 60) 21 | hours, mins = divmod(mins, 60) 22 | out = "%d:%02d" % (mins, secs) 23 | if hours > 0: 24 | out = "%d:" % hours 25 | return out 26 | 27 | def format_db(x, pos=None): 28 | if pos == 0: 29 | return "" 30 | global peak 31 | if x == 0: 32 | return "-inf" 33 | 34 | db = 20 * math.log10(abs(x) / float(peak)) 35 | return int(db) 36 | 37 | types = { 38 | 1: np.int8, 39 | 2: np.int16, 40 | 4: np.int32 41 | } 42 | 43 | 44 | def open_wav(path_to_wav, filename): 45 | wav = wave.open(path_to_wav + filename, mode="r") 46 | (nchannels, sampwidth, framerate, nframes, comptype, compname) = wav.getparams() 47 | content = wav.readframes(nframes) 48 | 49 | samples = np.fromstring(content, dtype=types[sampwidth]) 50 | return (nchannels, sampwidth, framerate, nframes, comptype, compname), samples 51 | 52 | 53 | def get_all_files(path_to_wav): 54 | files = os.listdir(path_to_wav) 55 | return files 56 | 57 | def get_emotions(path_to_emotions, filename): 58 | f = open(path_to_emotions + filename, 'r').read() 59 | # print f 60 | # print f.split('\n') 61 | f = np.array(f.split('\n'))#np.append(np.array(['']), np.array(f.split('\n'))) 62 | c = 0 63 | idx = f == '' 64 | idx_n = np.arange(len(f))[idx] 65 | emotion = [] 66 | for i in xrange(len(idx_n) - 2): 67 | g = f[idx_n[i]+1:idx_n[i+1]] 68 | head = g[0] 69 | i0 = head.find(' - ') 70 | start_time = float(head[head.find('[') + 1:head.find(' - ')]) 71 | end_time = float(head[head.find(' - ') + 3:head.find(']')]) 72 | actor_id = head[head.find(filename[:-4]) + len(filename[:-4]) + 1:head.find(filename[:-4]) + len(filename[:-4]) + 5] 73 | emo = head[head.find('\t[') - 3:head.find('\t[')] 74 | vad = head[head.find('\t[') + 1:] 75 | 76 | v = float(vad[1:7]) 77 | a = float(vad[9:15]) 78 | d = float(vad[17:23]) 79 | 80 | emotion.append({'start':start_time, 81 | 'end':end_time, 82 | 'id':filename[:-4] + '_' + actor_id, 83 | 'v':v, 84 | 'a':a, 85 | 'd':d, 86 | 'emotion':emo}) 87 | return emotion 88 | 89 | 90 | def split_wav(wav, emotion): 91 | (nchannels, sampwidth, framerate, nframes, comptype, compname), 
samples = wav 92 | duration = nframes / framerate 93 | 94 | left = samples[0::nchannels] 95 | right = samples[1::nchannels] 96 | 97 | frames = [] 98 | for ie, e in enumerate(emotions): 99 | start = e['start'] 100 | end = e['end'] 101 | 102 | e['right'] = right[int(start * framerate):int(end * framerate)] 103 | e['left'] = left[int(start * framerate):int(end * framerate)] 104 | 105 | frames.append({'left':e['left'], 'right': e['right']}) 106 | return frames 107 | 108 | available_emotions = ['ang', 'exc', 'fru', 'neu', 'sad'] 109 | 110 | def get_transcriptions(path_to_transcriptions, filename): 111 | f = open(path_to_transcriptions + filename, 'r').read() 112 | f = np.array(f.split('\n')) 113 | transcription = {} 114 | 115 | for i in xrange(len(f) - 1): 116 | g = f[i] 117 | i1 = g.find(': ') 118 | i0 = g.find(' [') 119 | ind_id = g[:i0] 120 | ind_ts = g[i1+2:] 121 | transcription[ind_id] = ind_ts 122 | return transcription 123 | 124 | 125 | 126 | data = [] 127 | sessions = ['session1', 'session2', 'session3', 'session4', 'session5'] 128 | for session in sessions: 129 | print session 130 | path_to_wav = 'D:/emotion recognition/' + session + '/dialog/wav/' 131 | path_to_pics = 'D:/emotion recognition/pics/' 132 | path_to_emotions = 'D:/emotion recognition/' + session + '/dialog/EmoEvaluation/' 133 | path_to_transcriptions = 'D:/emotion recognition/' + session + '/dialog/transcriptions/' 134 | 135 | files = get_all_files(path_to_wav) 136 | files = [f[:-4] for f in files] 137 | print len(files) 138 | print files 139 | for f in files: 140 | emotions = get_emotions(path_to_emotions, f + '.txt') 141 | wav = open_wav(path_to_wav, f + '.wav') 142 | sample = split_wav(wav, emotions) 143 | 144 | transcriptions = get_transcriptions(path_to_transcriptions, f + '.txt') 145 | for ie, e in enumerate(emotions): 146 | if e['emotion'] in available_emotions: 147 | e['left'] = sample[ie]['left'] 148 | e['right'] = sample[ie]['right'] 149 | e['transcription'] = transcriptions[e['id']] 150 | data.append(e) 151 | 152 | def save_sample(x, y, name): 153 | with open(name, 'w') as csvfile: 154 | w = csv.writer(csvfile, delimiter=',') 155 | for i in xrange(x.shape[0]): 156 | row = x[i, :].tolist() 157 | row.append(y[i]) 158 | w.writerow(row) 159 | 160 | def load(name): 161 | with open(name, 'r') as csvfile: 162 | r = csv.reader(csvfile, delimiter=',') 163 | x = [] 164 | y = [] 165 | for row in r: 166 | x.append(row[:-1]) 167 | y.append(row[-1]) 168 | return np.array(x, dtype=float), np.array(y) 169 | 170 | def get_field(data, key): 171 | return np.array([e[key] for e in data]) 172 | 173 | 174 | def split_sample(n, train_part=0.8): 175 | ids_idx = np.random.permutation(len(ids)) 176 | step = int(n * (1 - train_part)) 177 | samples = [] 178 | for i in xrange(n / step): 179 | ids_train = ids[ids_idx[:int(train_part*len(ids)) ]] 180 | ids_test = ids[ids_idx[int(train_part*len(ids)):]] 181 | samples.append((ids_train, ids_test)) 182 | return samples 183 | 184 | 185 | def get_features(data, save=True, path='samples/l_', mode='calculate'): 186 | if mode == 'calculate': 187 | for di, d in enumerate(data): 188 | print di, ' out of ', len(data) 189 | st_features = cf.calculate_features(d['left'], None).T 190 | x = [] 191 | y = [] 192 | for f in st_features: 193 | if f[1] > 1.e-4: 194 | x.append(f) 195 | y.append(d['emotion']) 196 | x = np.array(x, dtype=float) 197 | y = np.array(y) 198 | if save: 199 | save_sample(x, y, path + d['id'] + '.csv') 200 | return x, y 201 | 202 | 203 | path_to_probas = 'D:/emotion recognition/probas/' 
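# After a few plotting/helper definitions, the cross-validation loop below implements a
# two-level (stacked) scheme with 5-fold cross-validation: a frame-level MLP
# (models.train_mpc) is trained on the level-1 folds, its per-frame class outputs (called
# 'probs' here) are averaged over level2_parts temporal segments of each utterance to form
# the level-2 feature vector, a random forest (models.train_rfc) is trained on those
# vectors, and the stack is evaluated on the held-out validation ids; a class-to-class
# precision heatmap is saved at the end (level2_precs.png).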
204 | def plot_probas_emotions(probas, true_emo, filename): 205 | colors = ['r', 'g', 'b', 'yellow', 'black', 'magenta'] 206 | plt.figure(figsize=(16, 12)) 207 | plt.title('True emotion: ' + true_emo, fontsize=50) 208 | for j in xrange(probas.shape[1]): 209 | plt.plot(0.2*np.arange(probas.shape[0]), probas[:, j], color=colors[j], linewidth=3) 210 | 211 | plt.legend(available_emotions, fontsize=20) 212 | plt.ylim(ymax=1, ymin=0) 213 | plt.savefig(path_to_probas + filename + '.png') 214 | plt.close() 215 | 216 | from keras.utils import np_utils 217 | def to_categorical(y): 218 | y_cat = np.zeros((len(y), len(available_emotions)), dtype=int) 219 | for i in xrange(len(y)): 220 | y_cat[i, :] = np.array(np.array(available_emotions) == y[i], dtype=int) 221 | 222 | return y_cat 223 | 224 | 225 | def check_all_finite(X): 226 | X = np.asanyarray(X) 227 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 228 | and not np.isfinite(X).all()): 229 | return True 230 | else: 231 | return False 232 | 233 | data = np.array(data) 234 | print data 235 | print len(data) 236 | 237 | x, y = get_features(data) 238 | 239 | ids = get_field(data, 'id') 240 | emotions = get_field(data, 'emotion') 241 | 242 | for i in xrange(len(available_emotions)): 243 | print available_emotions[i], emotions[emotions == available_emotions[i]].shape[0] 244 | 245 | 246 | parts = 5 247 | 248 | 249 | permutation = np.random.permutation(len(data)) 250 | permuted_ids = ids[permutation] 251 | 252 | step = len(data) / parts 253 | 254 | preds = [] 255 | trues = [] 256 | 257 | for part in xrange(parts): 258 | if part >= 0: 259 | print 'Validation part: ', part 260 | i0 = step * part 261 | i1 = step * (part + 1) 262 | 263 | level1_ids = np.append(permuted_ids[:i0], permuted_ids[i1:]) 264 | validation_ids = permuted_ids[i0:i1] 265 | 266 | validation_x = {} 267 | validation_y = [] 268 | 269 | for i in validation_ids: 270 | x, y = load('samples/l_' + i + '.csv') 271 | validation_x[i] = x 272 | validation_y.append(y[0]) 273 | validation_y = np.array(validation_y) 274 | 275 | index = 0 276 | level2_parts = 3 277 | level2_x = np.zeros((len(data) - len(validation_ids), level2_parts * len(available_emotions)), dtype=np.float32) 278 | level2_y = [] 279 | 280 | for j in xrange(parts - 1): 281 | print 'Level 1 part: ', j 282 | j0 = step * j 283 | j1 = step * (j + 1) 284 | 285 | train_ids = np.append(level1_ids[:j0], level1_ids[j1:]) 286 | test_ids = level1_ids[j0:j1] 287 | 288 | train_x = [] 289 | train_y = [] 290 | test_x = {} 291 | test_y = {} 292 | 293 | for i in level1_ids: 294 | x, y = load('samples/l_' + i + '.csv') 295 | if i in train_ids: 296 | for s in xrange(x.shape[0]): 297 | train_x.append(x[s, :]) 298 | train_y.append(y[s]) 299 | if i in test_ids: 300 | test_x[i] = x 301 | test_y[i] = y[0] 302 | 303 | train_x = np.array(train_x, dtype=float) 304 | train_y = np.array(train_y) 305 | 306 | train_y_cat = to_categorical(train_y) 307 | 308 | load_models = False 309 | if load_models: 310 | classifiers = pickle.load(open('model' + str(part) + str(j) + '.pkl', 'r')) 311 | else: 312 | #classifiers = models.train_rfc(train_x, train_y, None) 313 | classifiers = models.train_mpc(train_x, train_y_cat, None) 314 | #pickle.dump(classifiers, open('model' + str(part) + str(j) + '.pkl', 'w')) 315 | 316 | for i in test_ids: 317 | probs = classifiers.predict(test_x[i]) 318 | for m in xrange(level2_parts): 319 | m0 = probs.shape[0] * m / level2_parts 320 | m1 = probs.shape[0] * (m + 1) / level2_parts 321 | 322 | f = np.mean(probs[m0:m1, 
:], axis=0) 323 | for fi, ff in enumerate(f): 324 | if np.isnan(ff): 325 | f[fi] = 1./float(len(f)) 326 | level2_x[index, m * len(f):(m + 1) * len(f)] = f 327 | level2_y.append(test_y[i]) 328 | 329 | print test_x[i] 330 | print probs 331 | print level2_x[index, :] 332 | print level2_y[index] 333 | print 334 | print 335 | index += 1 336 | 337 | #level2_x = level2_x[:len(level2_y), :] 338 | level2_y = np.array(level2_y) 339 | print level2_x.shape, level2_y.shape 340 | print np.sum(level2_x) 341 | print check_all_finite(level2_x), check_all_finite(level2_y) 342 | print np.unique(level2_y) 343 | 344 | # for i in xrange(level2_x.shape[0]): 345 | # print i, level2_x[i, :], level2_y[i] 346 | level2_x = np.array(level2_x, dtype=float) 347 | if load_models: 348 | level2_classifiers = pickle.load(open('level2_model' + str(part) + '.pkl', 'r')) 349 | else: 350 | level2_classifiers = models.train_rfc(level2_x, level2_y, None) 351 | #pickle.dump(level2_classifiers, open('level2_model' + str(part) + '.pkl', 'w')) 352 | 353 | level1_train_x = [] 354 | level1_train_y = [] 355 | 356 | for i in level1_ids: 357 | x, y = load('samples/l_' + i + '.csv') 358 | for s in xrange(x.shape[0]): 359 | level1_train_x.append(x[s, :]) 360 | level1_train_y.append(y[s]) 361 | level1_train_x = np.array(level1_train_x, dtype=float) 362 | level1_train_y = np.array(level1_train_y) 363 | 364 | 365 | level1_train_y_cat = to_categorical(level1_train_y) 366 | if load_models: 367 | level1_classifiers = pickle.load(open('level1_model' + str(part) + '.pkl', 'r')) 368 | else: 369 | level1_classifiers = models.train_mpc(level1_train_x, level1_train_y_cat, None) 370 | #pickle.dump(level1_classifiers, open('level1_model' + str(part) + '.pkl', 'w')) 371 | 372 | 373 | 374 | level2_validation_x = np.zeros((len(validation_ids), level2_parts * len(available_emotions)), dtype=float) 375 | for ii, i in enumerate(validation_ids): 376 | probs = level1_classifiers.predict(validation_x[i]) 377 | for m in xrange(level2_parts): 378 | m0 = probs.shape[0] * m / level2_parts 379 | m1 = probs.shape[0] * (m + 1) / level2_parts 380 | 381 | f = np.mean(probs[m0:m1, :], axis=0) 382 | for fi, ff in enumerate(f): 383 | if np.isnan(ff): 384 | f[fi] = 1./float(len(f)) 385 | level2_validation_x[ii, m * len(f):(m + 1) * len(f)] = f 386 | 387 | 388 | 389 | prediction = level2_classifiers.predict(level2_validation_x) 390 | # print pred_probas 391 | # prediction = [] 392 | # for i in xrange(pred_probas.shape[0]): 393 | # prediction.append(available_emotions[np.argmax(pred_probas[i, :])]) 394 | # prediction = np.array(prediction) 395 | 396 | print prediction[prediction == validation_y].shape[0] / float(prediction.shape[0]) 397 | 398 | for i in xrange(len(prediction)): 399 | preds.append(prediction[i]) 400 | trues.append(validation_y[i]) 401 | 402 | class_to_class_precs = np.zeros((len(available_emotions), len(available_emotions)), dtype=float) 403 | 404 | preds = np.array(preds) 405 | trues = np.array(trues) 406 | 407 | for cpi, cp in enumerate(available_emotions): 408 | for cti, ct in enumerate(available_emotions): 409 | #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 410 | if trues[trues == ct].shape[0] > 0: 411 | class_to_class_precs[cti, cpi] = preds[(preds == cp) * (trues == ct)].shape[0] / float(trues[trues == ct].shape[0]) 412 | else: 413 | class_to_class_precs[cti, cpi] = 0. 
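# NOTE (added, illustrative only): class_to_class_precs is the row-normalised confusion matrix of
# the cross-validation above: each row corresponds to a true emotion, each column to a predicted
# emotion, and every row is divided by the number of true examples of that class, so the diagonal
# holds per-class recall. For example, with hypothetical labels
#   trues = ['ang', 'ang', 'ang', 'sad'] and preds = ['ang', 'neu', 'ang', 'sad'],
# the 'ang' row would contain 2/3 under 'ang', 1/3 under 'neu' and 0 elsewhere.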
414 | 415 | 416 | fig, ax = plt.subplots() 417 | heatmap = ax.pcolor(class_to_class_precs, cmap=plt.cm.Blues) 418 | 419 | ax.set_xticklabels(available_emotions, minor=False) 420 | ax.set_yticklabels(available_emotions, minor=False) 421 | 422 | ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 423 | ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 424 | 425 | for i in xrange(len(available_emotions)): 426 | for j in xrange(len(available_emotions)): 427 | plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 428 | 429 | plt.savefig('level2_precs.png') 430 | plt.close() 431 | 432 | # samples = split_sample(len(ids)) 433 | # precs = np.zeros(len(samples)) 434 | 435 | # class_to_class_precs = np.zeros((len(samples), len(available_emotions), len(available_emotions)), dtype=float) 436 | 437 | # load_models = True 438 | 439 | # level2_parts = 3 440 | # level2_sample = np.zeros((len(data), level2_parts * len(available_emotions)), dtype=float) 441 | # level2_emotions = [] 442 | 443 | 444 | # index = 0 445 | # for it, (ids_train, ids_test) in enumerate(samples): 446 | 447 | # train_x = [] 448 | # test_x = [] 449 | # train_y = [] 450 | # test_y = [] 451 | 452 | # test_x2 = {} 453 | # test_y2 = {} 454 | 455 | # for i in ids: 456 | # x, y = load('samples/l_' + i + '.csv') 457 | # if i in ids_train: 458 | # for j in xrange(x.shape[0]): 459 | # train_x.append(x[j, :]) 460 | # train_y.append(y[j]) 461 | # if i in ids_test: 462 | # test_x2[i] = x 463 | # test_y2[i] = y 464 | # for j in xrange(x.shape[0]): 465 | # test_x.append(x[j, :]) 466 | # test_y.append(y[j]) 467 | 468 | # train_x = np.array(train_x, dtype=float) 469 | # train_y = np.array(train_y) 470 | # test_x = np.array(test_x, dtype=float) 471 | # test_y = np.array(test_y) 472 | 473 | # if load_models: 474 | # classifiers = pickle.load(open('model' + str(it) + '.pkl', 'r')) 475 | # else: 476 | # classifiers = models.train1(train_x, train_y, None) 477 | # pickle.dump(classifiers, open('model' + str(it) + '.pkl', 'w')) 478 | # # classifiers = pickle.load(open('model.pkl', 'r')) 479 | 480 | # for ii, i in enumerate(ids_test): 481 | # print test_x2[i].shape, i 482 | # probas = classifiers.predict_proba(test_x2[i]) 483 | 484 | # plot_probas_emotions(probas, test_y2[i][0], str(i)) 485 | 486 | # k = probas.shape[0] 487 | # l = len(available_emotions) 488 | # for j in xrange(level2_parts): 489 | # i0 = k * j / level2_parts 490 | # i1 = k * (j + 1) / level2_parts 491 | 492 | # f = np.mean(probas[i0:i1, :], axis=0) 493 | # level2_sample[index, j * l:(j + 1) * l] = f 494 | # level2_emotions.append(test_y2[i][0]) 495 | # # print level2_sample[index, :], level2_emotions[index] 496 | # index += 1 497 | 498 | # level2_emotions = np.array(level2_emotions) 499 | 500 | # print len(data) 501 | # print level2_sample.shape 502 | # print level2_emotions.shape 503 | 504 | # with open('level2_sample.csv', 'w') as csvfile: 505 | # w = csv.writer(csvfile, delimiter=',') 506 | # for i in xrange(level2_sample.shape[0]): 507 | # d = level2_sample[i, :].tolist() 508 | # d.append(level2_emotions[i]) 509 | # w.writerow(d) 510 | 511 | # #=============================================================================== 512 | # emo_test = [] 513 | # emo_pred = [] 514 | # j = 0 515 | # for ii, i in enumerate(ids_test): 516 | # try: 517 | # predictions_frames = classifiers.predict(test_x2[i]) 518 | # print classifiers.predict_proba(test_x2[i]).shape 519 | # print classifiers.predict_proba(test_x2[i]) 520 | # ans = 
np.zeros(len(available_emotions)) 521 | # for ie, e in enumerate(available_emotions): 522 | # ans[ie] = predictions_frames[predictions_frames == e].shape[0] 523 | 524 | # emo = available_emotions[np.argmax(ans)] 525 | # emo_test.append(test_y2[i][0]) 526 | # emo_pred.append(emo) 527 | # except: 528 | # j += 1 529 | # print ii, j 530 | # emo_test = np.array(emo_test) 531 | # emo_pred = np.array(emo_pred) 532 | 533 | # print emo_pred.shape, emo_test.shape 534 | 535 | # print emo_test[emo_test == emo_pred].shape[0] / float(emo_test.shape[0]) 536 | # precs[it] = emo_test[emo_test == emo_pred].shape[0] / float(emo_test.shape[0]) 537 | 538 | # for cpi, cp in enumerate(available_emotions): 539 | # for cti, ct in enumerate(available_emotions): 540 | # #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 541 | # if emo_test[emo_test == ct].shape[0] > 0: 542 | # class_to_class_precs[it, cti, cpi] = emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0] / float(emo_test[emo_test == ct].shape[0]) 543 | # else: 544 | # class_to_class_precs[it, cti, cpi] = 0. 545 | 546 | # print precs 547 | # print np.mean(precs) 548 | 549 | # class_to_class_precs = np.mean(class_to_class_precs, axis=0) 550 | 551 | # fig, ax = plt.subplots() 552 | # heatmap = ax.pcolor(class_to_class_precs, cmap=plt.cm.Blues) 553 | 554 | # ax.set_xticklabels(available_emotions, minor=False) 555 | # ax.set_yticklabels(available_emotions, minor=False) 556 | 557 | # ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 558 | # ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 559 | 560 | # for i in xrange(len(available_emotions)): 561 | # for j in xrange(len(available_emotions)): 562 | # plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 563 | 564 | # plt.savefig('precs.png') 565 | # plt.close() 566 | 567 | # with open('precs.csv', 'w') as csvfile: 568 | # w = csv.writer(csvfile, delimiter=',') 569 | # d = [''] 570 | # for j in xrange(len(available_emotions)): 571 | # d.append(available_emotions[j]) 572 | 573 | 574 | # for i in xrange(len(available_emotions)): 575 | # d = [available_emotions[i]] 576 | # for j in xrange(len(available_emotions)): 577 | # d.append(class_to_class_precs[i, j]) 578 | # w.writerow(d) 579 | 580 | # #============================================================================== 581 | 582 | 583 | 584 | # ans, ans_m = models.validate1(classifiers, test_x, test_y) 585 | # print ans_m 586 | 587 | # with open('ans.csv', 'w') as csvfile: 588 | # w = csv.writer(csvfile, delimiter='\t') 589 | # for r in xrange(ans.shape[0]): 590 | # print available_emotions[r], ans_m[r, :] 591 | # w.writerow(ans_m[r, :]) 592 | 593 | 594 | 595 | 596 | 597 | # save(x, y, 'data.csv') 598 | 599 | # print np.unique(y) 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | -------------------------------------------------------------------------------- /cf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import os 4 | import glob 5 | import numpy 6 | #import mlpy 7 | import cPickle 8 | import aifc 9 | import math 10 | from numpy import NaN, Inf, arange, isscalar, array 11 | from scipy.fftpack import rfft 12 | from scipy.fftpack import fft 13 | from scipy.fftpack.realtransforms import dct 14 | from scipy.signal import fftconvolve 15 | from scipy import linalg as la 16 | #import audioTrainTest as aT 17 | #import audioBasicIO 18 | #import utilities 19 
| from scipy.signal import lfilter, hamming 20 | use_pitch = False 21 | 22 | if use_pitch: 23 | import pitch 24 | #from scikits.talkbox import lpc 25 | 26 | eps = 0.00000001 27 | 28 | """ Time-domain audio features """ 29 | 30 | 31 | def stZCR(frame): 32 | """Computes zero crossing rate of frame""" 33 | count = len(frame) 34 | countZ = numpy.sum(numpy.abs(numpy.diff(numpy.sign(frame)))) / 2 35 | return (numpy.float64(countZ) / numpy.float64(count-1.0)) 36 | 37 | 38 | def stEnergy(frame): 39 | """Computes signal energy of frame""" 40 | return numpy.sum(frame ** 2) / numpy.float64(len(frame)) 41 | 42 | 43 | def stEnergyEntropy(frame, numOfShortBlocks=10): 44 | """Computes entropy of energy""" 45 | Eol = numpy.sum(frame ** 2) # total frame energy 46 | L = len(frame) 47 | subWinLength = int(numpy.floor(L / numOfShortBlocks)) 48 | if L != subWinLength * numOfShortBlocks: 49 | frame = frame[0:subWinLength * numOfShortBlocks] 50 | # subWindows is of size [numOfShortBlocks x L] 51 | subWindows = frame.reshape(subWinLength, numOfShortBlocks, order='F').copy() 52 | 53 | # Compute normalized sub-frame energies: 54 | s = numpy.sum(subWindows ** 2, axis=0) / (Eol + eps) 55 | 56 | # Compute entropy of the normalized sub-frame energies: 57 | Entropy = -numpy.sum(s * numpy.log2(s + eps)) 58 | return Entropy 59 | 60 | 61 | """ Frequency-domain audio features """ 62 | 63 | 64 | def stSpectralCentroidAndSpread(X, fs): 65 | """Computes spectral centroid of frame (given abs(FFT))""" 66 | ind = (numpy.arange(1, len(X) + 1)) * (fs/(2.0 * len(X))) 67 | 68 | Xt = X.copy() 69 | Xt = Xt / Xt.max() 70 | NUM = numpy.sum(ind * Xt) 71 | DEN = numpy.sum(Xt) + eps 72 | 73 | # Centroid: 74 | C = (NUM / DEN) 75 | 76 | # Spread: 77 | S = numpy.sqrt(numpy.sum(((ind - C) ** 2) * Xt) / DEN) 78 | 79 | # Normalize: 80 | C = C / (fs / 2.0) 81 | S = S / (fs / 2.0) 82 | 83 | return (C, S) 84 | 85 | 86 | def stSpectralEntropy(X, numOfShortBlocks=10): 87 | """Computes the spectral entropy""" 88 | L = len(X) # number of frame samples 89 | Eol = numpy.sum(X ** 2) # total spectral energy 90 | 91 | subWinLength = int(numpy.floor(L / numOfShortBlocks)) # length of sub-frame 92 | if L != subWinLength * numOfShortBlocks: 93 | X = X[0:subWinLength * numOfShortBlocks] 94 | 95 | subWindows = X.reshape(subWinLength, numOfShortBlocks, order='F').copy() # define sub-frames (using matrix reshape) 96 | s = numpy.sum(subWindows ** 2, axis=0) / (Eol + eps) # compute spectral sub-energies 97 | En = -numpy.sum(s*numpy.log2(s + eps)) # compute spectral entropy 98 | 99 | return En 100 | 101 | 102 | def stSpectralFlux(X, Xprev): 103 | """ 104 | Computes the spectral flux feature of the current frame 105 | ARGUMENTS: 106 | X: the abs(fft) of the current frame 107 | Xpre: the abs(fft) of the previous frame 108 | """ 109 | # compute the spectral flux as the sum of square distances: 110 | sumX = numpy.sum(X + eps) 111 | sumPrevX = numpy.sum(Xprev + eps) 112 | F = numpy.sum((X / sumX - Xprev/sumPrevX) ** 2) 113 | 114 | return F 115 | 116 | 117 | def stSpectralRollOff(X, c, fs): 118 | """Computes spectral roll-off""" 119 | totalEnergy = numpy.sum(X ** 2) 120 | fftLength = len(X) 121 | Thres = c*totalEnergy 122 | # Ffind the spectral rolloff as the frequency position where the respective spectral energy is equal to c*totalEnergy 123 | CumSum = numpy.cumsum(X ** 2) + eps 124 | [a, ] = numpy.nonzero(CumSum > Thres) 125 | if len(a) > 0: 126 | mC = numpy.float64(a[0]) / (float(fftLength)) 127 | else: 128 | mC = 0.0 129 | return (mC) 130 | 131 | 132 | def 
stHarmonic(frame, fs): 133 | """ 134 | Computes harmonic ratio and pitch 135 | """ 136 | M = numpy.round(0.016 * fs) - 1 137 | R = numpy.correlate(frame, frame, mode='full') 138 | 139 | g = R[len(frame)-1] 140 | R = R[len(frame):-1] 141 | 142 | # estimate m0 (as the first zero crossing of R) 143 | [a, ] = numpy.nonzero(numpy.diff(numpy.sign(R))) 144 | 145 | if len(a) == 0: 146 | m0 = len(R)-1 147 | else: 148 | m0 = a[0] 149 | if M > len(R): 150 | M = len(R) - 1 151 | 152 | Gamma = numpy.zeros((M), dtype=numpy.float64) 153 | CSum = numpy.cumsum(frame ** 2) 154 | Gamma[m0:M] = R[m0:M] / (numpy.sqrt((g * CSum[M:m0:-1])) + eps) 155 | 156 | ZCR = stZCR(Gamma) 157 | 158 | if ZCR > 0.15: 159 | HR = 0.0 160 | f0 = 0.0 161 | else: 162 | if len(Gamma) == 0: 163 | HR = 1.0 164 | blag = 0.0 165 | Gamma = numpy.zeros((M), dtype=numpy.float64) 166 | else: 167 | HR = numpy.max(Gamma) 168 | blag = numpy.argmax(Gamma) 169 | 170 | # Get fundamental frequency: 171 | f0 = fs / (blag + eps) 172 | if f0 > 5000: 173 | f0 = 0.0 174 | if HR < 0.1: 175 | f0 = 0.0 176 | 177 | return (HR, f0) 178 | 179 | 180 | def mfccInitFilterBanks(fs, nfft): 181 | """ 182 | Computes the triangular filterbank for MFCC computation (used in the stFeatureExtraction function before the stMFCC function call) 183 | This function is taken from the scikits.talkbox library (MIT Licence): 184 | https://pypi.python.org/pypi/scikits.talkbox 185 | """ 186 | 187 | # filter bank params: 188 | lowfreq = 133.33 189 | linsc = 200/3. 190 | logsc = 1.0711703 191 | numLinFiltTotal = 13 192 | numLogFilt = 27 193 | 194 | if fs < 8000: 195 | nlogfil = 5 196 | 197 | # Total number of filters 198 | nFiltTotal = numLinFiltTotal + numLogFilt 199 | 200 | # Compute frequency points of the triangle: 201 | freqs = numpy.zeros(nFiltTotal+2) 202 | freqs[:numLinFiltTotal] = lowfreq + numpy.arange(numLinFiltTotal) * linsc 203 | freqs[numLinFiltTotal:] = freqs[numLinFiltTotal-1] * logsc ** numpy.arange(1, numLogFilt + 3) 204 | heights = 2./(freqs[2:] - freqs[0:-2]) 205 | 206 | # Compute filterbank coeff (in fft domain, in bins) 207 | fbank = numpy.zeros((nFiltTotal, nfft)) 208 | nfreqs = numpy.arange(nfft) / (1. 
* nfft) * fs 209 | 210 | for i in range(nFiltTotal): 211 | lowTrFreq = freqs[i] 212 | cenTrFreq = freqs[i+1] 213 | highTrFreq = freqs[i+2] 214 | 215 | lid = numpy.arange(numpy.floor(lowTrFreq * nfft / fs) + 1, numpy.floor(cenTrFreq * nfft / fs) + 1, dtype=numpy.int) 216 | lslope = heights[i] / (cenTrFreq - lowTrFreq) 217 | rid = numpy.arange(numpy.floor(cenTrFreq * nfft / fs) + 1, numpy.floor(highTrFreq * nfft / fs) + 1, dtype=numpy.int) 218 | rslope = heights[i] / (highTrFreq - cenTrFreq) 219 | fbank[i][lid] = lslope * (nfreqs[lid] - lowTrFreq) 220 | fbank[i][rid] = rslope * (highTrFreq - nfreqs[rid]) 221 | 222 | return fbank, freqs 223 | 224 | 225 | def stMFCC(X, fbank, nceps): 226 | """ 227 | Computes the MFCCs of a frame, given the fft mag 228 | ARGUMENTS: 229 | X: fft magnitude abs(FFT) 230 | fbank: filter bank (see mfccInitFilterBanks) 231 | RETURN 232 | ceps: MFCCs (13 element vector) 233 | Note: MFCC calculation is, in general, taken from the scikits.talkbox library (MIT Licence), 234 | # with a small number of modifications to make it more compact and suitable for the pyAudioAnalysis Lib 235 | """ 236 | 237 | mspec = numpy.log10(numpy.dot(X, fbank.T)+eps) 238 | ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:nceps] 239 | return ceps 240 | 241 | 242 | def stChromaFeaturesInit(nfft, fs): 243 | """ 244 | This function initializes the chroma matrices used in the calculation of the chroma features 245 | """ 246 | freqs = numpy.array([((f + 1) * fs) / (2 * nfft) for f in range(nfft)]) 247 | Cp = 27.50 248 | 249 | nChroma = numpy.round(12.0 * numpy.log2(freqs / Cp)).astype(int) 250 | 251 | nFreqsPerChroma = numpy.zeros((nChroma.shape[0], )) 252 | 253 | uChroma = numpy.unique(nChroma) 254 | for u in uChroma: 255 | idx = numpy.nonzero(nChroma == u) 256 | nFreqsPerChroma[idx] = idx[0].shape 257 | return nChroma, nFreqsPerChroma 258 | 259 | 260 | def stChromaFeatures(X, fs, nChroma, nFreqsPerChroma): 261 | #TODO: 1 complexity 262 | #TODO: 2 bug with large windows 263 | 264 | chromaNames = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#'] 265 | spec = X**2 266 | C = numpy.zeros((nChroma.shape[0],)) 267 | C[nChroma] = spec 268 | C /= nFreqsPerChroma[nChroma] 269 | finalC = numpy.zeros((12, 1)) 270 | newD = int(numpy.ceil(C.shape[0] / 12.0) * 12) 271 | C2 = numpy.zeros((newD, )) 272 | C2[0:C.shape[0]] = C 273 | C2 = C2.reshape(C2.shape[0]/12, 12) 274 | #for i in range(12): 275 | # finalC[i] = numpy.sum(C[i:C.shape[0]:12]) 276 | finalC = numpy.matrix(numpy.sum(C2, axis=0)).T 277 | finalC /= spec.sum() 278 | 279 | # ax = plt.gca() 280 | # plt.hold(False) 281 | # plt.plot(finalC) 282 | # ax.set_xticks(range(len(chromaNames))) 283 | # ax.set_xticklabels(chromaNames) 284 | # xaxis = numpy.arange(0, 0.02, 0.01); 285 | # ax.set_yticks(range(len(xaxis))) 286 | # ax.set_yticklabels(xaxis) 287 | # plt.show(block=False) 288 | # plt.draw() 289 | 290 | return chromaNames, finalC 291 | 292 | 293 | def stChromagram(signal, Fs, Win, Step, PLOT=False): 294 | """ 295 | Short-term FFT mag for spectogram estimation: 296 | Returns: 297 | a numpy array (nFFT x numOfShortTermWindows) 298 | ARGUMENTS: 299 | signal: the input signal samples 300 | Fs: the sampling freq (in Hz) 301 | Win: the short-term window size (in samples) 302 | Step: the short-term window step (in samples) 303 | PLOT: flag, 1 if results are to be ploted 304 | RETURNS: 305 | """ 306 | Win = int(Win) 307 | Step = int(Step) 308 | signal = numpy.double(signal) 309 | signal = signal / (2.0 ** 15) 310 | DC = signal.mean() 311 | MAX = 
(numpy.abs(signal)).max() 312 | signal = (signal - DC) / (MAX - DC) 313 | 314 | N = len(signal) # total number of signals 315 | curPos = 0 316 | countFrames = 0 317 | nfft = int(Win / 2) 318 | nChroma, nFreqsPerChroma = stChromaFeaturesInit(nfft, Fs) 319 | chromaGram = numpy.array([], dtype=numpy.float64) 320 | 321 | while (curPos + Win - 1 < N): 322 | countFrames += 1 323 | x = signal[curPos:curPos + Win] 324 | curPos = curPos + Step 325 | X = abs(fft(x)) 326 | X = X[0:nfft] 327 | X = X / len(X) 328 | chromaNames, C = stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma) 329 | C = C[:, 0] 330 | if countFrames == 1: 331 | chromaGram = C.T 332 | else: 333 | chromaGram = numpy.vstack((chromaGram, C.T)) 334 | FreqAxis = chromaNames 335 | TimeAxis = [(t * Step) / Fs for t in range(chromaGram.shape[0])] 336 | 337 | if (PLOT): 338 | fig, ax = plt.subplots() 339 | chromaGramToPlot = chromaGram.transpose()[::-1, :] 340 | Ratio = chromaGramToPlot.shape[1] / (3*chromaGramToPlot.shape[0]) 341 | chromaGramToPlot = numpy.repeat(chromaGramToPlot, Ratio, axis=0) 342 | imgplot = plt.imshow(chromaGramToPlot) 343 | Fstep = int(nfft / 5.0) 344 | # FreqTicks = range(0, int(nfft) + Fstep, Fstep) 345 | # FreqTicksLabels = [str(Fs/2-int((f*Fs) / (2*nfft))) for f in FreqTicks] 346 | ax.set_yticks(range(Ratio / 2, len(FreqAxis) * Ratio, Ratio)) 347 | ax.set_yticklabels(FreqAxis[::-1]) 348 | TStep = countFrames / 3 349 | TimeTicks = range(0, countFrames, TStep) 350 | TimeTicksLabels = ['%.2f' % (float(t * Step) / Fs) for t in TimeTicks] 351 | ax.set_xticks(TimeTicks) 352 | ax.set_xticklabels(TimeTicksLabels) 353 | ax.set_xlabel('time (secs)') 354 | imgplot.set_cmap('jet') 355 | plt.colorbar() 356 | plt.show() 357 | 358 | return (chromaGram, TimeAxis, FreqAxis) 359 | 360 | 361 | def phormants(x, Fs): 362 | N = len(x) 363 | w = numpy.hamming(N) 364 | 365 | # Apply window and high pass filter. 366 | x1 = x * w 367 | x1 = lfilter([1], [1., 0.63], x1) 368 | 369 | # Get LPC. 370 | ncoeff = 2 + Fs / 1000 371 | A, e, k = lpc(x1, ncoeff) 372 | #A, e, k = lpc(x1, 8) 373 | 374 | # Get roots. 375 | rts = numpy.roots(A) 376 | rts = [r for r in rts if numpy.imag(r) >= 0] 377 | 378 | # Get angles. 379 | angz = numpy.arctan2(numpy.imag(rts), numpy.real(rts)) 380 | 381 | # Get frequencies. 382 | frqs = sorted(angz * (Fs / (2 * math.pi))) 383 | 384 | return frqs 385 | def beatExtraction(stFeatures, winSize, PLOT=False): 386 | """ 387 | This function extracts an estimate of the beat rate for a musical signal. 
388 | ARGUMENTS: 389 | - stFeatures: a numpy array (numOfFeatures x numOfShortTermWindows) 390 | - winSize: window size in seconds 391 | RETURNS: 392 | - BPM: estimates of beats per minute 393 | - Ratio: a confidence measure 394 | """ 395 | 396 | # Features that are related to the beat tracking task: 397 | toWatch = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] 398 | 399 | maxBeatTime = int(round(2.0 / winSize)) 400 | HistAll = numpy.zeros((maxBeatTime,)) 401 | for ii, i in enumerate(toWatch): # for each feature 402 | DifThres = 2.0 * (numpy.abs(stFeatures[i, 0:-1] - stFeatures[i, 1::])).mean() # dif threshold (3 x Mean of Difs) 403 | [pos1, _] = utilities.peakdet(stFeatures[i, :], DifThres) # detect local maxima 404 | posDifs = [] # compute histograms of local maxima changes 405 | for j in range(len(pos1)-1): 406 | posDifs.append(pos1[j+1]-pos1[j]) 407 | [HistTimes, HistEdges] = numpy.histogram(posDifs, numpy.arange(0.5, maxBeatTime + 1.5)) 408 | HistCenters = (HistEdges[0:-1] + HistEdges[1::]) / 2.0 409 | HistTimes = HistTimes.astype(float) / stFeatures.shape[1] 410 | HistAll += HistTimes 411 | if PLOT: 412 | plt.subplot(9, 2, ii + 1) 413 | plt.plot(stFeatures[i, :], 'k') 414 | for k in pos1: 415 | plt.plot(k, stFeatures[i, k], 'k*') 416 | f1 = plt.gca() 417 | f1.axes.get_xaxis().set_ticks([]) 418 | f1.axes.get_yaxis().set_ticks([]) 419 | 420 | if PLOT: 421 | plt.show(block=False) 422 | plt.figure() 423 | 424 | # Get beat as the argmax of the agregated histogram: 425 | I = numpy.argmax(HistAll) 426 | BPMs = 60 / (HistCenters * winSize) 427 | BPM = BPMs[I] 428 | # ... and the beat ratio: 429 | Ratio = HistAll[I] / HistAll.sum() 430 | 431 | if PLOT: 432 | # filter out >500 beats from plotting: 433 | HistAll = HistAll[BPMs < 500] 434 | BPMs = BPMs[BPMs < 500] 435 | 436 | plt.plot(BPMs, HistAll, 'k') 437 | plt.xlabel('Beats per minute') 438 | plt.ylabel('Freq Count') 439 | plt.show(block=True) 440 | 441 | return BPM, Ratio 442 | 443 | 444 | def stSpectogram(signal, Fs, Win, Step, PLOT=False): 445 | """ 446 | Short-term FFT mag for spectogram estimation: 447 | Returns: 448 | a numpy array (nFFT x numOfShortTermWindows) 449 | ARGUMENTS: 450 | signal: the input signal samples 451 | Fs: the sampling freq (in Hz) 452 | Win: the short-term window size (in samples) 453 | Step: the short-term window step (in samples) 454 | PLOT: flag, 1 if results are to be ploted 455 | RETURNS: 456 | """ 457 | Win = int(Win) 458 | Step = int(Step) 459 | signal = numpy.double(signal) 460 | signal = signal / (2.0 ** 15) 461 | DC = signal.mean() 462 | MAX = (numpy.abs(signal)).max() 463 | signal = (signal - DC) / (MAX - DC) 464 | 465 | N = len(signal) # total number of signals 466 | curPos = 0 467 | countFrames = 0 468 | nfft = int(Win / 2) 469 | specgram = numpy.array([], dtype=numpy.float64) 470 | 471 | while (curPos + Win - 1 < N): 472 | countFrames += 1 473 | x = signal[curPos:curPos+Win] 474 | curPos = curPos + Step 475 | X = abs(fft(x)) 476 | X = X[0:nfft] 477 | X = X / len(X) 478 | 479 | if countFrames == 1: 480 | specgram = X ** 2 481 | else: 482 | specgram = numpy.vstack((specgram, X)) 483 | 484 | FreqAxis = [((f + 1) * Fs) / (2 * nfft) for f in range(specgram.shape[1])] 485 | TimeAxis = [(t * Step) / Fs for t in range(specgram.shape[0])] 486 | 487 | if (PLOT): 488 | fig, ax = plt.subplots() 489 | imgplot = plt.imshow(specgram.transpose()[::-1, :]) 490 | Fstep = int(nfft / 5.0) 491 | FreqTicks = range(0, int(nfft) + Fstep, Fstep) 492 | FreqTicksLabels = [str(Fs / 2 - int((f * Fs) / (2 * 
nfft))) for f in FreqTicks] 493 | ax.set_yticks(FreqTicks) 494 | ax.set_yticklabels(FreqTicksLabels) 495 | TStep = countFrames/3 496 | TimeTicks = range(0, countFrames, TStep) 497 | TimeTicksLabels = ['%.2f' % (float(t * Step) / Fs) for t in TimeTicks] 498 | ax.set_xticks(TimeTicks) 499 | ax.set_xticklabels(TimeTicksLabels) 500 | ax.set_xlabel('time (secs)') 501 | ax.set_ylabel('freq (Hz)') 502 | imgplot.set_cmap('jet') 503 | plt.colorbar() 504 | plt.show() 505 | 506 | return (specgram, TimeAxis, FreqAxis) 507 | 508 | 509 | """ Windowing and feature extraction """ 510 | 511 | 512 | def stFeatureExtraction(signal, Fs, Win, Step): 513 | """ 514 | This function implements the shor-term windowing process. For each short-term window a set of features is extracted. 515 | This results to a sequence of feature vectors, stored in a numpy matrix. 516 | ARGUMENTS 517 | signal: the input signal samples 518 | Fs: the sampling freq (in Hz) 519 | Win: the short-term window size (in samples) 520 | Step: the short-term window step (in samples) 521 | RETURNS 522 | stFeatures: a numpy array (numOfFeatures x numOfShortTermWindows) 523 | """ 524 | 525 | Win = int(Win) 526 | Step = int(Step) 527 | 528 | # Signal normalization 529 | signal = numpy.double(signal) 530 | 531 | signal = signal / (2.0 ** 15) 532 | DC = signal.mean() 533 | MAX = (numpy.abs(signal)).max() 534 | signal = (signal - DC) / MAX 535 | 536 | N = len(signal) # total number of samples 537 | curPos = 0 538 | countFrames = 0 539 | nFFT = Win / 2 540 | 541 | [fbank, freqs] = mfccInitFilterBanks(Fs, nFFT) # compute the triangular filter banks used in the mfcc calculation 542 | nChroma, nFreqsPerChroma = stChromaFeaturesInit(nFFT, Fs) 543 | 544 | 545 | numOfPitches = 5 546 | numOfPeaks = 10 547 | numOfTimeSpectralFeatures = 8 548 | numOfHarmonicFeatures = 0 549 | nceps = 13 550 | numOfChromaFeatures = 13 551 | if use_pitch: 552 | totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures + numOfChromaFeatures + numOfPeaks + numOfPitches 553 | else: 554 | totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures + numOfChromaFeatures 555 | # totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures 556 | stFeatures = numpy.array([], dtype=numpy.float64) 557 | 558 | while (curPos + Win - 1 < N): # for each short-term window until the end of signal 559 | countFrames += 1 560 | x = signal[curPos:curPos+Win] # get current window 561 | if use_pitch: 562 | p = pitch.ppitch(x, sr=Fs, num_pitches=numOfPitches, num_peaks=numOfPeaks, win_length=Win, hop_length=Win*2) 563 | pitches = p[1][0:1, :].T * 1.e-3 564 | peaks = p[3][0:1, :].T * 1.e-3 565 | curPos = curPos + Step # update window position 566 | X = abs(fft(x)) # get fft magnitude 567 | X = X[0:nFFT] # normalize fft 568 | X = X / len(X) 569 | if countFrames == 1: 570 | Xprev = X.copy() # keep previous fft mag (used in spectral flux) 571 | curFV = numpy.zeros((totalNumOfFeatures, 1)) 572 | curFV[0] = stZCR(x) # zero crossing rate 573 | curFV[1] = stEnergy(x) # short-term energy 574 | curFV[2] = stEnergyEntropy(x) # short-term entropy of energy 575 | [curFV[3], curFV[4]] = stSpectralCentroidAndSpread(X, Fs) # spectral centroid and spread 576 | curFV[5] = stSpectralEntropy(X) # spectral entropy 577 | curFV[6] = stSpectralFlux(X, Xprev) # spectral flux 578 | curFV[7] = stSpectralRollOff(X, 0.90, Fs) # spectral rolloff 579 | curFV[numOfTimeSpectralFeatures:numOfTimeSpectralFeatures+nceps, 0] = stMFCC(X, fbank, nceps).copy() # MFCCs 580 | 581 | 
chromaNames, chromaF = stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma) 582 | curFV[numOfTimeSpectralFeatures + nceps: numOfTimeSpectralFeatures + nceps + numOfChromaFeatures - 1] = chromaF 583 | numOfCFFeatures = numOfTimeSpectralFeatures + nceps + numOfChromaFeatures 584 | 585 | curFV[numOfCFFeatures-1] = chromaF.std() 586 | if use_pitch: 587 | curFV[numOfCFFeatures:numOfCFFeatures + numOfPeaks] = peaks 588 | curFV[numOfCFFeatures + numOfPeaks:numOfCFFeatures + numOfPeaks + numOfPitches] = pitches 589 | # curFV[numOfTimeSpectralFeatures+nceps+numOfChromaFeatures-1] = numpy.nonzero( chromaF > 2.0 * chromaF.mean() )[0].shape[0] 590 | # temp = numpy.sort(chromaF[:,0]) 591 | # curFV[numOfTimeSpectralFeatures+numOfChromaFeatures] = temp[-1] / numpy.mean(temp[0:5]) 592 | # temp = numpy.sort(chromaF[:,0]) 593 | # if countFrames==10 or countFrames==30: 594 | # A = int(temp[-1] / numpy.mean(temp[0:5]))/10 595 | # for a in range(A): 596 | # print("|"), 597 | # print 598 | # if countFrames==20: 599 | # print numpy.nonzero(chromaF > 5*chromaF.mean())[0].shape[0] 600 | #HR, curFV[numOfTimeSpectralFeatures+nceps] = stHarmonic(x, Fs) 601 | # curFV[numOfTimeSpectralFeatures+nceps+1] = freq_from_autocorr(x, Fs) 602 | if countFrames == 1: 603 | stFeatures = curFV # initialize feature matrix (if first frame) 604 | else: 605 | stFeatures = numpy.concatenate((stFeatures, curFV), 1) # update feature matrix 606 | Xprev = X.copy() 607 | 608 | return numpy.array(stFeatures) 609 | 610 | 611 | def mtFeatureExtraction(signal, Fs, mtWin, mtStep, stWin, stStep): 612 | """ 613 | Mid-term feature extraction 614 | """ 615 | 616 | mtWinRatio = int(round(mtWin / stStep)) 617 | mtStepRatio = int(round(mtStep / stStep)) 618 | 619 | mtFeatures = [] 620 | 621 | stFeatures = stFeatureExtraction(signal, Fs, stWin, stStep) 622 | numOfFeatures = len(stFeatures) 623 | numOfStatistics = 2 624 | 625 | mtFeatures = [] 626 | #for i in range(numOfStatistics * numOfFeatures + 1): 627 | for i in range(numOfStatistics * numOfFeatures): 628 | mtFeatures.append([]) 629 | 630 | for i in range(numOfFeatures): # for each of the short-term features: 631 | curPos = 0 632 | N = len(stFeatures[i]) 633 | while (curPos < N): 634 | N1 = curPos 635 | N2 = curPos + mtWinRatio 636 | if N2 > N: 637 | N2 = N 638 | curStFeatures = stFeatures[i][N1:N2] 639 | 640 | mtFeatures[i].append(numpy.mean(curStFeatures)) 641 | mtFeatures[i+numOfFeatures].append(numpy.std(curStFeatures)) 642 | #mtFeatures[i+2*numOfFeatures].append(numpy.std(curStFeatures) / (numpy.mean(curStFeatures)+0.00000010)) 643 | curPos += mtStepRatio 644 | 645 | return numpy.array(mtFeatures), stFeatures 646 | 647 | 648 | # TODO 649 | def stFeatureSpeed(signal, Fs, Win, Step): 650 | 651 | signal = numpy.double(signal) 652 | signal = signal / (2.0 ** 15) 653 | DC = signal.mean() 654 | MAX = (numpy.abs(signal)).max() 655 | signal = (signal - DC) / MAX 656 | # print (numpy.abs(signal)).max() 657 | 658 | N = len(signal) # total number of signals 659 | curPos = 0 660 | countFrames = 0 661 | 662 | lowfreq = 133.33 663 | linsc = 200/3. 
664 |     logsc = 1.0711703
665 |     nlinfil = 13
666 |     nlogfil = 27
667 |     nceps = 13
668 |     nfil = nlinfil + nlogfil
669 |     nfft = Win / 2
670 |     if Fs < 8000:
671 |         nlogfil = 5
672 |         nfil = nlinfil + nlogfil
673 |         nfft = Win / 2
674 | 
675 |     # compute filter banks for mfcc (the mfccInitFilterBanks defined above only takes fs and nfft):
676 |     [fbank, freqs] = mfccInitFilterBanks(Fs, nfft)
677 | 
678 |     numOfTimeSpectralFeatures = 8
679 |     numOfHarmonicFeatures = 1
680 |     totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures
681 |     #stFeatures = numpy.array([], dtype=numpy.float64)
682 |     stFeatures = []
683 | 
684 |     while (curPos + Win - 1 < N):
685 |         countFrames += 1
686 |         x = signal[curPos:curPos + Win]
687 |         curPos = curPos + Step
688 |         X = abs(fft(x))
689 |         X = X[0:nfft]
690 |         X = X / len(X)
691 |         Ex = 0.0
692 |         El = 0.0
693 |         X[0:4] = 0
694 |         # M = numpy.round(0.016 * fs) - 1
695 |         # R = numpy.correlate(frame, frame, mode='full')
696 |         stFeatures.append(stHarmonic(x, Fs))
697 |         # for i in range(len(X)):
698 |             #if (i < (len(X) / 8)) and (i > (len(X)/40)):
699 |             #    Ex += X[i]*X[i]
700 |             #El += X[i]*X[i]
701 |         # stFeatures.append(Ex / El)
702 |         # stFeatures.append(numpy.argmax(X))
703 |         # if curFV[numOfTimeSpectralFeatures+nceps+1]>0:
704 |         #     print curFV[numOfTimeSpectralFeatures+nceps], curFV[numOfTimeSpectralFeatures+nceps+1]
705 |     return numpy.array(stFeatures)
706 | 
707 | 
708 | """ Feature Extraction Wrappers
709 |  - The first two feature extraction wrappers are used to extract long-term averaged
710 |    audio features for a list of WAV files stored in a given category.
711 |    It is important to note that one single feature vector is extracted per WAV file (not the whole sequence of feature vectors).
712 | """
713 | 
714 | 
715 | def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
716 |     """
717 |     This function extracts the mid-term features of the WAVE files of a particular folder.
718 |     The resulting feature vector is extracted by long-term averaging the mid-term features.
719 |     Therefore ONE FEATURE VECTOR is extracted for each WAV file.
720 | ARGUMENTS: 721 | - dirName: the path of the WAVE directory 722 | - mtWin, mtStep: mid-term window and step (in seconds) 723 | - stWin, stStep: short-term window and step (in seconds) 724 | """ 725 | 726 | allMtFeatures = numpy.array([]) 727 | processingTimes = [] 728 | 729 | types = ('*.wav', '*.aif', '*.aiff') 730 | wavFilesList = [] 731 | for files in types: 732 | wavFilesList.extend(glob.glob(os.path.join(dirName, files))) 733 | 734 | wavFilesList = sorted(wavFilesList) 735 | 736 | for wavFile in wavFilesList: 737 | [Fs, x] = audioBasicIO.readAudioFile(wavFile) # read file 738 | t1 = time.clock() 739 | x = audioBasicIO.stereo2mono(x) # convert stereo to mono 740 | if computeBEAT: # mid-term feature extraction for current file 741 | [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) 742 | [beat, beatConf] = beatExtraction(stFeatures, stStep) 743 | else: 744 | [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) 745 | 746 | MidTermFeatures = numpy.transpose(MidTermFeatures) 747 | MidTermFeatures = MidTermFeatures.mean(axis=0) # long term averaging of mid-term statistics 748 | if computeBEAT: 749 | MidTermFeatures = numpy.append(MidTermFeatures, beat) 750 | MidTermFeatures = numpy.append(MidTermFeatures, beatConf) 751 | if len(allMtFeatures) == 0: # append feature vector 752 | allMtFeatures = MidTermFeatures 753 | else: 754 | allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures)) 755 | t2 = time.clock() 756 | duration = float(len(x)) / Fs 757 | processingTimes.append((t2 - t1) / duration) 758 | if len(processingTimes) > 0: 759 | print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes)))) 760 | return (allMtFeatures, wavFilesList) 761 | 762 | 763 | def dirsWavFeatureExtraction(dirNames, mtWin, mtStep, stWin, stStep, computeBEAT=False): 764 | ''' 765 | Same as dirWavFeatureExtraction, but instead of a single dir it takes a list of paths as input and returns a list of feature matrices. 766 | EXAMPLE: 767 | [features, classNames] = 768 | a.dirsWavFeatureExtraction(['audioData/classSegmentsRec/noise','audioData/classSegmentsRec/speech', 769 | 'audioData/classSegmentsRec/brush-teeth','audioData/classSegmentsRec/shower'], 1, 1, 0.02, 0.02); 770 | It can be used during the training process of a classification model , 771 | in order to get feature matrices from various audio classes (each stored in a seperate path) 772 | ''' 773 | 774 | # feature extraction for each class: 775 | features = [] 776 | classNames = [] 777 | fileNames = [] 778 | for i, d in enumerate(dirNames): 779 | [f, fn] = dirWavFeatureExtraction(d, mtWin, mtStep, stWin, stStep, computeBEAT=computeBEAT) 780 | if f.shape[0] > 0: # if at least one audio file has been found in the provided folder: 781 | features.append(f) 782 | fileNames.append(fn) 783 | if d[-1] == "/": 784 | classNames.append(d.split(os.sep)[-2]) 785 | else: 786 | classNames.append(d.split(os.sep)[-1]) 787 | return features, classNames, fileNames 788 | 789 | 790 | def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep): 791 | """ 792 | This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file. 
793 |     ARGUMENTS:
794 |         - dirName:        the path of the WAVE directory
795 |         - mtWin, mtStep:  mid-term window and step (in seconds)
796 |         - stWin, stStep:  short-term window and step (in seconds)
797 |     RETURNS:
798 |         - X:              a feature matrix (one row per mid-term window)
799 |         - Y:              a vector of file indices mapping each mid-term window back to its WAV file
800 |         - filenames:      the list of WAV file paths that were processed
801 |     """
802 | 
803 |     allMtFeatures = numpy.array([])
804 |     signalIndices = numpy.array([])
805 |     processingTimes = []
806 | 
807 |     types = ('*.wav', '*.aif', '*.aiff')
808 |     wavFilesList = []
809 |     for files in types:
810 |         wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
811 | 
812 |     wavFilesList = sorted(wavFilesList)
813 | 
814 |     for i, wavFile in enumerate(wavFilesList):
815 |         [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
816 |         x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
817 |         [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))  # mid-term feature
818 | 
819 |         MidTermFeatures = numpy.transpose(MidTermFeatures)
820 |         # MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
821 |         if len(allMtFeatures) == 0:  # append feature vector
822 |             allMtFeatures = MidTermFeatures
823 |             signalIndices = numpy.zeros((MidTermFeatures.shape[0], ))
824 |         else:
825 |             allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
826 |             signalIndices = numpy.append(signalIndices, i * numpy.ones((MidTermFeatures.shape[0], )))
827 | 
828 |     return (allMtFeatures, signalIndices, wavFilesList)
829 | 
830 | 
831 | # The following two feature extraction wrappers extract features for given audio files; however,
832 | # NO LONG-TERM AVERAGING is performed. Therefore, the output for each audio file is NOT A SINGLE FEATURE VECTOR
833 | # but a whole feature matrix.
834 | #
835 | # Also, another difference between the following two wrappers and the previous ones is that NO LONG-TERM AVERAGING IS PERFORMED.
836 | # In other words, the WAV files in these functions are not used as uniform samples that need to be averaged but as sequences 837 | 838 | def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile, 839 | storeStFeatures=False, storeToCSV=False, PLOT=False): 840 | """ 841 | This function is used as a wrapper to: 842 | a) read the content of a WAV file 843 | b) perform mid-term feature extraction on that signal 844 | c) write the mid-term feature sequences to a numpy file 845 | """ 846 | [Fs, x] = audioBasicIO.readAudioFile(fileName) # read the wav file 847 | x = audioBasicIO.stereo2mono(x) # convert to MONO if required 848 | if storeStFeatures: 849 | [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep)) 850 | else: 851 | [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep)) 852 | 853 | numpy.save(outPutFile, mtF) # save mt features to numpy file 854 | if PLOT: 855 | print "Mid-term numpy file: " + outPutFile + ".npy saved" 856 | if storeToCSV: 857 | numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",") 858 | if PLOT: 859 | print "Mid-term CSV file: " + outPutFile + ".csv saved" 860 | 861 | if storeStFeatures: 862 | numpy.save(outPutFile+"_st", stF) # save st features to numpy file 863 | if PLOT: 864 | print "Short-term numpy file: " + outPutFile + "_st.npy saved" 865 | if storeToCSV: 866 | numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",") # store st features to CSV file 867 | if PLOT: 868 | print "Short-term CSV file: " + outPutFile + "_st.csv saved" 869 | 870 | 871 | def mtFeatureExtractionToFileDir(dirName, midTermSize, midTermStep, shortTermSize, shortTermStep, storeStFeatures=False, storeToCSV=False, PLOT=False): 872 | types = (dirName + os.sep + '*.wav', ) 873 | filesToProcess = [] 874 | for files in types: 875 | filesToProcess.extend(glob.glob(files)) 876 | for f in filesToProcess: 877 | outPath = f 878 | mtFeatureExtractionToFile(f, midTermSize, midTermStep, shortTermSize, shortTermStep, outPath, storeStFeatures, storeToCSV, PLOT) --------------------------------------------------------------------------------
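For reference, a minimal usage sketch (not part of the repository) of driving cf.stFeatureExtraction directly on a mono 16-bit wav, mirroring the 0.2 s window with 50% step that calculate_features.py uses; the file name below is hypothetical and numpy/scipy are assumed to be installed. Note that the directory-level wrappers (dirWavFeatureExtraction, dirsWavFeatureExtraction, dirWavFeatureExtractionNoAveraging, mtFeatureExtractionToFile and mtFeatureExtractionToFileDir), as well as beatExtraction and phormants, depend on the audioBasicIO, utilities and scikits.talkbox imports that are commented out at the top of cf.py, so only the stFeatureExtraction / mtFeatureExtraction path used by calculate_features.py is expected to run as-is.

    import wave
    import numpy as np
    import cf

    w = wave.open('example.wav', 'r')          # hypothetical mono, 16-bit PCM file
    freq = w.getframerate()
    samples = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)
    w.close()

    window = int(0.2 * freq)                   # 0.2 s short-term window, as in calculate_features.py
    feats = cf.stFeatureExtraction(samples, freq, window, window / 2)
    print feats.shape                          # (num_features, num_short_term_windows)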