├── to_speech.py ├── README.md ├── calculate_features.py ├── models.py ├── code_yan_lstm.py ├── pitch.py ├── code.py └── cf.py /to_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gtts import gTTS 3 | import numpy as np 4 | 5 | def to_speech(text): 6 | tts = gTTS(text=text, lang = 'en') 7 | return tts 8 | 9 | def save(tts, filename): 10 | tts.save(filename + '.mp3') 11 | 12 | data = {'A':158, 'E':9307, 'M':1318, 'R':576, 'T':637, 'N':5707} 13 | labels = ['A','E','M','R','T','N'] 14 | values = [158, 9307, 1318, 576, 637, 5707] 15 | 16 | bounds = [np.sum(values[:i]) for i in xrange(0, 7)] 17 | print bounds 18 | 19 | n = np.sum(values) 20 | 21 | print np.sum(values) 22 | print values/np.sum(values, dtype=float) 23 | p_labels = [] 24 | t_labels = [] 25 | 26 | 27 | for i in xrange(n): 28 | a = int(n * np.random.random()) 29 | for j in xrange(6): 30 | if bounds[j] <= a and a < bounds[j+1]: 31 | p_labels.append(labels[j]) 32 | if bounds[j] <= i and i < bounds[j+1]: 33 | t_labels.append(labels[j]) 34 | 35 | p_labels = np.array(p_labels) 36 | t_labels = np.array(t_labels) 37 | 38 | print p_labels[p_labels == t_labels].shape[0] / float(t_labels.shape[0]) 39 | 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Audio emotion recognition 2 | 3 | ## Description 4 | 5 | The goal of this project is to provide scripts that verify our emotion recognition approach. 6 | 7 | ### Requirements 8 | 9 | * numpy 10 | * sklearn 11 | * librosa (for pitch estimation, optional) 12 | * keras 13 | 14 | ### Project structure 15 | The code consists of three main .py files. 16 | 17 | * models.py - contains Keras implementations of LSTM and multilayer perceptron neural networks, plus the sklearn models 18 | 19 | Main methods to use in emotion recognition: 20 | * train_mpc(train_x, train_y, test_x, test_y) - trains a multilayer perceptron. 21 | * train_lstm(train_x, train_y, test_x, test_y) - trains an LSTM NN 22 | * train_rfc(train_x, train_y, options) - trains a Random Forest classifier 23 | 24 | All methods return a model object and also report precision and loss on the test sample for inline validation. 25 | 26 | * calculate_features.py - script for feature estimation 27 | 28 | The method `calculate_features(signal, freq, options)` returns the feature set for all 0.2 sec frames in the signal. `freq` is the signal framerate. If the `use_derivatives` flag is true, the method also includes the 1st and 2nd time deltas of the features. 29 | To calculate features, the cf.py code is used. It is based on https://github.com/tyiannak/pyAudioAnalysis/blob/master/audioFeatureExtraction.py with minor improvements. 30 | 31 | By default it returns 32 MFCC, spectral and chromagram features. 32 | 33 | * code_yan_lstm.py - implementation of an LSTM NN for emotion recognition. 34 | 35 | The code can be divided into 3 blocks: 36 | * Data reading 37 | * Data preprocessing 38 | * The main part, which includes model building and validation 39 | 40 | To read the data successfully, choose a regime from ['aibo4', 'aibo5', 'iemocap'] and correct the paths to wavs, labels, etc. The aibo4 and aibo5 regimes correspond to different label sets over the AIBO database. 41 | 42 | Data preprocessing consists of normalization, padding sequences, transformation of labels from categorical to vector form, balancing and resampling. 43 | 44 | The main part is a 5-fold cross-validation procedure over the loaded database. It splits the sample into train and test parts, trains models on the train part and validates on the test part, as sketched below.
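Below is a minimal, illustrative sketch of that split (not a verbatim excerpt from code_yan_lstm.py; the `ids` array here is a placeholder for the identifiers the script builds with `get_field(data, 'id')`):

```python
import numpy as np

# Placeholder ids; in code_yan_lstm.py they come from get_field(data, 'id').
ids = np.array(['utt_%03d' % i for i in range(100)])

parts = 5
permuted_ids = ids[np.random.permutation(len(ids))]
step = len(permuted_ids) // parts

for part in range(parts):
    i0, i1 = step * part, step * (part + 1)
    test_idx = permuted_ids[i0:i1]                               # held-out fold
    train_idx = np.append(permuted_ids[:i0], permuted_ids[i1:])  # remaining folds
    # Per-id feature files are then loaded from path_to_samples, padded to a
    # fixed number of frames, labels are one-hot encoded, and models.train_lstm
    # is trained on the training fold and scored on the held-out fold.
```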
After the cross-validation procedure finishes, it plots a confusion matrix comparing the predictions with the expected outputs. 45 | 46 | ## Running 47 | 48 | To run the code, enter `python code_yan_lstm.py` in a command line or IDE. In addition to correcting paths and settings, create a folder for the feature samples in the working directory. 49 | -------------------------------------------------------------------------------- /calculate_features.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import math 4 | import os 5 | import pickle 6 | import cf 7 | 8 | 9 | def weighting(x, weighting_type=None): 10 | if weighting_type == 'cos': 11 | return x * np.sin(np.pi * np.arange(len(x)) / (len(x) - 1)) 12 | elif weighting_type == 'hamming': 13 | return x * (0.54 - 0.46 * np.cos(2. * np.pi * np.arange(len(x)) / (len(x) - 1))) 14 | else: 15 | return x 16 | 17 | def calculate_features(frames, freq, options): 18 | n = len(frames) 19 | window_sec = 0.2 20 | window_n = int(freq * window_sec) 21 | use_derivatives = False 22 | 23 | st_f = cf.stFeatureExtraction(frames, freq, window_n, window_n / 2) 24 | if st_f.shape[1] > 2: 25 | i0 = 1 26 | i1 = st_f.shape[1] - 1 27 | if i1 - i0 < 1: 28 | i1 = i0 + 1 29 | if use_derivatives: 30 | deriv_st_f = np.zeros((st_f.shape[0]*3, i1 - i0), dtype=float) 31 | else: 32 | deriv_st_f = np.zeros((st_f.shape[0], i1 - i0), dtype=float) 33 | for i in xrange(i0, i1): 34 | i_left = i - 1 35 | i_right = i + 1 36 | deriv_st_f[:st_f.shape[0], i - i0] = st_f[:, i] 37 | if use_derivatives: 38 | if st_f.shape[1] >= 2: 39 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, i - i0] = (st_f[:, i_right] - st_f[:, i_left]) / 2. 40 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, i - i0] = st_f[:, i] - 0.5*(st_f[:, i_left] + st_f[:, i_right]) 41 | return deriv_st_f 42 | elif st_f.shape[1] == 2: 43 | deriv_st_f = np.zeros((st_f.shape[0], 1), dtype=float) 44 | deriv_st_f[:st_f.shape[0], 0] = st_f[:, 0] 45 | if use_derivatives: 46 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, 0] = st_f[:, 1] - st_f[:, 0] 47 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, 0] = np.zeros(st_f.shape[0]) 48 | return deriv_st_f 49 | else: 50 | deriv_st_f = np.zeros((st_f.shape[0], 1), dtype=float) 51 | deriv_st_f[:st_f.shape[0], 0] = st_f[:, 0] 52 | if use_derivatives: 53 | deriv_st_f[st_f.shape[0]:st_f.shape[0]*2, 0] = np.zeros(st_f.shape[0]) 54 | deriv_st_f[st_f.shape[0]*2:st_f.shape[0]*3, 0] = np.zeros(st_f.shape[0]) 55 | return deriv_st_f 56 | 57 | #mt_f, st_f2 = cf.mtFeatureExtraction(frames, 16000, 2*window_n, 2*window_n, window_n, window_n) 58 | #print st_f.shape, n / window_n 59 | # print st_f2.shape, n / window_n 60 | # print mt_f.shape, n / window_n 61 | # print mt_f 62 | # for i in xrange(k0, k1): 63 | # if overlapping_type in ['l', 'lr']: 64 | # i0 = (i - 1) * window_n 65 | # else: 66 | # i0 = i * window_n 67 | # if overlapping_type in ['r', 'lr']: 68 | # i1 = (i + 2) * window_n 69 | # else: 70 | # i1 = (i + 1) * window_n 71 | 72 | # x = frames[i0:i1] 73 | # print len(x), i0, i1, len(frames), n / window_n 74 | 75 | # # x = weighting(x) 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import math 4 | import os 5 | import pickle 6 | from keras.preprocessing import
sequence 7 | from sklearn.ensemble import RandomForestRegressor as RFR 8 | 9 | from keras.datasets import mnist 10 | from keras.models import Sequential 11 | from keras.layers.core import Dense, Dropout, Activation, Merge, TimeDistributedDense 12 | from keras.layers import LSTM 13 | from keras.optimizers import SGD, Adam, RMSprop 14 | from keras.utils import np_utils 15 | from keras.regularizers import l2 16 | from keras.optimizers import SGD 17 | 18 | import h5py 19 | 20 | def train_rfr(x, y, tx, ty): 21 | rfr = RFR(n_estimators=50) 22 | model = rfr.fit(x, y) 23 | 24 | return model 25 | 26 | 27 | 28 | def train(x, y, options): 29 | classes = np.unique(y) 30 | print classes 31 | 32 | 33 | classifiers = [] 34 | for c in classes: 35 | out = np.array([int(i) for i in y==c]) 36 | print c, len(out[out == 1]), len(out[out == 0]), len(out[out == 1]) + len(out[out == 0]) 37 | 38 | 39 | 40 | classifier = RFC(n_estimators=10) 41 | classifier.fit(x, out) 42 | classifiers.append(classifier) 43 | return classifiers 44 | 45 | def train_rfc(x, y, options): 46 | classifier = RFC(n_estimators=100) 47 | 48 | return classifier.fit(x, y) 49 | 50 | def fork (model, n=2): 51 | forks = [] 52 | for i in range(n): 53 | f = Sequential() 54 | f.add (model) 55 | forks.append(f) 56 | return forks 57 | 58 | 59 | def make_sample_lstm(x, n, y=None, use_y=False): 60 | if use_y: 61 | xt = np.zeros((x.shape[0], n, x.shape[1] + 1), dtype=float) 62 | t = np.zeros((x.shape[0], x.shape[1] + 1), dtype=float) 63 | for i in xrange(x.shape[0]): 64 | if i == 0: 65 | t[i, :-1] = x[i, :] 66 | t[i, -1] = 0 67 | else: 68 | t[i, :-1] = x[i, :] 69 | t[i, -1] = y[i-1] 70 | 71 | for i in xrange(x.shape[0]): 72 | if i < n: 73 | i0 = n - i 74 | xt[i, :i0, :] = np.zeros((i0, x.shape[1] + 1), dtype=float) 75 | if i > 0: 76 | xt[i, i0:, :] = t[:i, :] 77 | else: 78 | xt[i, :, :] = t[i-n:i, :] 79 | return xt 80 | else: 81 | xt = np.zeros((x.shape[0], n, x.shape[1]), dtype=float) 82 | for i in xrange(x.shape[0]): 83 | if i < n: 84 | i0 = n - i 85 | xt[i, :i0, :] = np.zeros((i0, x.shape[1]), dtype=float) 86 | if i > 0: 87 | xt[i, i0:, :] = x[:i, :] 88 | else: 89 | xt[i, :, :] = x[i-n:i, :] 90 | return xt 91 | 92 | 93 | class modelLSTM: 94 | def __init__(self, model, length, use_y): 95 | self.model = model 96 | self.n = length 97 | self.use_y = use_y 98 | def predict(self, x): 99 | if self.use_y: 100 | result = np.zeros((x.shape[0], 1), dtype=float) 101 | for i in xrange(x.shape[0]): 102 | t = np.zeros((self.n, x.shape[1] + 1), dtype=float) 103 | 104 | 105 | else: 106 | xt = make_sample_lstm(x, self.n) 107 | return self.model.predict(xt) 108 | def save(self, name_json, name_weights): 109 | json_string = self.model.to_json() 110 | open(name_json, 'w').write(json_string) 111 | self.model.save_weights(name_weights) 112 | 113 | 114 | def train_lstm_avec(x, y, xt, yt): 115 | length = 25 116 | use_y = False 117 | 118 | x_series_train = make_sample_lstm(x, length, y, use_y) 119 | print x_series_train.shape 120 | x_series_test = make_sample_lstm(xt, length, yt, use_y) 121 | print x_series_test.shape 122 | 123 | print y[:100, 0] 124 | model = Sequential() 125 | model.add(LSTM(256, return_sequences=True, input_shape=(x_series_train.shape[1], x_series_train.shape[2]))) 126 | model.add(Dropout(0.2)) 127 | model.add(Activation('tanh')) 128 | model.add(LSTM(128)) 129 | model.add(Dropout(0.2)) 130 | model.add(Activation('tanh')) 131 | # model.add(Dense(128)) 132 | # model.add(Activation('tanh')) 133 | model.add(Dense(1)) 134 | #model.add(Activation('softmax')) 135 
| model.summary() 136 | model.compile(loss='mean_absolute_error', optimizer='rmsprop') 137 | model.fit(x_series_train, y, batch_size=512, nb_epoch=50, 138 | verbose=2, validation_data=(x_series_test, yt)) 139 | 140 | return modelLSTM(model, length, use_y) 141 | 142 | 143 | 144 | 145 | 146 | def train_lstm(x, y, xt, yt): 147 | batch_size = 320 148 | nb_classes = 10 149 | nb_epoch = 25 150 | ts = x[0].shape[0] 151 | model = Sequential() 152 | model.add(LSTM(512, return_sequences=True, input_shape=(ts, x[0].shape[1]))) 153 | model.add(Activation('tanh')) 154 | # model.add(LSTM(512, return_sequences=True)) 155 | # model.add(Activation('tanh')) 156 | model.add(LSTM(256, return_sequences=False)) 157 | model.add(Activation('tanh')) 158 | model.add(Dense(512)) 159 | model.add(Activation('tanh')) 160 | model.add(Dense(y.shape[1])) 161 | model.add(Activation('softmax')) 162 | 163 | model.summary() 164 | 165 | model.compile(loss='categorical_crossentropy', optimizer=RMSprop()) 166 | 167 | 168 | 169 | # for epoch in xrange(nb_epoch): 170 | model.fit(x, y, batch_size=batch_size, nb_epoch=nb_epoch, 171 | verbose=2, validation_data=(xt, yt), show_accuracy=True) 172 | 173 | return model 174 | 175 | def train_mpc(x, y, tx, ty): 176 | 177 | batch_size = 256 178 | nb_classes = 10 179 | nb_epoch = 20 180 | 181 | model = Sequential() 182 | model.add(Dense(512, input_shape=(x.shape[1],))) 183 | model.add(Activation('relu')) 184 | model.add(Dropout(0.2)) 185 | model.add(Dense(1024)) 186 | model.add(Activation('relu')) 187 | model.add(Dense(1024)) 188 | model.add(Activation('relu')) 189 | model.add(Dropout(0.2)) 190 | model.add(Dense(512)) 191 | model.add(Activation('relu')) 192 | model.add(Dense(1024)) 193 | model.add(Activation('relu')) 194 | model.add(Dense(512)) 195 | model.add(Activation('relu')) 196 | model.add(Dense(y.shape[1])) 197 | #model.add(Activation('softmax')) 198 | 199 | #model.summary() 200 | 201 | model.compile(loss='mean_absolute_error', 202 | optimizer=RMSprop()) 203 | 204 | history = model.fit(x, y, 205 | batch_size=batch_size, nb_epoch=nb_epoch, 206 | verbose=0, validation_data=(tx, ty), show_accuracy=True) 207 | 208 | return model 209 | 210 | def validate1(classifier, test_x, test_y): 211 | predictions = classifier.predict(test_x) 212 | 213 | total_acc = predictions[predictions == test_y].shape[0] / float(predictions.shape[0]) 214 | print 'Total acc ', total_acc 215 | 216 | classes = np.unique(test_y) 217 | print test_y 218 | print predictions 219 | ans = [] 220 | for ci, c in enumerate(classes): 221 | print ci 222 | 223 | idx = np.array([ii for ii, i in enumerate(test_y==c)]) 224 | out = test_y[idx] 225 | pred = predictions[idx] 226 | 227 | tt = pred[(pred == c) * (out == c)].shape[0] / float(out[out == c].shape[0]) 228 | tf = pred[(pred == c) * (out != c)].shape[0] / float(out[out != c].shape[0]) 229 | ft = pred[(pred != c) * (out == c)].shape[0] / float(out[out == c].shape[0]) 230 | ff = pred[(pred != c) * (out != c)].shape[0] / float(out[out != c].shape[0]) 231 | print tt, tf, ft, ff, '\t', out[out == c].shape[0] / float(out.shape[0]), out[out != c].shape[0] / float(out.shape[0]) 232 | tt_tf_ft_ff = [tt, tf, ft, ff] 233 | ans.append(tt_tf_ft_ff) 234 | 235 | ans_matrix = np.zeros((len(classes), len(classes)), dtype=float) 236 | for ci, c in enumerate(classes): 237 | for ci2, c2 in enumerate(classes): 238 | ans_matrix[ci2, ci] = pred[(pred == c) * (test_y == c2)].shape[0] / float(test_y[test_y == c2].shape[0]) 239 | 240 | 241 | return np.array(ans, dtype=float), ans_matrix 242 | 243 | 244 
| class Random: 245 | def __init__(self): 246 | 1 247 | def fit(self, x, y): 248 | 1 249 | def predict(self, x): 250 | return np.random.normal(0, 0.1, size=x.shape[0]) 251 | 252 | 253 | def train_rnd(x, y, tx, ty): 254 | return Random() 255 | 256 | 257 | def validate(classifiers, test_x, test_y): 258 | print len(classifiers) 259 | classes = np.unique(test_y) 260 | ans = [] 261 | for ci, c in enumerate(classes): 262 | print ci 263 | out = np.array([int(i) for i in test_y==c]) 264 | predictions = classifiers[ci].predict(test_x) 265 | tt = predictions[(predictions == 1) * (out == 1)].shape[0] / float(out[out == 1].shape[0]) 266 | tf = predictions[(predictions == 1) * (out == 0)].shape[0] / float(out[out == 0].shape[0]) 267 | ft = predictions[(predictions == 0) * (out == 1)].shape[0] / float(out[out == 1].shape[0]) 268 | ff = predictions[(predictions == 0) * (out == 0)].shape[0] / float(out[out == 0].shape[0]) 269 | print tt, tf, ft, ff, '\t', out[out == 1].shape[0] / float(out.shape[0]), out[out == 0].shape[0] / float(out.shape[0]) 270 | tt_tf_ft_ff = [tt, tf, ft, ff] 271 | ans.append(tt_tf_ft_ff) 272 | 273 | return np.array(ans, dtype=float) 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /code_yan_lstm.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as ticker 6 | import wave 7 | import numpy as np 8 | import math 9 | import os 10 | import pickle 11 | import models 12 | import csv 13 | 14 | import calculate_features as cf 15 | import models 16 | 17 | np.random.seed(200) 18 | 19 | regime = 'aibo5' 20 | 21 | def get_params(regime): 22 | if regime == 'iemocap': 23 | #available_emotions = ['ang', 'exc', 'fru', 'neu', 'sad'] 24 | available_emotions = ['ang', 'exc', 'neu', 'sad'] 25 | path_to_samples = 'iem_samples/' 26 | conf_matrix_prefix = 'iemocap' 27 | framerate = 44100 28 | return available_emotions, '', '', '', path_to_samples, conf_matrix_prefix, framerate, '', 0 29 | elif regime == 'aibo4': 30 | available_emotions = ['N', 'M', 'E', 'A'] 31 | 32 | path_to_wav = 'aibo_data/wav/' 33 | path_to_transcription = 'aibo_data/transliteration/' 34 | path_to_labels = 'aibo_data/labels/CEICES/' 35 | path_to_samples = 'yan_samples_lstm4/' 36 | conf_matrix_prefix = 'aibo4' 37 | labels_file = 'word_labels_4cl_aibo_word_set.txt' 38 | label_pos = 2 39 | framerate = 16000 40 | return available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos 41 | else: 42 | available_emotions = ['N', 'R', 'E', 'A', 'P'] 43 | 44 | path_to_wav = 'aibo_data/wav/' 45 | path_to_transcription = 'aibo_data/transliteration/' 46 | path_to_labels = 'aibo_data/labels/IS2009EmotionChallenge/' 47 | path_to_samples = 'yan_samples_lstm5/' 48 | conf_matrix_prefix = 'aibo5' 49 | labels_file = 'chunk_labels_5cl_corpus.txt' 50 | framerate = 16000 51 | label_pos = 1 52 | return available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos 53 | 54 | available_emotions, path_to_wav, path_to_transcription, path_to_labels, path_to_samples, conf_matrix_prefix, framerate, labels_file, label_pos = get_params(regime) 55 | 56 | segmentation = 'by_phrase' 57 | 58 | types = {1: np.int8, 2: np.int16, 4: np.int32} 59 | 60 | def 
open_wav(path_to_wav, filename): 61 | wav = wave.open(path_to_wav + filename, mode="r") 62 | (nchannels, sampwidth, framerate, nframes, comptype, compname) = wav.getparams() 63 | content = wav.readframes(nframes) 64 | 65 | samples = np.fromstring(content, dtype=types[sampwidth]) 66 | return (nchannels, sampwidth, framerate, nframes, comptype, compname), samples 67 | 68 | def read_lines(f): 69 | lines = open(f, 'r').read() 70 | return np.array(lines.split('\n')) 71 | 72 | 73 | def get_needed_lines(lines, i, subline): 74 | result = [] 75 | j = i 76 | condition = True 77 | while condition: 78 | if lines[j][:len(subline)] == subline: 79 | result.append(lines[j]) 80 | j += 1 81 | else: 82 | condition = False 83 | return result, j 84 | 85 | def get_label(expert_grades): 86 | u = np.unique(expert_grades) 87 | grade = u[0] 88 | count = expert_grades[expert_grades == grade].shape[0] 89 | for ui in u: 90 | current_count = expert_grades[expert_grades == ui].shape[0] 91 | if current_count > count: 92 | grade = ui 93 | count = current_count 94 | return grade 95 | 96 | 97 | 98 | def read_aibo_data(): 99 | data = [] 100 | files = np.sort([f[:-4] + '.wav' for f in os.listdir(path_to_wav)]) 101 | transliterations = read_lines(path_to_transcription + 'transliteration.txt') 102 | labels = read_lines(path_to_labels + labels_file) 103 | index_t = 0 104 | index_l = 0 105 | 106 | c = 0 107 | for fi, f in enumerate(files): 108 | d = {} 109 | if fi % 1000 == 0: 110 | print f, fi, ' out of ', len(files) 111 | w = open_wav(path_to_wav, f) 112 | signal = w[1] 113 | length = w[0][3] / float(w[0][2]) 114 | t, index_t = get_needed_lines(transliterations, index_t, f[:-4]) 115 | l, index_l = get_needed_lines(labels, index_l, f[:-4]) 116 | if l != []: 117 | grades = [] 118 | 119 | em = l[0].split(' ')[label_pos][0] 120 | d['id'] = f[:-4] 121 | d['emotion'] = em 122 | d['signal'] = signal 123 | d['transcription'] = t[0][14:-2] 124 | d['length'] = length 125 | if (d['emotion'] in available_emotions) and (d['length'] > 0.8): 126 | data.append(d) 127 | else: 128 | c += 1 129 | 130 | print 'missed: ', c 131 | 132 | return data 133 | 134 | 135 | sessions = ['Session1', 'Session2', 'Session3', 'Session4', 'Session5'] 136 | 137 | def get_transcriptions(path_to_transcriptions, filename): 138 | f = open(path_to_transcriptions + filename, 'r').read() 139 | f = np.array(f.split('\n')) 140 | transcription = {} 141 | 142 | for i in xrange(len(f) - 1): 143 | g = f[i] 144 | i1 = g.find(': ') 145 | i0 = g.find(' [') 146 | ind_id = g[:i0] 147 | ind_ts = g[i1+2:] 148 | transcription[ind_id] = ind_ts 149 | return transcription 150 | 151 | def split_wav(wav, emotions): 152 | (nchannels, sampwidth, framerate, nframes, comptype, compname), samples = wav 153 | duration = nframes / framerate 154 | 155 | left = samples[0::nchannels] 156 | right = samples[1::nchannels] 157 | 158 | frames = [] 159 | for ie, e in enumerate(emotions): 160 | start = e['start'] 161 | end = e['end'] 162 | 163 | e['right'] = right[int(start * framerate):int(end * framerate)] 164 | e['left'] = left[int(start * framerate):int(end * framerate)] 165 | 166 | frames.append({'left':e['left'], 'right': e['right']}) 167 | return frames 168 | 169 | def get_emotions(path_to_emotions, filename): 170 | f = open(path_to_emotions + filename, 'r').read() 171 | # print f 172 | # print f.split('\n') 173 | f = np.array(f.split('\n'))#np.append(np.array(['']), np.array(f.split('\n'))) 174 | c = 0 175 | idx = f == '' 176 | idx_n = np.arange(len(f))[idx] 177 | emotion = [] 178 | for i in 
xrange(len(idx_n) - 2): 179 | g = f[idx_n[i]+1:idx_n[i+1]] 180 | head = g[0] 181 | i0 = head.find(' - ') 182 | start_time = float(head[head.find('[') + 1:head.find(' - ')]) 183 | end_time = float(head[head.find(' - ') + 3:head.find(']')]) 184 | actor_id = head[head.find(filename[:-4]) + len(filename[:-4]) + 1:head.find(filename[:-4]) + len(filename[:-4]) + 5] 185 | emo = head[head.find('\t[') - 3:head.find('\t[')] 186 | vad = head[head.find('\t[') + 1:] 187 | 188 | v = float(vad[1:7]) 189 | a = float(vad[9:15]) 190 | d = float(vad[17:23]) 191 | 192 | emotion.append({'start':start_time, 193 | 'end':end_time, 194 | 'id':filename[:-4] + '_' + actor_id, 195 | 'v':v, 196 | 'a':a, 197 | 'd':d, 198 | 'emotion':emo}) 199 | return emotion 200 | 201 | def read_iemocap_data(): 202 | data = [] 203 | for session in sessions: 204 | print session 205 | path_to_wav = 'iemocap_data/' + session + '/dialog/wav/' 206 | path_to_emotions = 'iemocap_data/' + session + '/dialog/EmoEvaluation/' 207 | path_to_transcriptions = 'iemocap_data/' + session + '/dialog/transcriptions/' 208 | 209 | files = os.listdir(path_to_wav) 210 | files = [f[:-4] for f in files] 211 | print len(files) 212 | print files 213 | for f in files: 214 | emotions = get_emotions(path_to_emotions, f + '.txt') 215 | wav = open_wav(path_to_wav, f + '.wav') 216 | sample = split_wav(wav, emotions) 217 | 218 | transcriptions = get_transcriptions(path_to_transcriptions, f + '.txt') 219 | for ie, e in enumerate(emotions): 220 | if e['emotion'] in available_emotions: 221 | e['signal'] = sample[ie]['left'] 222 | #e['right'] = sample[ie]['right'] 223 | e['transcription'] = transcriptions[e['id']] 224 | data.append(e) 225 | return data 226 | 227 | def read_data(): 228 | if regime == 'aibo4' or regime == 'aibo5': 229 | return read_aibo_data() 230 | else: 231 | return read_iemocap_data() 232 | 233 | def check_all_finite(X): 234 | X = np.asanyarray(X) 235 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 236 | and not np.isfinite(X).all()): 237 | return True 238 | else: 239 | return False 240 | 241 | def to_categorical(y): 242 | y_cat = np.zeros((len(y), len(available_emotions)), dtype=int) 243 | for i in xrange(len(y)): 244 | y_cat[i, :] = np.array(np.array(available_emotions) == y[i], dtype=int) 245 | 246 | return y_cat 247 | 248 | def save_sample(x, y, name): 249 | with open(name, 'w') as csvfile: 250 | w = csv.writer(csvfile, delimiter=',') 251 | for i in xrange(x.shape[0]): 252 | row = x[i, :].tolist() 253 | row.append(y[i]) 254 | w.writerow(row) 255 | 256 | def load(name): 257 | with open(name, 'r') as csvfile: 258 | r = csv.reader(csvfile, delimiter=',') 259 | x = [] 260 | y = [] 261 | for row in r: 262 | x.append(row[:-1]) 263 | y.append(row[-1]) 264 | return np.array(x, dtype=float), np.array(y) 265 | 266 | def get_features(data, save=True, path=path_to_samples): 267 | failed_samples = [] 268 | for di, d in enumerate(data): 269 | if di%1000 == 0: 270 | print di, ' out of ', len(data) 271 | st_features = cf.calculate_features(d['signal'], framerate, None).T 272 | x = [] 273 | y = [] 274 | for f in st_features: 275 | if f[1] > 1.e-4: 276 | x.append(f) 277 | y.append(d['emotion']) 278 | x = np.array(x, dtype=float) 279 | y = np.array(y) 280 | 281 | if save: 282 | save_sample(x, y, path + d['id'] + '.csv') 283 | return x, y 284 | 285 | def get_field(data, key): 286 | return np.array([e[key] for e in data]) 287 | 288 | 289 | def balance_sample(x, y, size): 290 | labels = np.unique(y) 291 | xc = {} 292 | yc = {} 293 | for l in 
labels: 294 | xc[l] = x[y == l] 295 | yc[l] = y[y == l] 296 | 297 | 298 | s = size / len(labels) 299 | 300 | tx = np.zeros((s*len(labels), x.shape[1]), dtype=float) 301 | ty = np.zeros(s*len(labels), dtype=str) 302 | for i in xrange(len(labels)): 303 | j = i*s 304 | n = xc[labels[i]].shape[0] 305 | 306 | idx = np.random.randint(low=0, high=n, size=s) 307 | tx[j:j+s, :] = xc[labels[i]][idx, :] 308 | ty[j:j+s] = yc[labels[i]][idx] 309 | 310 | return tx, ty 311 | 312 | def get_sample(idx, path): 313 | tx = [] 314 | ty = [] 315 | for i in idx: 316 | x, y = load(path + '/' + i + '.csv') 317 | if len(x) < 40: 318 | tx.append(np.array(x, dtype=float)) 319 | #tx.append(np.array(x, dtype=float)) 320 | ty.append(y[0]) 321 | 322 | tx = np.array(tx) 323 | ty = np.array(ty) 324 | return tx, ty 325 | 326 | def normalize(x): 327 | gminx = np.zeros(x[0].shape[1]) + 1.e5 328 | gmaxx = np.zeros(x[0].shape[1]) - 1.e5 329 | for i in xrange(x.shape[0]): 330 | q = x[i] 331 | minx = np.min(q, axis=0) 332 | maxx = np.max(q, axis=0) 333 | 334 | for s in xrange(x[0].shape[1]): 335 | if gminx[s] > minx[s]: 336 | gminx[s] = minx[s] 337 | if gmaxx[s] < maxx[s]: 338 | gmaxx[s] = maxx[s] 339 | 340 | for i in xrange(x.shape[0]): 341 | for s in xrange(x[0].shape[1]): 342 | x[i][:, s] = (x[i][:, s] - gminx[s]) / float(gmaxx[s] - gminx[s]) 343 | 344 | 345 | return x 346 | 347 | def grow_sample(x, y, n=10000): 348 | xg = [] 349 | yg = [] 350 | eps = 5.*1.e-2 351 | for i in xrange(n): 352 | j = np.random.randint(x.shape[0]) 353 | x0 = x[j] 354 | x0 += eps * np.random.normal(0, 1, size=x0.shape) 355 | y0 = y[j] 356 | xg.append(x0) 357 | yg.append(y0) 358 | return np.array(xg), np.array(yg) 359 | 360 | def reshape_for_dense(x, y): 361 | j = 0 362 | xr = [] 363 | yr = [] 364 | for i in xrange(x.shape[0]): 365 | for k in xrange(x[i].shape[0]): 366 | xr.append(x[i][k, :]) 367 | yr.append(y[i]) 368 | return np.array(xr), np.array(yr) 369 | 370 | 371 | def pad_sequence(x, ts): 372 | xp = [] 373 | for i in xrange(x.shape[0]): 374 | x0 = np.zeros((ts, x[i].shape[1]), dtype=float) 375 | if ts > x[i].shape[0]: 376 | x0[ts - x[i].shape[0]:, :] = x[i] 377 | else: 378 | maxe = np.sum(x[i][0:ts, 1]) 379 | for j in xrange(x[i].shape[0] - ts): 380 | if np.sum(x[i][j:j + ts, 1]) > maxe: 381 | x0 = x[i][j:j + ts, :] 382 | maxe = np.sum(x[i][j:j + ts, 1]) 383 | xp.append(x0) 384 | return np.array(xp) 385 | 386 | 387 | 388 | data = np.array(read_data()) 389 | print data 390 | print len(data) 391 | 392 | ids = get_field(data, 'id') 393 | emotions = get_field(data, 'emotion') 394 | print np.unique(emotions) 395 | 396 | for i in xrange(len(available_emotions)): 397 | print available_emotions[i], emotions[emotions == available_emotions[i]].shape[0] 398 | 399 | parts = 5 400 | permutation = np.random.permutation(len(data)) 401 | permuted_ids = ids[permutation] 402 | 403 | step = len(data) / parts 404 | 405 | preds = [] 406 | trues = [] 407 | 408 | get_features(data) 409 | 410 | for part in xrange(parts): 411 | i0 = step * part 412 | i1 = step * (part + 1) 413 | 414 | train_idx = np.append(permuted_ids[:i0], permuted_ids[i1:]) 415 | test_idx = permuted_ids[i0:i1] 416 | 417 | train_x, train_y = get_sample(train_idx, path_to_samples) 418 | 419 | 420 | 421 | # energies = [] 422 | # for i in xrange(train_x.shape[0]): 423 | # energies.append(np.mean(train_x[i][:, 1])) 424 | 425 | # quantile = 0.5 426 | # largest_energy_idx = np.argsort(energies)[:int(quantile*len(energies))] 427 | 428 | # train_x = train_x[largest_energy_idx] 429 | # train_y = 
train_y[largest_energy_idx] 430 | 431 | test_x, test_y = get_sample(test_idx, path_to_samples) 432 | 433 | # train_x = normalize(train_x) 434 | # test_x = normalize(test_x) 435 | 436 | #train_x, train_y = grow_sample(train_x, train_y, 10000) 437 | #train_x, train_y = reshape_for_dense(train_x, train_y) 438 | print train_x.shape 439 | print train_y.shape 440 | 441 | # lengths = {} 442 | # for i in xrange(train_x.shape[0]): 443 | # if train_x[i].shape[0] in lengths.keys(): 444 | # lengths[train_x[i].shape[0]] += 1 445 | # else: 446 | # lengths[train_x[i].shape[0]] = 1 447 | 448 | # for k, v in lengths.items(): 449 | # print k, v 450 | 451 | ts = 32 452 | 453 | train_x = pad_sequence(train_x, ts) 454 | test_x = pad_sequence(test_x, ts) 455 | 456 | train_y_cat = to_categorical(train_y) 457 | test_y_cat = to_categorical(test_y) 458 | model = models.train_lstm(train_x, train_y_cat, test_x, test_y_cat) 459 | 460 | scores = model.predict(test_x) 461 | prediction = np.array([available_emotions[np.argmax(t)] for t in scores]) 462 | print prediction[prediction == test_y].shape[0] / float(prediction.shape[0]) 463 | #test_x, test_y = get_sample(train_idx, 'yan_samples_lstm4') 464 | 465 | for i in xrange(len(prediction)): 466 | preds.append(prediction[i]) 467 | trues.append(test_y[i]) 468 | 469 | class_to_class_precs = np.zeros((len(available_emotions), len(available_emotions)), dtype=float) 470 | 471 | preds = np.array(preds) 472 | trues = np.array(trues) 473 | 474 | print 'Total accuracy: ', preds[preds == trues].shape[0] / float(preds.shape[0]) 475 | 476 | for cpi, cp in enumerate(available_emotions): 477 | for cti, ct in enumerate(available_emotions): 478 | #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 479 | if trues[trues == ct].shape[0] > 0: 480 | class_to_class_precs[cti, cpi] = preds[(preds == cp) * (trues == ct)].shape[0] / float(trues[trues == ct].shape[0]) 481 | else: 482 | class_to_class_precs[cti, cpi] = 0. 
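# class_to_class_precs[cti, cpi] holds the fraction of samples whose true label is
# available_emotions[cti] that were predicted as available_emotions[cpi]: rows index the
# true class, columns the predicted class, and each row sums to 1 whenever that true class
# occurs in `trues`. The plotting code below draws the transposed matrix as a heatmap and
# writes one truncated value per cell as a text annotation.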
483 | 484 | 485 | fig, ax = plt.subplots() 486 | heatmap = ax.pcolor(class_to_class_precs.T, cmap=plt.cm.Blues) 487 | 488 | ax.set_xticklabels(available_emotions, minor=False) 489 | ax.set_yticklabels(available_emotions, minor=False) 490 | 491 | ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 492 | ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 493 | 494 | for i in xrange(len(available_emotions)): 495 | for j in xrange(len(available_emotions)): 496 | plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 497 | 498 | plt.savefig(conf_matrix_prefix + 'lstm.png') 499 | plt.close() 500 | 501 | -------------------------------------------------------------------------------- /pitch.py: -------------------------------------------------------------------------------- 1 | """this is the module version""" 2 | 3 | import librosa 4 | import numpy as np 5 | import math 6 | 7 | # --------------------------------------------------------------------------- # 8 | # utilities 9 | # --------------------------------------------------------------------------- # 10 | 11 | def ratio_to_cents(r): 12 | return 1200.0 * np.log2(r) 13 | 14 | def ratio_to_cents_protected(f1, f2): 15 | """avoid divide by zero in here, but expects np.arrays""" 16 | out = np.zeros_like(f1) 17 | key = (f1!=0.0) * (f2!=0.0) 18 | out[key] = 1200.0 * np.log2(f1[key]/f2[key]) 19 | out[f1==0.0] = -np.inf 20 | out[f2==0.0] = np.inf 21 | out[(f1==0.0) * (f2==0.0)] = 0.0 22 | return out 23 | 24 | def cents_to_ratio(c): 25 | return np.power(2, c/1200.0) 26 | 27 | def freq_to_midi(f): 28 | return 69.0 + 12.0 * np.log2(f/440.0) 29 | 30 | def midi_to_freq(m): 31 | return np.power(2, (m-69.0)/12.0) * 440.0 if m!= 0.0 else 0.0 32 | 33 | def bin_to_freq(b, sr, n_fft): 34 | return b * float(sr) / float(n_fft) 35 | 36 | def freq_to_bin(f, sr, n_fft): 37 | return np.round(f/(float(sr)/float(n_fft))).astype('int') 38 | 39 | # --------------------------------------------------------------------------- # 40 | # ppitch 41 | # --------------------------------------------------------------------------- # 42 | 43 | def ppitch(y, sr=44100, n_fft=8820, win_length=1024, hop_length=2048, 44 | num_peaks=20, num_pitches=3, min_peak=2, max_peak=743, min_fund=55.0, 45 | max_fund=1000.0, harm_offset=0, harm_rolloff=0.75, ml_width=25, 46 | bounce_width=0, bounce_hist=0, bounce_ratios=None, max_harm=32767, 47 | npartial=7): 48 | 49 | """Polyphonic pitch estimation. 
50 | parameters: 51 | - y: signal to analyze 52 | - sr: sample rate 53 | - n_fft: fft size 54 | - win_length: fft window size 55 | - hop_length: stft hop 56 | - num_peaks: number of peaks to find 57 | - num_pitches: number of pitches ot find 58 | - min_peak: min peak frequency (Hz) <- NOT IMPLEMENTED 59 | - max_peak: max peak frequency (Hz) 60 | - min_fund: min fundamental freq 61 | - max_fund: max fundamental freq 62 | - harm_offset: deprecated 63 | - harm_rolloff: deprecated (use if npartial is None) 64 | - ml_width: cents range to include peak in maximum likliehood 65 | - bounce_width: bounce freqs w/in this +/- this range from bounce_ratios 66 | - bounce_hist: bounce freqs from previous frames as well 67 | - bounce_ratios: bounce freqs w/in +/- bounce_width of these ratios 68 | - max_harm: basically deprecated, don't use harms above this one 69 | - npartial: partial that is half weight 70 | returns: 71 | - pitches 72 | - STFT 73 | - peaks 74 | todo: 75 | -> stats for how much bounced 76 | notes 77 | - changing to npartial instead of harm_rolloff, 78 | but if npartial=None, use harm_rolloff 79 | - previously was nearest_multiple+1 # to avoid divide by 0 80 | """ 81 | 82 | # other params FUCKING: (for now) 83 | 84 | # ------------------------------------------------------------------------- # 85 | # go to time-freq 86 | # ------------------------------------------------------------------------- # 87 | 88 | if_gram, D = librosa.core.ifgram(y, sr=sr, n_fft=n_fft, 89 | win_length=win_length,hop_length=hop_length) 90 | 91 | # ------------------------------------------------------------------------- # 92 | # find peaks 93 | # ------------------------------------------------------------------------- # 94 | 95 | peak_thresh = 1e-3 96 | 97 | # bins (freq range) to search 98 | min_bin = 2 99 | max_bin = freq_to_bin(float(max_peak), sr, n_fft) 100 | # //--> [make sure we have room either side for peak picking] 101 | 102 | # npartial 103 | if npartial is not None: 104 | harm_rolloff = math.log(2, float(npartial)) 105 | 106 | # things we know 107 | num_bins, num_frames = if_gram.shape 108 | 109 | # store pitches here 110 | pitches = np.zeros([num_frames, num_pitches]) # pitch bins 111 | peaks = np.zeros([num_frames, num_peaks]) # top20 peaks 112 | fundamentals = np.zeros([num_frames, num_pitches]) # funds (least squares) 113 | confidences = np.zeros([num_frames, num_pitches]) # confidence scores 114 | 115 | # loop through frames 116 | for i in range(num_frames): 117 | 118 | # grab frequency, magnitudes, total power 119 | frqs = if_gram[:,i] 120 | mags = np.abs(D[:,i]) 121 | total_power = mags.sum() 122 | max_amp = np.max(mags) 123 | 124 | # neighbor bins 125 | lower = mags[(min_bin-1):(max_bin)] 126 | middle = mags[(min_bin) :(max_bin+1)] 127 | upper = mags[(min_bin+1):(max_bin+2)] 128 | 129 | # all local peaks 130 | peaks_mask_all = (middle > lower) & (middle > upper) 131 | 132 | # resize mask to dimensions of im_gram 133 | zeros_left = np.zeros(min_bin) 134 | zeros_right = np.zeros(num_bins - min_bin - max_bin + 1) 135 | peaks_mask_all = np.concatenate((zeros_left, peaks_mask_all, zeros_right)) 136 | 137 | # find the first (at most) 138 | peaks_mags_all = peaks_mask_all * mags 139 | top20 = np.argsort(peaks_mags_all)[::-1][0:num_peaks] 140 | 141 | # extract just the peaks (freqs and normed mags) 142 | peaks_frqs = frqs[top20] 143 | peaks_mags = mags[top20] 144 | 145 | # note number of peaks found 146 | num_peaks_found = top20.shape[0] 147 | 148 | # 
----------------------------------------------------------------------- # 149 | # maximum liklihood 150 | # ----------------------------------------------------------------------- # 151 | 152 | # range we'll consider for fundamentals 153 | min_freq = float(min_fund) 154 | max_freq = float(max_fund) 155 | 156 | # from min_bin to max_bin in 48ths of an octave (1/4 tones) 157 | 158 | # [1] bin index to frequency min_freq to max_freq in 48ths of an octave 159 | def b2f(index): return min_freq * np.power(np.power(2, 1.0/48.0), index) 160 | 161 | # [2] max_histo_bin is the bin at max_freq (FUCKING: rounds down?) 162 | max_histo_bin = int(math.log(max_freq / min_freq, math.pow(2, 1.0/48.0))) 163 | 164 | # now, generate them 165 | histo = np.fromfunction(lambda x,y: b2f(y), (num_peaks_found, max_histo_bin)) 166 | 167 | frqs_tile = np.tile(peaks_frqs, (max_histo_bin,1)).transpose() 168 | mags_tile = np.tile(peaks_mags, (max_histo_bin,1)).transpose() 169 | 170 | # likelihood function for each bin frequency 171 | def ml_a(amp): 172 | """a factor depending on the amplitude of the ith peak""" 173 | return np.sqrt(np.sqrt(amp/max_amp)) 174 | # return np.sqrt(np.sqrt(amp)) 175 | # return np.ones_like(amp) 176 | # return amp 177 | 178 | def ml_t(r1, r2): 179 | """how closely the ith peak is tuned to a multiple of f""" 180 | max_dist = ml_width # cents 181 | cents = np.abs(ratio_to_cents_protected(r1,r2)) 182 | dist = np.clip(1.0 - (cents / max_dist), 0, 1) 183 | return dist 184 | 185 | def ml_i(nearest_multiple): 186 | """whether the peak is closest to a high or low multiple of f""" 187 | out = np.zeros_like(nearest_multiple) 188 | out[nearest_multiple.nonzero()] = 1/np.power(nearest_multiple[nearest_multiple.nonzero()], harm_rolloff) 189 | return out 190 | # return 1/np.power(np.clip(nearest_multiple + harm_offset, 1, 32767), harm_rolloff) * (nearest_multiple <= max_harm) 191 | # return 1/np.power(nearest_multiple+1, 2) 192 | # return np.ones_like(nearest_multiple) 193 | 194 | ml = (ml_a(mags_tile) * \ 195 | ml_t((frqs_tile/histo), (frqs_tile/histo).round()) * \ 196 | ml_i((frqs_tile/histo).round())).sum(axis=0) 197 | 198 | # for debugging 199 | # ml_hat = (ml_a(mags_tile) * \ 200 | # ml_t((frqs_tile/histo), (frqs_tile/histo).round()) * \ 201 | # ml_i((frqs_tile/histo).round())) 202 | 203 | 204 | """super rough hack but work! 
205 | bring this in as a control 206 | only tests in one direction (new is higher), adjust so ratio goes both ways 207 | what about which harms we're testing for should that be variable too 208 | """ 209 | 210 | num_found = 0 211 | maybe = 0 212 | found_so_far = [] 213 | bounce_list = list(np.ravel(pitches[i-bounce_hist:i])) 214 | # bounce_ratios = [1,2,3,4,5,6] 215 | bounce_ratios = [1.0, 2.0, 3.0/2.0, 3.0, 4.0] if bounce_ratios is None else bounce_ratios 216 | prev_frame = list(pitches[i-1]) if i>0 else [] 217 | indices = ml.argsort()[::-1] 218 | while num_found < num_pitches and maybe <= ml.shape[0]: 219 | this_one = b2f(indices[maybe]) 220 | # check bounce with this frame 221 | bounce1 = any([any([abs(ratio_to_cents( 222 | this_one/(other_one*harm))) < bounce_width 223 | for harm in bounce_ratios]) 224 | for other_one in bounce_list]) 225 | bounce2 = any([any([abs(ratio_to_cents 226 | (other_one/(this_one*harm))) < bounce_width 227 | for harm in bounce_ratios]) 228 | for other_one in bounce_list]) 229 | # if any bounces 230 | bounce = bounce1 or bounce2 231 | if not bounce: 232 | #print 'notbounce' 233 | found_so_far += [this_one] 234 | bounce_list += [this_one] 235 | num_found += 1 236 | # if bounce: print 'bounce!!!!' 237 | maybe += 1 238 | 239 | indices = ml.argsort()[::-1][0:num_pitches] 240 | pitches[i] = np.array(found_so_far) 241 | confidences[i] = ml[indices] 242 | peaks[i] = peaks_frqs 243 | 244 | 245 | # indices = ml.argsort()[::-1][0:num_pitches] 246 | # pitches[i] = b2f(indices) 247 | # confidences[i] = ml[indices] 248 | # peaks[i] = peaks_frqs 249 | 250 | # ----------------------------------------------------------------------- # 251 | # estimate fundamental (least squares) 252 | # ----------------------------------------------------------------------- # 253 | 254 | """estimate fundamental (least squares) 255 | here we solve a least squares approximation to give a more precise 256 | estimate of the fundamental within a histogram bin. 257 | 258 | least squares solution :: WAx ~ Wb 259 | 260 | A = matrix of harmonic integers (num_peaks x 1) 261 | b = matrix of actual frequencies (num_peaks x 1) 262 | W = matrix of weights (on digaonal, num_peaks x num_peaks) 263 | x = fundamental (singletone matrix) 264 | 265 | """ 266 | 267 | ml_peaks = pitches[i] # regions strong ml 268 | width = 25 # count peaks w/in 25 cents 269 | 270 | frame_fundamentals = [] 271 | for bin_frq in ml_peaks: 272 | 273 | # vector of nearest harmonics 274 | nearest_harmonic = (peaks_frqs/bin_frq).round() 275 | 276 | # mask in range? 
vector of bools 277 | mask = np.abs(ratio_to_cents_protected( 278 | (peaks_frqs/bin_frq), (peaks_frqs/bin_frq).round())) <= width 279 | 280 | # weight (same harmonic weight as above) 281 | weights = ml_i( (peaks_frqs/bin_frq).round() ) 282 | 283 | # build matrices 284 | A = np.matrix(nearest_harmonic).T 285 | b = np.matrix(peaks_frqs).T 286 | W = np.matrix(np.diag(mask * weights)) 287 | 288 | # do least squares 289 | fund = np.linalg.lstsq(W*A, W*b)[0][0].item() 290 | 291 | # append 292 | frame_fundamentals += [fund] 293 | 294 | fundamentals[i] = np.array(frame_fundamentals) 295 | 296 | # ------------------------------------------------------------------------- # 297 | # pitch tracks 298 | # ------------------------------------------------------------------------- # 299 | 300 | tracks = np.copy(pitches) 301 | 302 | return fundamentals, pitches, D, peaks, confidences, tracks 303 | 304 | 305 | # --------------------------------------------------------------------------- # 306 | # extras 307 | # --------------------------------------------------------------------------- # 308 | 309 | def voice_tracks(pitches, confidences): 310 | 311 | # alloc output data 312 | out = np.zeros_like(pitches) 313 | out[0] = pitches[0] 314 | 315 | out_confidences = np.zeros_like(confidences) 316 | out_confidences[0] = confidences[0] 317 | 318 | num_frames, num_voices = pitches.shape 319 | 320 | # loop through frames 321 | for i in range(1,num_frames): 322 | 323 | # setup 324 | prev_frame = out[i-1] 325 | next_frame = pitches[i] 326 | 327 | # pairwise distances indexed [from, to] 328 | delta = np.abs(ratio_to_cents(np.atleast_2d(prev_frame).T/np.repeat( 329 | np.atleast_2d(next_frame),num_voices,axis=0))) 330 | # delta = np.abs(ratio_to_cents_protected(np.atleast_2d(prev_frame).T, 331 | # np.repeat(np.atleast_2d(next_frame),num_voices,axis=0))) 332 | 333 | # indices of sorted delta [from, to] 334 | delta_sorted = np.unravel_index(np.argsort(delta.ravel()), delta.shape) 335 | 336 | # step through and reject if either prev or next is already found 337 | num_found = 0; found_prevs = []; found_nexts = []; index = 0 338 | 339 | while num_found < num_voices: 340 | prev_voice,next_voice = delta_sorted[0][index], delta_sorted[1][index] 341 | if prev_voice not in found_prevs and next_voice not in found_nexts: 342 | out[i][prev_voice] = pitches[i][next_voice] 343 | out_confidences[i][prev_voice] = confidences[i][next_voice] 344 | found_prevs += [prev_voice] 345 | found_nexts += [next_voice] 346 | num_found += 1 347 | index += 1 348 | 349 | return out, out_confidences 350 | 351 | # --------------------------------------------------------------------------- # 352 | 353 | """pitchsets. 354 | this is slightly ineffient b/c we filter twice 355 | 1. first for change (vibrato depth) 356 | 2. then for sustain (vibrato length) 357 | 3. whatever is left are ampsets 358 | - it could all be done in one pass, but this makes it easier to swap in 359 | change measures (absolute vs relative grid) and sustain measures 360 | * currently pitchsets are detected by voice but reported by frame to be 361 | consistent with ampsets. 362 | 363 | -> this should change. 364 | """ 365 | 366 | def pitchsets(pitches, win=3): 367 | 368 | # 1. filter for change on an asbolute 1/2-tone grid (depth) 369 | pitches_change = change_filter(pitches) 370 | 371 | # 2. filter for sustain (length) 372 | pitches_sustain = sustain_filter(pitches_change, win=win) 373 | 374 | # 3. 
what is left are pitchsets 375 | pitchsets = np.sort(np.array(list(set(np.argwhere(np.nan_to_num(pitches_sustain))[:,0])))) 376 | 377 | return pitchsets, np.where(np.nan_to_num(pitches_sustain)) 378 | 379 | # --------------------------------------------------------------------------- # 380 | 381 | """filter for change. 382 | * filter for change on grid rounded to nearest 1/2 tone 383 | * non-change pitches are changed to 0's 384 | -> maybe this should be np.nan's instead 385 | """ 386 | 387 | def change_filter(pitches): 388 | 389 | # copy working data 390 | wpitches = np.copy(pitches) 391 | 392 | # alloc output data 393 | out = np.zeros_like(wpitches) 394 | out[out==0] = np.nan 395 | 396 | # convert to midi pitch and round 397 | wpitches = freq_to_midi(wpitches).round() 398 | 399 | # indices of pitch change 400 | indices = np.diff(wpitches, axis=0).nonzero() 401 | 402 | # copy pitch values from original data where change 403 | out[0] = pitches[0] 404 | out[1:][indices] = pitches[1:][indices] 405 | 406 | return out 407 | 408 | # --------------------------------------------------------------------------- # 409 | 410 | """filter for sustain. 411 | * say something here: wish could be numpy... 412 | """ 413 | 414 | def sustain_filter(pitches, win=3): 415 | 416 | # alloc output data 417 | out = np.zeros_like(pitches) 418 | 419 | # loop through frames and pitches 420 | for i in range(pitches.shape[0]-win): 421 | for j in range(pitches.shape[1]): 422 | 423 | # keep pitch if win after are nan 424 | out[i,j] = pitches[i,j] * np.isnan(pitches[i+1:i+win,j]).all() 425 | 426 | # convert 0 to nan 427 | out[out==0] = np.nan 428 | 429 | return out 430 | 431 | # --------------------------------------------------------------------------- # 432 | 433 | """pitch tracks 434 | here we parse the raw frequencies into pitch tracks. currently, we do this 435 | by minimizing the abs of the frame to frame difference, but we could use 436 | any feature, such as distance from the running mean, variance, etc. 437 | todo: --> implement more pitch track options 438 | 439 | """ 440 | 441 | # move through the frames 442 | # have pitch tracks ready to go 443 | # for each num_pitches put in track with nearest frame to frame difference 444 | # note: what if the pitch confidence is low, might want to put track to sleep... 445 | # or if it's too far from anything just for a sec 446 | # or more sophisticated for like minimizing total error... 
447 | # here we do it dumb way from strong to weak closest match (pref to first) 448 | # tracks = np.zeros([num_frames, num_pitches]) # confidence scores 449 | 450 | # tracks[0] = fundamentals[0] 451 | 452 | # tracks[0] = fundamentals[0] 453 | # for i in range(1, num_frames): 454 | # prev_tracks = list(tracks[i-1]) 455 | # new_funds = list(fundamentals[i]) 456 | # for j in range(num_pitches): 457 | # wewant = min(new_funds, key=lambda x: abs(x-prev_tracks[j])) 458 | # index = new_funds.index(wewant) 459 | # new_funds.pop(index) 460 | # tracks[i,j]= wewant 461 | def pitch_onsets(funds, win=2, depth=25): 462 | 463 | onsets = np.zeros_like(funds) 464 | for i in range(funds.shape[0]): 465 | for j in range(funds.shape[1]): 466 | onsets[i,j] = (np.abs(pyfid.ratio_to_cents(funds[i-win:i,:] / funds[i,j])) <= thresh).any(axis=1).all() 467 | 468 | return onsets -------------------------------------------------------------------------------- /code.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.ticker as ticker 5 | import math 6 | import os 7 | import pickle 8 | import models 9 | import csv 10 | 11 | import calculate_features as cf 12 | import models 13 | 14 | np.random.seed(200) 15 | 16 | 17 | def format_time(x, pos=None): 18 | global duration, nframes, k 19 | progress = int(x / float(nframes) * duration * k) 20 | mins, secs = divmod(progress, 60) 21 | hours, mins = divmod(mins, 60) 22 | out = "%d:%02d" % (mins, secs) 23 | if hours > 0: 24 | out = "%d:" % hours 25 | return out 26 | 27 | def format_db(x, pos=None): 28 | if pos == 0: 29 | return "" 30 | global peak 31 | if x == 0: 32 | return "-inf" 33 | 34 | db = 20 * math.log10(abs(x) / float(peak)) 35 | return int(db) 36 | 37 | types = { 38 | 1: np.int8, 39 | 2: np.int16, 40 | 4: np.int32 41 | } 42 | 43 | 44 | def open_wav(path_to_wav, filename): 45 | wav = wave.open(path_to_wav + filename, mode="r") 46 | (nchannels, sampwidth, framerate, nframes, comptype, compname) = wav.getparams() 47 | content = wav.readframes(nframes) 48 | 49 | samples = np.fromstring(content, dtype=types[sampwidth]) 50 | return (nchannels, sampwidth, framerate, nframes, comptype, compname), samples 51 | 52 | 53 | def get_all_files(path_to_wav): 54 | files = os.listdir(path_to_wav) 55 | return files 56 | 57 | def get_emotions(path_to_emotions, filename): 58 | f = open(path_to_emotions + filename, 'r').read() 59 | # print f 60 | # print f.split('\n') 61 | f = np.array(f.split('\n'))#np.append(np.array(['']), np.array(f.split('\n'))) 62 | c = 0 63 | idx = f == '' 64 | idx_n = np.arange(len(f))[idx] 65 | emotion = [] 66 | for i in xrange(len(idx_n) - 2): 67 | g = f[idx_n[i]+1:idx_n[i+1]] 68 | head = g[0] 69 | i0 = head.find(' - ') 70 | start_time = float(head[head.find('[') + 1:head.find(' - ')]) 71 | end_time = float(head[head.find(' - ') + 3:head.find(']')]) 72 | actor_id = head[head.find(filename[:-4]) + len(filename[:-4]) + 1:head.find(filename[:-4]) + len(filename[:-4]) + 5] 73 | emo = head[head.find('\t[') - 3:head.find('\t[')] 74 | vad = head[head.find('\t[') + 1:] 75 | 76 | v = float(vad[1:7]) 77 | a = float(vad[9:15]) 78 | d = float(vad[17:23]) 79 | 80 | emotion.append({'start':start_time, 81 | 'end':end_time, 82 | 'id':filename[:-4] + '_' + actor_id, 83 | 'v':v, 84 | 'a':a, 85 | 'd':d, 86 | 'emotion':emo}) 87 | return emotion 88 | 89 | 90 | def split_wav(wav, emotion): 91 | (nchannels, sampwidth, framerate, nframes, comptype, compname), 
samples = wav 92 | duration = nframes / framerate 93 | 94 | left = samples[0::nchannels] 95 | right = samples[1::nchannels] 96 | 97 | frames = [] 98 | for ie, e in enumerate(emotions): 99 | start = e['start'] 100 | end = e['end'] 101 | 102 | e['right'] = right[int(start * framerate):int(end * framerate)] 103 | e['left'] = left[int(start * framerate):int(end * framerate)] 104 | 105 | frames.append({'left':e['left'], 'right': e['right']}) 106 | return frames 107 | 108 | available_emotions = ['ang', 'exc', 'fru', 'neu', 'sad'] 109 | 110 | def get_transcriptions(path_to_transcriptions, filename): 111 | f = open(path_to_transcriptions + filename, 'r').read() 112 | f = np.array(f.split('\n')) 113 | transcription = {} 114 | 115 | for i in xrange(len(f) - 1): 116 | g = f[i] 117 | i1 = g.find(': ') 118 | i0 = g.find(' [') 119 | ind_id = g[:i0] 120 | ind_ts = g[i1+2:] 121 | transcription[ind_id] = ind_ts 122 | return transcription 123 | 124 | 125 | 126 | data = [] 127 | sessions = ['session1', 'session2', 'session3', 'session4', 'session5'] 128 | for session in sessions: 129 | print session 130 | path_to_wav = 'D:/emotion recognition/' + session + '/dialog/wav/' 131 | path_to_pics = 'D:/emotion recognition/pics/' 132 | path_to_emotions = 'D:/emotion recognition/' + session + '/dialog/EmoEvaluation/' 133 | path_to_transcriptions = 'D:/emotion recognition/' + session + '/dialog/transcriptions/' 134 | 135 | files = get_all_files(path_to_wav) 136 | files = [f[:-4] for f in files] 137 | print len(files) 138 | print files 139 | for f in files: 140 | emotions = get_emotions(path_to_emotions, f + '.txt') 141 | wav = open_wav(path_to_wav, f + '.wav') 142 | sample = split_wav(wav, emotions) 143 | 144 | transcriptions = get_transcriptions(path_to_transcriptions, f + '.txt') 145 | for ie, e in enumerate(emotions): 146 | if e['emotion'] in available_emotions: 147 | e['left'] = sample[ie]['left'] 148 | e['right'] = sample[ie]['right'] 149 | e['transcription'] = transcriptions[e['id']] 150 | data.append(e) 151 | 152 | def save_sample(x, y, name): 153 | with open(name, 'w') as csvfile: 154 | w = csv.writer(csvfile, delimiter=',') 155 | for i in xrange(x.shape[0]): 156 | row = x[i, :].tolist() 157 | row.append(y[i]) 158 | w.writerow(row) 159 | 160 | def load(name): 161 | with open(name, 'r') as csvfile: 162 | r = csv.reader(csvfile, delimiter=',') 163 | x = [] 164 | y = [] 165 | for row in r: 166 | x.append(row[:-1]) 167 | y.append(row[-1]) 168 | return np.array(x, dtype=float), np.array(y) 169 | 170 | def get_field(data, key): 171 | return np.array([e[key] for e in data]) 172 | 173 | 174 | def split_sample(n, train_part=0.8): 175 | ids_idx = np.random.permutation(len(ids)) 176 | step = int(n * (1 - train_part)) 177 | samples = [] 178 | for i in xrange(n / step): 179 | ids_train = ids[ids_idx[:int(train_part*len(ids)) ]] 180 | ids_test = ids[ids_idx[int(train_part*len(ids)):]] 181 | samples.append((ids_train, ids_test)) 182 | return samples 183 | 184 | 185 | def get_features(data, save=True, path='samples/l_', mode='calculate'): 186 | if mode == 'calculate': 187 | for di, d in enumerate(data): 188 | print di, ' out of ', len(data) 189 | st_features = cf.calculate_features(d['left'], None).T 190 | x = [] 191 | y = [] 192 | for f in st_features: 193 | if f[1] > 1.e-4: 194 | x.append(f) 195 | y.append(d['emotion']) 196 | x = np.array(x, dtype=float) 197 | y = np.array(y) 198 | if save: 199 | save_sample(x, y, path + d['id'] + '.csv') 200 | return x, y 201 | 202 | 203 | path_to_probas = 'D:/emotion recognition/probas/' 
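# After a few plotting/helper definitions, the cross-validation loop below implements a
# two-level (stacked) scheme with 5-fold cross-validation: a frame-level MLP
# (models.train_mpc) is trained on the level-1 folds, its per-frame class outputs (called
# 'probs' here) are averaged over level2_parts temporal segments of each utterance to form
# the level-2 feature vector, a random forest (models.train_rfc) is trained on those
# vectors, and the stack is evaluated on the held-out validation ids; a class-to-class
# precision heatmap is saved at the end (level2_precs.png).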
204 | def plot_probas_emotions(probas, true_emo, filename): 205 | colors = ['r', 'g', 'b', 'yellow', 'black', 'magenta'] 206 | plt.figure(figsize=(16, 12)) 207 | plt.title('True emotion: ' + true_emo, fontsize=50) 208 | for j in xrange(probas.shape[1]): 209 | plt.plot(0.2*np.arange(probas.shape[0]), probas[:, j], color=colors[j], linewidth=3) 210 | 211 | plt.legend(available_emotions, fontsize=20) 212 | plt.ylim(ymax=1, ymin=0) 213 | plt.savefig(path_to_probas + filename + '.png') 214 | plt.close() 215 | 216 | from keras.utils import np_utils 217 | def to_categorical(y): 218 | y_cat = np.zeros((len(y), len(available_emotions)), dtype=int) 219 | for i in xrange(len(y)): 220 | y_cat[i, :] = np.array(np.array(available_emotions) == y[i], dtype=int) 221 | 222 | return y_cat 223 | 224 | 225 | def check_all_finite(X): 226 | X = np.asanyarray(X) 227 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 228 | and not np.isfinite(X).all()): 229 | return True 230 | else: 231 | return False 232 | 233 | data = np.array(data) 234 | print data 235 | print len(data) 236 | 237 | x, y = get_features(data) 238 | 239 | ids = get_field(data, 'id') 240 | emotions = get_field(data, 'emotion') 241 | 242 | for i in xrange(len(available_emotions)): 243 | print available_emotions[i], emotions[emotions == available_emotions[i]].shape[0] 244 | 245 | 246 | parts = 5 247 | 248 | 249 | permutation = np.random.permutation(len(data)) 250 | permuted_ids = ids[permutation] 251 | 252 | step = len(data) / parts 253 | 254 | preds = [] 255 | trues = [] 256 | 257 | for part in xrange(parts): 258 | if part >= 0: 259 | print 'Validation part: ', part 260 | i0 = step * part 261 | i1 = step * (part + 1) 262 | 263 | level1_ids = np.append(permuted_ids[:i0], permuted_ids[i1:]) 264 | validation_ids = permuted_ids[i0:i1] 265 | 266 | validation_x = {} 267 | validation_y = [] 268 | 269 | for i in validation_ids: 270 | x, y = load('samples/l_' + i + '.csv') 271 | validation_x[i] = x 272 | validation_y.append(y[0]) 273 | validation_y = np.array(validation_y) 274 | 275 | index = 0 276 | level2_parts = 3 277 | level2_x = np.zeros((len(data) - len(validation_ids), level2_parts * len(available_emotions)), dtype=np.float32) 278 | level2_y = [] 279 | 280 | for j in xrange(parts - 1): 281 | print 'Level 1 part: ', j 282 | j0 = step * j 283 | j1 = step * (j + 1) 284 | 285 | train_ids = np.append(level1_ids[:j0], level1_ids[j1:]) 286 | test_ids = level1_ids[j0:j1] 287 | 288 | train_x = [] 289 | train_y = [] 290 | test_x = {} 291 | test_y = {} 292 | 293 | for i in level1_ids: 294 | x, y = load('samples/l_' + i + '.csv') 295 | if i in train_ids: 296 | for s in xrange(x.shape[0]): 297 | train_x.append(x[s, :]) 298 | train_y.append(y[s]) 299 | if i in test_ids: 300 | test_x[i] = x 301 | test_y[i] = y[0] 302 | 303 | train_x = np.array(train_x, dtype=float) 304 | train_y = np.array(train_y) 305 | 306 | train_y_cat = to_categorical(train_y) 307 | 308 | load_models = False 309 | if load_models: 310 | classifiers = pickle.load(open('model' + str(part) + str(j) + '.pkl', 'r')) 311 | else: 312 | #classifiers = models.train_rfc(train_x, train_y, None) 313 | classifiers = models.train_mpc(train_x, train_y_cat, None) 314 | #pickle.dump(classifiers, open('model' + str(part) + str(j) + '.pkl', 'w')) 315 | 316 | for i in test_ids: 317 | probs = classifiers.predict(test_x[i]) 318 | for m in xrange(level2_parts): 319 | m0 = probs.shape[0] * m / level2_parts 320 | m1 = probs.shape[0] * (m + 1) / level2_parts 321 | 322 | f = np.mean(probs[m0:m1, 
:], axis=0) 323 | for fi, ff in enumerate(f): 324 | if np.isnan(ff): 325 | f[fi] = 1./float(len(f)) 326 | level2_x[index, m * len(f):(m + 1) * len(f)] = f 327 | level2_y.append(test_y[i]) 328 | 329 | print test_x[i] 330 | print probs 331 | print level2_x[index, :] 332 | print level2_y[index] 333 | print 334 | print 335 | index += 1 336 | 337 | #level2_x = level2_x[:len(level2_y), :] 338 | level2_y = np.array(level2_y) 339 | print level2_x.shape, level2_y.shape 340 | print np.sum(level2_x) 341 | print check_all_finite(level2_x), check_all_finite(level2_y) 342 | print np.unique(level2_y) 343 | 344 | # for i in xrange(level2_x.shape[0]): 345 | # print i, level2_x[i, :], level2_y[i] 346 | level2_x = np.array(level2_x, dtype=float) 347 | if load_models: 348 | level2_classifiers = pickle.load(open('level2_model' + str(part) + '.pkl', 'r')) 349 | else: 350 | level2_classifiers = models.train_rfc(level2_x, level2_y, None) 351 | #pickle.dump(level2_classifiers, open('level2_model' + str(part) + '.pkl', 'w')) 352 | 353 | level1_train_x = [] 354 | level1_train_y = [] 355 | 356 | for i in level1_ids: 357 | x, y = load('samples/l_' + i + '.csv') 358 | for s in xrange(x.shape[0]): 359 | level1_train_x.append(x[s, :]) 360 | level1_train_y.append(y[s]) 361 | level1_train_x = np.array(level1_train_x, dtype=float) 362 | level1_train_y = np.array(level1_train_y) 363 | 364 | 365 | level1_train_y_cat = to_categorical(level1_train_y) 366 | if load_models: 367 | level1_classifiers = pickle.load(open('level1_model' + str(part) + '.pkl', 'r')) 368 | else: 369 | level1_classifiers = models.train_mpc(level1_train_x, level1_train_y_cat, None) 370 | #pickle.dump(level1_classifiers, open('level1_model' + str(part) + '.pkl', 'w')) 371 | 372 | 373 | 374 | level2_validation_x = np.zeros((len(validation_ids), level2_parts * len(available_emotions)), dtype=float) 375 | for ii, i in enumerate(validation_ids): 376 | probs = level1_classifiers.predict(validation_x[i]) 377 | for m in xrange(level2_parts): 378 | m0 = probs.shape[0] * m / level2_parts 379 | m1 = probs.shape[0] * (m + 1) / level2_parts 380 | 381 | f = np.mean(probs[m0:m1, :], axis=0) 382 | for fi, ff in enumerate(f): 383 | if np.isnan(ff): 384 | f[fi] = 1./float(len(f)) 385 | level2_validation_x[ii, m * len(f):(m + 1) * len(f)] = f 386 | 387 | 388 | 389 | prediction = level2_classifiers.predict(level2_validation_x) 390 | # print pred_probas 391 | # prediction = [] 392 | # for i in xrange(pred_probas.shape[0]): 393 | # prediction.append(available_emotions[np.argmax(pred_probas[i, :])]) 394 | # prediction = np.array(prediction) 395 | 396 | print prediction[prediction == validation_y].shape[0] / float(prediction.shape[0]) 397 | 398 | for i in xrange(len(prediction)): 399 | preds.append(prediction[i]) 400 | trues.append(validation_y[i]) 401 | 402 | class_to_class_precs = np.zeros((len(available_emotions), len(available_emotions)), dtype=float) 403 | 404 | preds = np.array(preds) 405 | trues = np.array(trues) 406 | 407 | for cpi, cp in enumerate(available_emotions): 408 | for cti, ct in enumerate(available_emotions): 409 | #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 410 | if trues[trues == ct].shape[0] > 0: 411 | class_to_class_precs[cti, cpi] = preds[(preds == cp) * (trues == ct)].shape[0] / float(trues[trues == ct].shape[0]) 412 | else: 413 | class_to_class_precs[cti, cpi] = 0. 
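# NOTE (added, illustrative only): class_to_class_precs is the row-normalised confusion matrix of
# the cross-validation above: each row corresponds to a true emotion, each column to a predicted
# emotion, and every row is divided by the number of true examples of that class, so the diagonal
# holds per-class recall. For example, with hypothetical labels
#   trues = ['ang', 'ang', 'ang', 'sad'] and preds = ['ang', 'neu', 'ang', 'sad'],
# the 'ang' row would contain 2/3 under 'ang', 1/3 under 'neu' and 0 elsewhere.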
414 | 415 | 416 | fig, ax = plt.subplots() 417 | heatmap = ax.pcolor(class_to_class_precs, cmap=plt.cm.Blues) 418 | 419 | ax.set_xticklabels(available_emotions, minor=False) 420 | ax.set_yticklabels(available_emotions, minor=False) 421 | 422 | ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 423 | ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 424 | 425 | for i in xrange(len(available_emotions)): 426 | for j in xrange(len(available_emotions)): 427 | plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 428 | 429 | plt.savefig('level2_precs.png') 430 | plt.close() 431 | 432 | # samples = split_sample(len(ids)) 433 | # precs = np.zeros(len(samples)) 434 | 435 | # class_to_class_precs = np.zeros((len(samples), len(available_emotions), len(available_emotions)), dtype=float) 436 | 437 | # load_models = True 438 | 439 | # level2_parts = 3 440 | # level2_sample = np.zeros((len(data), level2_parts * len(available_emotions)), dtype=float) 441 | # level2_emotions = [] 442 | 443 | 444 | # index = 0 445 | # for it, (ids_train, ids_test) in enumerate(samples): 446 | 447 | # train_x = [] 448 | # test_x = [] 449 | # train_y = [] 450 | # test_y = [] 451 | 452 | # test_x2 = {} 453 | # test_y2 = {} 454 | 455 | # for i in ids: 456 | # x, y = load('samples/l_' + i + '.csv') 457 | # if i in ids_train: 458 | # for j in xrange(x.shape[0]): 459 | # train_x.append(x[j, :]) 460 | # train_y.append(y[j]) 461 | # if i in ids_test: 462 | # test_x2[i] = x 463 | # test_y2[i] = y 464 | # for j in xrange(x.shape[0]): 465 | # test_x.append(x[j, :]) 466 | # test_y.append(y[j]) 467 | 468 | # train_x = np.array(train_x, dtype=float) 469 | # train_y = np.array(train_y) 470 | # test_x = np.array(test_x, dtype=float) 471 | # test_y = np.array(test_y) 472 | 473 | # if load_models: 474 | # classifiers = pickle.load(open('model' + str(it) + '.pkl', 'r')) 475 | # else: 476 | # classifiers = models.train1(train_x, train_y, None) 477 | # pickle.dump(classifiers, open('model' + str(it) + '.pkl', 'w')) 478 | # # classifiers = pickle.load(open('model.pkl', 'r')) 479 | 480 | # for ii, i in enumerate(ids_test): 481 | # print test_x2[i].shape, i 482 | # probas = classifiers.predict_proba(test_x2[i]) 483 | 484 | # plot_probas_emotions(probas, test_y2[i][0], str(i)) 485 | 486 | # k = probas.shape[0] 487 | # l = len(available_emotions) 488 | # for j in xrange(level2_parts): 489 | # i0 = k * j / level2_parts 490 | # i1 = k * (j + 1) / level2_parts 491 | 492 | # f = np.mean(probas[i0:i1, :], axis=0) 493 | # level2_sample[index, j * l:(j + 1) * l] = f 494 | # level2_emotions.append(test_y2[i][0]) 495 | # # print level2_sample[index, :], level2_emotions[index] 496 | # index += 1 497 | 498 | # level2_emotions = np.array(level2_emotions) 499 | 500 | # print len(data) 501 | # print level2_sample.shape 502 | # print level2_emotions.shape 503 | 504 | # with open('level2_sample.csv', 'w') as csvfile: 505 | # w = csv.writer(csvfile, delimiter=',') 506 | # for i in xrange(level2_sample.shape[0]): 507 | # d = level2_sample[i, :].tolist() 508 | # d.append(level2_emotions[i]) 509 | # w.writerow(d) 510 | 511 | # #=============================================================================== 512 | # emo_test = [] 513 | # emo_pred = [] 514 | # j = 0 515 | # for ii, i in enumerate(ids_test): 516 | # try: 517 | # predictions_frames = classifiers.predict(test_x2[i]) 518 | # print classifiers.predict_proba(test_x2[i]).shape 519 | # print classifiers.predict_proba(test_x2[i]) 520 | # ans = 
np.zeros(len(available_emotions)) 521 | # for ie, e in enumerate(available_emotions): 522 | # ans[ie] = predictions_frames[predictions_frames == e].shape[0] 523 | 524 | # emo = available_emotions[np.argmax(ans)] 525 | # emo_test.append(test_y2[i][0]) 526 | # emo_pred.append(emo) 527 | # except: 528 | # j += 1 529 | # print ii, j 530 | # emo_test = np.array(emo_test) 531 | # emo_pred = np.array(emo_pred) 532 | 533 | # print emo_pred.shape, emo_test.shape 534 | 535 | # print emo_test[emo_test == emo_pred].shape[0] / float(emo_test.shape[0]) 536 | # precs[it] = emo_test[emo_test == emo_pred].shape[0] / float(emo_test.shape[0]) 537 | 538 | # for cpi, cp in enumerate(available_emotions): 539 | # for cti, ct in enumerate(available_emotions): 540 | # #print cp, ct, emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0], emo_test[emo_test == ct].shape[0] 541 | # if emo_test[emo_test == ct].shape[0] > 0: 542 | # class_to_class_precs[it, cti, cpi] = emo_pred[(emo_pred == cp) * (emo_test == ct)].shape[0] / float(emo_test[emo_test == ct].shape[0]) 543 | # else: 544 | # class_to_class_precs[it, cti, cpi] = 0. 545 | 546 | # print precs 547 | # print np.mean(precs) 548 | 549 | # class_to_class_precs = np.mean(class_to_class_precs, axis=0) 550 | 551 | # fig, ax = plt.subplots() 552 | # heatmap = ax.pcolor(class_to_class_precs, cmap=plt.cm.Blues) 553 | 554 | # ax.set_xticklabels(available_emotions, minor=False) 555 | # ax.set_yticklabels(available_emotions, minor=False) 556 | 557 | # ax.set_xticks(np.arange(len(available_emotions)) + 0.5, minor=False) 558 | # ax.set_yticks(np.arange(len(available_emotions)) + 0.5, minor=False) 559 | 560 | # for i in xrange(len(available_emotions)): 561 | # for j in xrange(len(available_emotions)): 562 | # plt.text(i+0.2, j+0.4, str(class_to_class_precs[i, j])[:5]) 563 | 564 | # plt.savefig('precs.png') 565 | # plt.close() 566 | 567 | # with open('precs.csv', 'w') as csvfile: 568 | # w = csv.writer(csvfile, delimiter=',') 569 | # d = [''] 570 | # for j in xrange(len(available_emotions)): 571 | # d.append(available_emotions[j]) 572 | 573 | 574 | # for i in xrange(len(available_emotions)): 575 | # d = [available_emotions[i]] 576 | # for j in xrange(len(available_emotions)): 577 | # d.append(class_to_class_precs[i, j]) 578 | # w.writerow(d) 579 | 580 | # #============================================================================== 581 | 582 | 583 | 584 | # ans, ans_m = models.validate1(classifiers, test_x, test_y) 585 | # print ans_m 586 | 587 | # with open('ans.csv', 'w') as csvfile: 588 | # w = csv.writer(csvfile, delimiter='\t') 589 | # for r in xrange(ans.shape[0]): 590 | # print available_emotions[r], ans_m[r, :] 591 | # w.writerow(ans_m[r, :]) 592 | 593 | 594 | 595 | 596 | 597 | # save(x, y, 'data.csv') 598 | 599 | # print np.unique(y) 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | -------------------------------------------------------------------------------- /cf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import os 4 | import glob 5 | import numpy 6 | #import mlpy 7 | import cPickle 8 | import aifc 9 | import math 10 | from numpy import NaN, Inf, arange, isscalar, array 11 | from scipy.fftpack import rfft 12 | from scipy.fftpack import fft 13 | from scipy.fftpack.realtransforms import dct 14 | from scipy.signal import fftconvolve 15 | from scipy import linalg as la 16 | #import audioTrainTest as aT 17 | #import audioBasicIO 18 | #import utilities 19 
| from scipy.signal import lfilter, hamming 20 | use_pitch = False 21 | 22 | if use_pitch: 23 | import pitch 24 | #from scikits.talkbox import lpc 25 | 26 | eps = 0.00000001 27 | 28 | """ Time-domain audio features """ 29 | 30 | 31 | def stZCR(frame): 32 | """Computes zero crossing rate of frame""" 33 | count = len(frame) 34 | countZ = numpy.sum(numpy.abs(numpy.diff(numpy.sign(frame)))) / 2 35 | return (numpy.float64(countZ) / numpy.float64(count-1.0)) 36 | 37 | 38 | def stEnergy(frame): 39 | """Computes signal energy of frame""" 40 | return numpy.sum(frame ** 2) / numpy.float64(len(frame)) 41 | 42 | 43 | def stEnergyEntropy(frame, numOfShortBlocks=10): 44 | """Computes entropy of energy""" 45 | Eol = numpy.sum(frame ** 2) # total frame energy 46 | L = len(frame) 47 | subWinLength = int(numpy.floor(L / numOfShortBlocks)) 48 | if L != subWinLength * numOfShortBlocks: 49 | frame = frame[0:subWinLength * numOfShortBlocks] 50 | # subWindows is of size [numOfShortBlocks x L] 51 | subWindows = frame.reshape(subWinLength, numOfShortBlocks, order='F').copy() 52 | 53 | # Compute normalized sub-frame energies: 54 | s = numpy.sum(subWindows ** 2, axis=0) / (Eol + eps) 55 | 56 | # Compute entropy of the normalized sub-frame energies: 57 | Entropy = -numpy.sum(s * numpy.log2(s + eps)) 58 | return Entropy 59 | 60 | 61 | """ Frequency-domain audio features """ 62 | 63 | 64 | def stSpectralCentroidAndSpread(X, fs): 65 | """Computes spectral centroid of frame (given abs(FFT))""" 66 | ind = (numpy.arange(1, len(X) + 1)) * (fs/(2.0 * len(X))) 67 | 68 | Xt = X.copy() 69 | Xt = Xt / Xt.max() 70 | NUM = numpy.sum(ind * Xt) 71 | DEN = numpy.sum(Xt) + eps 72 | 73 | # Centroid: 74 | C = (NUM / DEN) 75 | 76 | # Spread: 77 | S = numpy.sqrt(numpy.sum(((ind - C) ** 2) * Xt) / DEN) 78 | 79 | # Normalize: 80 | C = C / (fs / 2.0) 81 | S = S / (fs / 2.0) 82 | 83 | return (C, S) 84 | 85 | 86 | def stSpectralEntropy(X, numOfShortBlocks=10): 87 | """Computes the spectral entropy""" 88 | L = len(X) # number of frame samples 89 | Eol = numpy.sum(X ** 2) # total spectral energy 90 | 91 | subWinLength = int(numpy.floor(L / numOfShortBlocks)) # length of sub-frame 92 | if L != subWinLength * numOfShortBlocks: 93 | X = X[0:subWinLength * numOfShortBlocks] 94 | 95 | subWindows = X.reshape(subWinLength, numOfShortBlocks, order='F').copy() # define sub-frames (using matrix reshape) 96 | s = numpy.sum(subWindows ** 2, axis=0) / (Eol + eps) # compute spectral sub-energies 97 | En = -numpy.sum(s*numpy.log2(s + eps)) # compute spectral entropy 98 | 99 | return En 100 | 101 | 102 | def stSpectralFlux(X, Xprev): 103 | """ 104 | Computes the spectral flux feature of the current frame 105 | ARGUMENTS: 106 | X: the abs(fft) of the current frame 107 | Xpre: the abs(fft) of the previous frame 108 | """ 109 | # compute the spectral flux as the sum of square distances: 110 | sumX = numpy.sum(X + eps) 111 | sumPrevX = numpy.sum(Xprev + eps) 112 | F = numpy.sum((X / sumX - Xprev/sumPrevX) ** 2) 113 | 114 | return F 115 | 116 | 117 | def stSpectralRollOff(X, c, fs): 118 | """Computes spectral roll-off""" 119 | totalEnergy = numpy.sum(X ** 2) 120 | fftLength = len(X) 121 | Thres = c*totalEnergy 122 | # Ffind the spectral rolloff as the frequency position where the respective spectral energy is equal to c*totalEnergy 123 | CumSum = numpy.cumsum(X ** 2) + eps 124 | [a, ] = numpy.nonzero(CumSum > Thres) 125 | if len(a) > 0: 126 | mC = numpy.float64(a[0]) / (float(fftLength)) 127 | else: 128 | mC = 0.0 129 | return (mC) 130 | 131 | 132 | def 
stHarmonic(frame, fs): 133 | """ 134 | Computes harmonic ratio and pitch 135 | """ 136 | M = numpy.round(0.016 * fs) - 1 137 | R = numpy.correlate(frame, frame, mode='full') 138 | 139 | g = R[len(frame)-1] 140 | R = R[len(frame):-1] 141 | 142 | # estimate m0 (as the first zero crossing of R) 143 | [a, ] = numpy.nonzero(numpy.diff(numpy.sign(R))) 144 | 145 | if len(a) == 0: 146 | m0 = len(R)-1 147 | else: 148 | m0 = a[0] 149 | if M > len(R): 150 | M = len(R) - 1 151 | 152 | Gamma = numpy.zeros((M), dtype=numpy.float64) 153 | CSum = numpy.cumsum(frame ** 2) 154 | Gamma[m0:M] = R[m0:M] / (numpy.sqrt((g * CSum[M:m0:-1])) + eps) 155 | 156 | ZCR = stZCR(Gamma) 157 | 158 | if ZCR > 0.15: 159 | HR = 0.0 160 | f0 = 0.0 161 | else: 162 | if len(Gamma) == 0: 163 | HR = 1.0 164 | blag = 0.0 165 | Gamma = numpy.zeros((M), dtype=numpy.float64) 166 | else: 167 | HR = numpy.max(Gamma) 168 | blag = numpy.argmax(Gamma) 169 | 170 | # Get fundamental frequency: 171 | f0 = fs / (blag + eps) 172 | if f0 > 5000: 173 | f0 = 0.0 174 | if HR < 0.1: 175 | f0 = 0.0 176 | 177 | return (HR, f0) 178 | 179 | 180 | def mfccInitFilterBanks(fs, nfft): 181 | """ 182 | Computes the triangular filterbank for MFCC computation (used in the stFeatureExtraction function before the stMFCC function call) 183 | This function is taken from the scikits.talkbox library (MIT Licence): 184 | https://pypi.python.org/pypi/scikits.talkbox 185 | """ 186 | 187 | # filter bank params: 188 | lowfreq = 133.33 189 | linsc = 200/3. 190 | logsc = 1.0711703 191 | numLinFiltTotal = 13 192 | numLogFilt = 27 193 | 194 | if fs < 8000: 195 | nlogfil = 5 196 | 197 | # Total number of filters 198 | nFiltTotal = numLinFiltTotal + numLogFilt 199 | 200 | # Compute frequency points of the triangle: 201 | freqs = numpy.zeros(nFiltTotal+2) 202 | freqs[:numLinFiltTotal] = lowfreq + numpy.arange(numLinFiltTotal) * linsc 203 | freqs[numLinFiltTotal:] = freqs[numLinFiltTotal-1] * logsc ** numpy.arange(1, numLogFilt + 3) 204 | heights = 2./(freqs[2:] - freqs[0:-2]) 205 | 206 | # Compute filterbank coeff (in fft domain, in bins) 207 | fbank = numpy.zeros((nFiltTotal, nfft)) 208 | nfreqs = numpy.arange(nfft) / (1. 
* nfft) * fs 209 | 210 | for i in range(nFiltTotal): 211 | lowTrFreq = freqs[i] 212 | cenTrFreq = freqs[i+1] 213 | highTrFreq = freqs[i+2] 214 | 215 | lid = numpy.arange(numpy.floor(lowTrFreq * nfft / fs) + 1, numpy.floor(cenTrFreq * nfft / fs) + 1, dtype=numpy.int) 216 | lslope = heights[i] / (cenTrFreq - lowTrFreq) 217 | rid = numpy.arange(numpy.floor(cenTrFreq * nfft / fs) + 1, numpy.floor(highTrFreq * nfft / fs) + 1, dtype=numpy.int) 218 | rslope = heights[i] / (highTrFreq - cenTrFreq) 219 | fbank[i][lid] = lslope * (nfreqs[lid] - lowTrFreq) 220 | fbank[i][rid] = rslope * (highTrFreq - nfreqs[rid]) 221 | 222 | return fbank, freqs 223 | 224 | 225 | def stMFCC(X, fbank, nceps): 226 | """ 227 | Computes the MFCCs of a frame, given the fft mag 228 | ARGUMENTS: 229 | X: fft magnitude abs(FFT) 230 | fbank: filter bank (see mfccInitFilterBanks) 231 | RETURN 232 | ceps: MFCCs (13 element vector) 233 | Note: MFCC calculation is, in general, taken from the scikits.talkbox library (MIT Licence), 234 | # with a small number of modifications to make it more compact and suitable for the pyAudioAnalysis Lib 235 | """ 236 | 237 | mspec = numpy.log10(numpy.dot(X, fbank.T)+eps) 238 | ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:nceps] 239 | return ceps 240 | 241 | 242 | def stChromaFeaturesInit(nfft, fs): 243 | """ 244 | This function initializes the chroma matrices used in the calculation of the chroma features 245 | """ 246 | freqs = numpy.array([((f + 1) * fs) / (2 * nfft) for f in range(nfft)]) 247 | Cp = 27.50 248 | 249 | nChroma = numpy.round(12.0 * numpy.log2(freqs / Cp)).astype(int) 250 | 251 | nFreqsPerChroma = numpy.zeros((nChroma.shape[0], )) 252 | 253 | uChroma = numpy.unique(nChroma) 254 | for u in uChroma: 255 | idx = numpy.nonzero(nChroma == u) 256 | nFreqsPerChroma[idx] = idx[0].shape 257 | return nChroma, nFreqsPerChroma 258 | 259 | 260 | def stChromaFeatures(X, fs, nChroma, nFreqsPerChroma): 261 | #TODO: 1 complexity 262 | #TODO: 2 bug with large windows 263 | 264 | chromaNames = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#'] 265 | spec = X**2 266 | C = numpy.zeros((nChroma.shape[0],)) 267 | C[nChroma] = spec 268 | C /= nFreqsPerChroma[nChroma] 269 | finalC = numpy.zeros((12, 1)) 270 | newD = int(numpy.ceil(C.shape[0] / 12.0) * 12) 271 | C2 = numpy.zeros((newD, )) 272 | C2[0:C.shape[0]] = C 273 | C2 = C2.reshape(C2.shape[0]/12, 12) 274 | #for i in range(12): 275 | # finalC[i] = numpy.sum(C[i:C.shape[0]:12]) 276 | finalC = numpy.matrix(numpy.sum(C2, axis=0)).T 277 | finalC /= spec.sum() 278 | 279 | # ax = plt.gca() 280 | # plt.hold(False) 281 | # plt.plot(finalC) 282 | # ax.set_xticks(range(len(chromaNames))) 283 | # ax.set_xticklabels(chromaNames) 284 | # xaxis = numpy.arange(0, 0.02, 0.01); 285 | # ax.set_yticks(range(len(xaxis))) 286 | # ax.set_yticklabels(xaxis) 287 | # plt.show(block=False) 288 | # plt.draw() 289 | 290 | return chromaNames, finalC 291 | 292 | 293 | def stChromagram(signal, Fs, Win, Step, PLOT=False): 294 | """ 295 | Short-term FFT mag for spectogram estimation: 296 | Returns: 297 | a numpy array (nFFT x numOfShortTermWindows) 298 | ARGUMENTS: 299 | signal: the input signal samples 300 | Fs: the sampling freq (in Hz) 301 | Win: the short-term window size (in samples) 302 | Step: the short-term window step (in samples) 303 | PLOT: flag, 1 if results are to be ploted 304 | RETURNS: 305 | """ 306 | Win = int(Win) 307 | Step = int(Step) 308 | signal = numpy.double(signal) 309 | signal = signal / (2.0 ** 15) 310 | DC = signal.mean() 311 | MAX = 
(numpy.abs(signal)).max() 312 | signal = (signal - DC) / (MAX - DC) 313 | 314 | N = len(signal) # total number of signals 315 | curPos = 0 316 | countFrames = 0 317 | nfft = int(Win / 2) 318 | nChroma, nFreqsPerChroma = stChromaFeaturesInit(nfft, Fs) 319 | chromaGram = numpy.array([], dtype=numpy.float64) 320 | 321 | while (curPos + Win - 1 < N): 322 | countFrames += 1 323 | x = signal[curPos:curPos + Win] 324 | curPos = curPos + Step 325 | X = abs(fft(x)) 326 | X = X[0:nfft] 327 | X = X / len(X) 328 | chromaNames, C = stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma) 329 | C = C[:, 0] 330 | if countFrames == 1: 331 | chromaGram = C.T 332 | else: 333 | chromaGram = numpy.vstack((chromaGram, C.T)) 334 | FreqAxis = chromaNames 335 | TimeAxis = [(t * Step) / Fs for t in range(chromaGram.shape[0])] 336 | 337 | if (PLOT): 338 | fig, ax = plt.subplots() 339 | chromaGramToPlot = chromaGram.transpose()[::-1, :] 340 | Ratio = chromaGramToPlot.shape[1] / (3*chromaGramToPlot.shape[0]) 341 | chromaGramToPlot = numpy.repeat(chromaGramToPlot, Ratio, axis=0) 342 | imgplot = plt.imshow(chromaGramToPlot) 343 | Fstep = int(nfft / 5.0) 344 | # FreqTicks = range(0, int(nfft) + Fstep, Fstep) 345 | # FreqTicksLabels = [str(Fs/2-int((f*Fs) / (2*nfft))) for f in FreqTicks] 346 | ax.set_yticks(range(Ratio / 2, len(FreqAxis) * Ratio, Ratio)) 347 | ax.set_yticklabels(FreqAxis[::-1]) 348 | TStep = countFrames / 3 349 | TimeTicks = range(0, countFrames, TStep) 350 | TimeTicksLabels = ['%.2f' % (float(t * Step) / Fs) for t in TimeTicks] 351 | ax.set_xticks(TimeTicks) 352 | ax.set_xticklabels(TimeTicksLabels) 353 | ax.set_xlabel('time (secs)') 354 | imgplot.set_cmap('jet') 355 | plt.colorbar() 356 | plt.show() 357 | 358 | return (chromaGram, TimeAxis, FreqAxis) 359 | 360 | 361 | def phormants(x, Fs): 362 | N = len(x) 363 | w = numpy.hamming(N) 364 | 365 | # Apply window and high pass filter. 366 | x1 = x * w 367 | x1 = lfilter([1], [1., 0.63], x1) 368 | 369 | # Get LPC. 370 | ncoeff = 2 + Fs / 1000 371 | A, e, k = lpc(x1, ncoeff) 372 | #A, e, k = lpc(x1, 8) 373 | 374 | # Get roots. 375 | rts = numpy.roots(A) 376 | rts = [r for r in rts if numpy.imag(r) >= 0] 377 | 378 | # Get angles. 379 | angz = numpy.arctan2(numpy.imag(rts), numpy.real(rts)) 380 | 381 | # Get frequencies. 382 | frqs = sorted(angz * (Fs / (2 * math.pi))) 383 | 384 | return frqs 385 | def beatExtraction(stFeatures, winSize, PLOT=False): 386 | """ 387 | This function extracts an estimate of the beat rate for a musical signal. 
388 | ARGUMENTS: 389 | - stFeatures: a numpy array (numOfFeatures x numOfShortTermWindows) 390 | - winSize: window size in seconds 391 | RETURNS: 392 | - BPM: estimates of beats per minute 393 | - Ratio: a confidence measure 394 | """ 395 | 396 | # Features that are related to the beat tracking task: 397 | toWatch = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] 398 | 399 | maxBeatTime = int(round(2.0 / winSize)) 400 | HistAll = numpy.zeros((maxBeatTime,)) 401 | for ii, i in enumerate(toWatch): # for each feature 402 | DifThres = 2.0 * (numpy.abs(stFeatures[i, 0:-1] - stFeatures[i, 1::])).mean() # dif threshold (3 x Mean of Difs) 403 | [pos1, _] = utilities.peakdet(stFeatures[i, :], DifThres) # detect local maxima 404 | posDifs = [] # compute histograms of local maxima changes 405 | for j in range(len(pos1)-1): 406 | posDifs.append(pos1[j+1]-pos1[j]) 407 | [HistTimes, HistEdges] = numpy.histogram(posDifs, numpy.arange(0.5, maxBeatTime + 1.5)) 408 | HistCenters = (HistEdges[0:-1] + HistEdges[1::]) / 2.0 409 | HistTimes = HistTimes.astype(float) / stFeatures.shape[1] 410 | HistAll += HistTimes 411 | if PLOT: 412 | plt.subplot(9, 2, ii + 1) 413 | plt.plot(stFeatures[i, :], 'k') 414 | for k in pos1: 415 | plt.plot(k, stFeatures[i, k], 'k*') 416 | f1 = plt.gca() 417 | f1.axes.get_xaxis().set_ticks([]) 418 | f1.axes.get_yaxis().set_ticks([]) 419 | 420 | if PLOT: 421 | plt.show(block=False) 422 | plt.figure() 423 | 424 | # Get beat as the argmax of the agregated histogram: 425 | I = numpy.argmax(HistAll) 426 | BPMs = 60 / (HistCenters * winSize) 427 | BPM = BPMs[I] 428 | # ... and the beat ratio: 429 | Ratio = HistAll[I] / HistAll.sum() 430 | 431 | if PLOT: 432 | # filter out >500 beats from plotting: 433 | HistAll = HistAll[BPMs < 500] 434 | BPMs = BPMs[BPMs < 500] 435 | 436 | plt.plot(BPMs, HistAll, 'k') 437 | plt.xlabel('Beats per minute') 438 | plt.ylabel('Freq Count') 439 | plt.show(block=True) 440 | 441 | return BPM, Ratio 442 | 443 | 444 | def stSpectogram(signal, Fs, Win, Step, PLOT=False): 445 | """ 446 | Short-term FFT mag for spectogram estimation: 447 | Returns: 448 | a numpy array (nFFT x numOfShortTermWindows) 449 | ARGUMENTS: 450 | signal: the input signal samples 451 | Fs: the sampling freq (in Hz) 452 | Win: the short-term window size (in samples) 453 | Step: the short-term window step (in samples) 454 | PLOT: flag, 1 if results are to be ploted 455 | RETURNS: 456 | """ 457 | Win = int(Win) 458 | Step = int(Step) 459 | signal = numpy.double(signal) 460 | signal = signal / (2.0 ** 15) 461 | DC = signal.mean() 462 | MAX = (numpy.abs(signal)).max() 463 | signal = (signal - DC) / (MAX - DC) 464 | 465 | N = len(signal) # total number of signals 466 | curPos = 0 467 | countFrames = 0 468 | nfft = int(Win / 2) 469 | specgram = numpy.array([], dtype=numpy.float64) 470 | 471 | while (curPos + Win - 1 < N): 472 | countFrames += 1 473 | x = signal[curPos:curPos+Win] 474 | curPos = curPos + Step 475 | X = abs(fft(x)) 476 | X = X[0:nfft] 477 | X = X / len(X) 478 | 479 | if countFrames == 1: 480 | specgram = X ** 2 481 | else: 482 | specgram = numpy.vstack((specgram, X)) 483 | 484 | FreqAxis = [((f + 1) * Fs) / (2 * nfft) for f in range(specgram.shape[1])] 485 | TimeAxis = [(t * Step) / Fs for t in range(specgram.shape[0])] 486 | 487 | if (PLOT): 488 | fig, ax = plt.subplots() 489 | imgplot = plt.imshow(specgram.transpose()[::-1, :]) 490 | Fstep = int(nfft / 5.0) 491 | FreqTicks = range(0, int(nfft) + Fstep, Fstep) 492 | FreqTicksLabels = [str(Fs / 2 - int((f * Fs) / (2 * 
nfft))) for f in FreqTicks] 493 | ax.set_yticks(FreqTicks) 494 | ax.set_yticklabels(FreqTicksLabels) 495 | TStep = countFrames/3 496 | TimeTicks = range(0, countFrames, TStep) 497 | TimeTicksLabels = ['%.2f' % (float(t * Step) / Fs) for t in TimeTicks] 498 | ax.set_xticks(TimeTicks) 499 | ax.set_xticklabels(TimeTicksLabels) 500 | ax.set_xlabel('time (secs)') 501 | ax.set_ylabel('freq (Hz)') 502 | imgplot.set_cmap('jet') 503 | plt.colorbar() 504 | plt.show() 505 | 506 | return (specgram, TimeAxis, FreqAxis) 507 | 508 | 509 | """ Windowing and feature extraction """ 510 | 511 | 512 | def stFeatureExtraction(signal, Fs, Win, Step): 513 | """ 514 | This function implements the shor-term windowing process. For each short-term window a set of features is extracted. 515 | This results to a sequence of feature vectors, stored in a numpy matrix. 516 | ARGUMENTS 517 | signal: the input signal samples 518 | Fs: the sampling freq (in Hz) 519 | Win: the short-term window size (in samples) 520 | Step: the short-term window step (in samples) 521 | RETURNS 522 | stFeatures: a numpy array (numOfFeatures x numOfShortTermWindows) 523 | """ 524 | 525 | Win = int(Win) 526 | Step = int(Step) 527 | 528 | # Signal normalization 529 | signal = numpy.double(signal) 530 | 531 | signal = signal / (2.0 ** 15) 532 | DC = signal.mean() 533 | MAX = (numpy.abs(signal)).max() 534 | signal = (signal - DC) / MAX 535 | 536 | N = len(signal) # total number of samples 537 | curPos = 0 538 | countFrames = 0 539 | nFFT = Win / 2 540 | 541 | [fbank, freqs] = mfccInitFilterBanks(Fs, nFFT) # compute the triangular filter banks used in the mfcc calculation 542 | nChroma, nFreqsPerChroma = stChromaFeaturesInit(nFFT, Fs) 543 | 544 | 545 | numOfPitches = 5 546 | numOfPeaks = 10 547 | numOfTimeSpectralFeatures = 8 548 | numOfHarmonicFeatures = 0 549 | nceps = 13 550 | numOfChromaFeatures = 13 551 | if use_pitch: 552 | totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures + numOfChromaFeatures + numOfPeaks + numOfPitches 553 | else: 554 | totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures + numOfChromaFeatures 555 | # totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures 556 | stFeatures = numpy.array([], dtype=numpy.float64) 557 | 558 | while (curPos + Win - 1 < N): # for each short-term window until the end of signal 559 | countFrames += 1 560 | x = signal[curPos:curPos+Win] # get current window 561 | if use_pitch: 562 | p = pitch.ppitch(x, sr=Fs, num_pitches=numOfPitches, num_peaks=numOfPeaks, win_length=Win, hop_length=Win*2) 563 | pitches = p[1][0:1, :].T * 1.e-3 564 | peaks = p[3][0:1, :].T * 1.e-3 565 | curPos = curPos + Step # update window position 566 | X = abs(fft(x)) # get fft magnitude 567 | X = X[0:nFFT] # normalize fft 568 | X = X / len(X) 569 | if countFrames == 1: 570 | Xprev = X.copy() # keep previous fft mag (used in spectral flux) 571 | curFV = numpy.zeros((totalNumOfFeatures, 1)) 572 | curFV[0] = stZCR(x) # zero crossing rate 573 | curFV[1] = stEnergy(x) # short-term energy 574 | curFV[2] = stEnergyEntropy(x) # short-term entropy of energy 575 | [curFV[3], curFV[4]] = stSpectralCentroidAndSpread(X, Fs) # spectral centroid and spread 576 | curFV[5] = stSpectralEntropy(X) # spectral entropy 577 | curFV[6] = stSpectralFlux(X, Xprev) # spectral flux 578 | curFV[7] = stSpectralRollOff(X, 0.90, Fs) # spectral rolloff 579 | curFV[numOfTimeSpectralFeatures:numOfTimeSpectralFeatures+nceps, 0] = stMFCC(X, fbank, nceps).copy() # MFCCs 580 | 581 | 
chromaNames, chromaF = stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma) 582 | curFV[numOfTimeSpectralFeatures + nceps: numOfTimeSpectralFeatures + nceps + numOfChromaFeatures - 1] = chromaF 583 | numOfCFFeatures = numOfTimeSpectralFeatures + nceps + numOfChromaFeatures 584 | 585 | curFV[numOfCFFeatures-1] = chromaF.std() 586 | if use_pitch: 587 | curFV[numOfCFFeatures:numOfCFFeatures + numOfPeaks] = peaks 588 | curFV[numOfCFFeatures + numOfPeaks:numOfCFFeatures + numOfPeaks + numOfPitches] = pitches 589 | # curFV[numOfTimeSpectralFeatures+nceps+numOfChromaFeatures-1] = numpy.nonzero( chromaF > 2.0 * chromaF.mean() )[0].shape[0] 590 | # temp = numpy.sort(chromaF[:,0]) 591 | # curFV[numOfTimeSpectralFeatures+numOfChromaFeatures] = temp[-1] / numpy.mean(temp[0:5]) 592 | # temp = numpy.sort(chromaF[:,0]) 593 | # if countFrames==10 or countFrames==30: 594 | # A = int(temp[-1] / numpy.mean(temp[0:5]))/10 595 | # for a in range(A): 596 | # print("|"), 597 | # print 598 | # if countFrames==20: 599 | # print numpy.nonzero(chromaF > 5*chromaF.mean())[0].shape[0] 600 | #HR, curFV[numOfTimeSpectralFeatures+nceps] = stHarmonic(x, Fs) 601 | # curFV[numOfTimeSpectralFeatures+nceps+1] = freq_from_autocorr(x, Fs) 602 | if countFrames == 1: 603 | stFeatures = curFV # initialize feature matrix (if first frame) 604 | else: 605 | stFeatures = numpy.concatenate((stFeatures, curFV), 1) # update feature matrix 606 | Xprev = X.copy() 607 | 608 | return numpy.array(stFeatures) 609 | 610 | 611 | def mtFeatureExtraction(signal, Fs, mtWin, mtStep, stWin, stStep): 612 | """ 613 | Mid-term feature extraction 614 | """ 615 | 616 | mtWinRatio = int(round(mtWin / stStep)) 617 | mtStepRatio = int(round(mtStep / stStep)) 618 | 619 | mtFeatures = [] 620 | 621 | stFeatures = stFeatureExtraction(signal, Fs, stWin, stStep) 622 | numOfFeatures = len(stFeatures) 623 | numOfStatistics = 2 624 | 625 | mtFeatures = [] 626 | #for i in range(numOfStatistics * numOfFeatures + 1): 627 | for i in range(numOfStatistics * numOfFeatures): 628 | mtFeatures.append([]) 629 | 630 | for i in range(numOfFeatures): # for each of the short-term features: 631 | curPos = 0 632 | N = len(stFeatures[i]) 633 | while (curPos < N): 634 | N1 = curPos 635 | N2 = curPos + mtWinRatio 636 | if N2 > N: 637 | N2 = N 638 | curStFeatures = stFeatures[i][N1:N2] 639 | 640 | mtFeatures[i].append(numpy.mean(curStFeatures)) 641 | mtFeatures[i+numOfFeatures].append(numpy.std(curStFeatures)) 642 | #mtFeatures[i+2*numOfFeatures].append(numpy.std(curStFeatures) / (numpy.mean(curStFeatures)+0.00000010)) 643 | curPos += mtStepRatio 644 | 645 | return numpy.array(mtFeatures), stFeatures 646 | 647 | 648 | # TODO 649 | def stFeatureSpeed(signal, Fs, Win, Step): 650 | 651 | signal = numpy.double(signal) 652 | signal = signal / (2.0 ** 15) 653 | DC = signal.mean() 654 | MAX = (numpy.abs(signal)).max() 655 | signal = (signal - DC) / MAX 656 | # print (numpy.abs(signal)).max() 657 | 658 | N = len(signal) # total number of signals 659 | curPos = 0 660 | countFrames = 0 661 | 662 | lowfreq = 133.33 663 | linsc = 200/3. 
664 |     logsc = 1.0711703
665 |     nlinfil = 13
666 |     nlogfil = 27
667 |     nceps = 13
668 |     nfil = nlinfil + nlogfil
669 |     nfft = Win / 2
670 |     if Fs < 8000:
671 |         nlogfil = 5
672 |         nfil = nlinfil + nlogfil
673 |         nfft = Win / 2
674 | 
675 |     # compute filter banks for mfcc (the mfccInitFilterBanks defined above only takes fs and nfft):
676 |     [fbank, freqs] = mfccInitFilterBanks(Fs, nfft)
677 | 
678 |     numOfTimeSpectralFeatures = 8
679 |     numOfHarmonicFeatures = 1
680 |     totalNumOfFeatures = numOfTimeSpectralFeatures + nceps + numOfHarmonicFeatures
681 |     #stFeatures = numpy.array([], dtype=numpy.float64)
682 |     stFeatures = []
683 | 
684 |     while (curPos + Win - 1 < N):
685 |         countFrames += 1
686 |         x = signal[curPos:curPos + Win]
687 |         curPos = curPos + Step
688 |         X = abs(fft(x))
689 |         X = X[0:nfft]
690 |         X = X / len(X)
691 |         Ex = 0.0
692 |         El = 0.0
693 |         X[0:4] = 0
694 |         # M = numpy.round(0.016 * fs) - 1
695 |         # R = numpy.correlate(frame, frame, mode='full')
696 |         stFeatures.append(stHarmonic(x, Fs))
697 |         # for i in range(len(X)):
698 |             #if (i < (len(X) / 8)) and (i > (len(X)/40)):
699 |             #    Ex += X[i]*X[i]
700 |             #El += X[i]*X[i]
701 |         # stFeatures.append(Ex / El)
702 |         # stFeatures.append(numpy.argmax(X))
703 |         # if curFV[numOfTimeSpectralFeatures+nceps+1]>0:
704 |         #     print curFV[numOfTimeSpectralFeatures+nceps], curFV[numOfTimeSpectralFeatures+nceps+1]
705 |     return numpy.array(stFeatures)
706 | 
707 | 
708 | """ Feature Extraction Wrappers
709 |  - The first two feature extraction wrappers are used to extract long-term averaged
710 |    audio features for a list of WAV files stored in a given category.
711 |    It is important to note that one single feature vector is extracted per WAV file (not the whole sequence of feature vectors).
712 | """
713 | 
714 | 
715 | def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
716 |     """
717 |     This function extracts the mid-term features of the WAVE files of a particular folder.
718 |     The resulting feature vector is extracted by long-term averaging the mid-term features.
719 |     Therefore ONE FEATURE VECTOR is extracted for each WAV file.
720 | ARGUMENTS: 721 | - dirName: the path of the WAVE directory 722 | - mtWin, mtStep: mid-term window and step (in seconds) 723 | - stWin, stStep: short-term window and step (in seconds) 724 | """ 725 | 726 | allMtFeatures = numpy.array([]) 727 | processingTimes = [] 728 | 729 | types = ('*.wav', '*.aif', '*.aiff') 730 | wavFilesList = [] 731 | for files in types: 732 | wavFilesList.extend(glob.glob(os.path.join(dirName, files))) 733 | 734 | wavFilesList = sorted(wavFilesList) 735 | 736 | for wavFile in wavFilesList: 737 | [Fs, x] = audioBasicIO.readAudioFile(wavFile) # read file 738 | t1 = time.clock() 739 | x = audioBasicIO.stereo2mono(x) # convert stereo to mono 740 | if computeBEAT: # mid-term feature extraction for current file 741 | [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) 742 | [beat, beatConf] = beatExtraction(stFeatures, stStep) 743 | else: 744 | [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) 745 | 746 | MidTermFeatures = numpy.transpose(MidTermFeatures) 747 | MidTermFeatures = MidTermFeatures.mean(axis=0) # long term averaging of mid-term statistics 748 | if computeBEAT: 749 | MidTermFeatures = numpy.append(MidTermFeatures, beat) 750 | MidTermFeatures = numpy.append(MidTermFeatures, beatConf) 751 | if len(allMtFeatures) == 0: # append feature vector 752 | allMtFeatures = MidTermFeatures 753 | else: 754 | allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures)) 755 | t2 = time.clock() 756 | duration = float(len(x)) / Fs 757 | processingTimes.append((t2 - t1) / duration) 758 | if len(processingTimes) > 0: 759 | print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes)))) 760 | return (allMtFeatures, wavFilesList) 761 | 762 | 763 | def dirsWavFeatureExtraction(dirNames, mtWin, mtStep, stWin, stStep, computeBEAT=False): 764 | ''' 765 | Same as dirWavFeatureExtraction, but instead of a single dir it takes a list of paths as input and returns a list of feature matrices. 766 | EXAMPLE: 767 | [features, classNames] = 768 | a.dirsWavFeatureExtraction(['audioData/classSegmentsRec/noise','audioData/classSegmentsRec/speech', 769 | 'audioData/classSegmentsRec/brush-teeth','audioData/classSegmentsRec/shower'], 1, 1, 0.02, 0.02); 770 | It can be used during the training process of a classification model , 771 | in order to get feature matrices from various audio classes (each stored in a seperate path) 772 | ''' 773 | 774 | # feature extraction for each class: 775 | features = [] 776 | classNames = [] 777 | fileNames = [] 778 | for i, d in enumerate(dirNames): 779 | [f, fn] = dirWavFeatureExtraction(d, mtWin, mtStep, stWin, stStep, computeBEAT=computeBEAT) 780 | if f.shape[0] > 0: # if at least one audio file has been found in the provided folder: 781 | features.append(f) 782 | fileNames.append(fn) 783 | if d[-1] == "/": 784 | classNames.append(d.split(os.sep)[-2]) 785 | else: 786 | classNames.append(d.split(os.sep)[-1]) 787 | return features, classNames, fileNames 788 | 789 | 790 | def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep): 791 | """ 792 | This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file. 
793 |     ARGUMENTS:
794 |         - dirName:        the path of the WAVE directory
795 |         - mtWin, mtStep:  mid-term window and step (in seconds)
796 |         - stWin, stStep:  short-term window and step (in seconds)
797 |     RETURNS:
798 |         - X:              a feature matrix (one row per mid-term window)
799 |         - Y:              a vector of file indices mapping each mid-term window back to its WAV file
800 |         - filenames:      the list of WAV file paths that were processed
801 |     """
802 | 
803 |     allMtFeatures = numpy.array([])
804 |     signalIndices = numpy.array([])
805 |     processingTimes = []
806 | 
807 |     types = ('*.wav', '*.aif', '*.aiff')
808 |     wavFilesList = []
809 |     for files in types:
810 |         wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
811 | 
812 |     wavFilesList = sorted(wavFilesList)
813 | 
814 |     for i, wavFile in enumerate(wavFilesList):
815 |         [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
816 |         x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
817 |         [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))  # mid-term feature
818 | 
819 |         MidTermFeatures = numpy.transpose(MidTermFeatures)
820 |         # MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
821 |         if len(allMtFeatures) == 0:  # append feature vector
822 |             allMtFeatures = MidTermFeatures
823 |             signalIndices = numpy.zeros((MidTermFeatures.shape[0], ))
824 |         else:
825 |             allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
826 |             signalIndices = numpy.append(signalIndices, i * numpy.ones((MidTermFeatures.shape[0], )))
827 | 
828 |     return (allMtFeatures, signalIndices, wavFilesList)
829 | 
830 | 
831 | # The following two feature extraction wrappers extract features for given audio files; however,
832 | # NO LONG-TERM AVERAGING is performed. Therefore, the output for each audio file is NOT A SINGLE FEATURE VECTOR
833 | # but a whole feature matrix.
834 | #
835 | # Also, another difference between the following two wrappers and the previous ones is that NO LONG-TERM AVERAGING IS PERFORMED.
836 | # In other words, the WAV files in these functions are not used as uniform samples that need to be averaged but as sequences 837 | 838 | def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile, 839 | storeStFeatures=False, storeToCSV=False, PLOT=False): 840 | """ 841 | This function is used as a wrapper to: 842 | a) read the content of a WAV file 843 | b) perform mid-term feature extraction on that signal 844 | c) write the mid-term feature sequences to a numpy file 845 | """ 846 | [Fs, x] = audioBasicIO.readAudioFile(fileName) # read the wav file 847 | x = audioBasicIO.stereo2mono(x) # convert to MONO if required 848 | if storeStFeatures: 849 | [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep)) 850 | else: 851 | [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep)) 852 | 853 | numpy.save(outPutFile, mtF) # save mt features to numpy file 854 | if PLOT: 855 | print "Mid-term numpy file: " + outPutFile + ".npy saved" 856 | if storeToCSV: 857 | numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",") 858 | if PLOT: 859 | print "Mid-term CSV file: " + outPutFile + ".csv saved" 860 | 861 | if storeStFeatures: 862 | numpy.save(outPutFile+"_st", stF) # save st features to numpy file 863 | if PLOT: 864 | print "Short-term numpy file: " + outPutFile + "_st.npy saved" 865 | if storeToCSV: 866 | numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",") # store st features to CSV file 867 | if PLOT: 868 | print "Short-term CSV file: " + outPutFile + "_st.csv saved" 869 | 870 | 871 | def mtFeatureExtractionToFileDir(dirName, midTermSize, midTermStep, shortTermSize, shortTermStep, storeStFeatures=False, storeToCSV=False, PLOT=False): 872 | types = (dirName + os.sep + '*.wav', ) 873 | filesToProcess = [] 874 | for files in types: 875 | filesToProcess.extend(glob.glob(files)) 876 | for f in filesToProcess: 877 | outPath = f 878 | mtFeatureExtractionToFile(f, midTermSize, midTermStep, shortTermSize, shortTermStep, outPath, storeStFeatures, storeToCSV, PLOT) --------------------------------------------------------------------------------
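For reference, a minimal usage sketch (not part of the repository) of driving cf.stFeatureExtraction directly on a mono 16-bit wav, mirroring the 0.2 s window with 50% step that calculate_features.py uses; the file name below is hypothetical and numpy/scipy are assumed to be installed. Note that the directory-level wrappers (dirWavFeatureExtraction, dirsWavFeatureExtraction, dirWavFeatureExtractionNoAveraging, mtFeatureExtractionToFile and mtFeatureExtractionToFileDir), as well as beatExtraction and phormants, depend on the audioBasicIO, utilities and scikits.talkbox imports that are commented out at the top of cf.py, so only the stFeatureExtraction / mtFeatureExtraction path used by calculate_features.py is expected to run as-is.

    import wave
    import numpy as np
    import cf

    w = wave.open('example.wav', 'r')          # hypothetical mono, 16-bit PCM file
    freq = w.getframerate()
    samples = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)
    w.close()

    window = int(0.2 * freq)                   # 0.2 s short-term window, as in calculate_features.py
    feats = cf.stFeatureExtraction(samples, freq, window, window / 2)
    print feats.shape                          # (num_features, num_short_term_windows)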