├── README.md
├── area_train.py
├── features.py
├── model.py
├── models
│   └── readme.md
├── process_dataset.py
└── vtlpAug.py

/README.md:
--------------------------------------------------------------------------------
# area_attention_for_SER
Speech emotion recognition with area attention and VTLP

1. Use process_dataset.py to split the IEMOCAP dataset into train and test sets and generate the CSV file lists.
2. Use vtlpAug.py to generate VTLP-augmented copies of the audio.
3. Use area_train.py to train and save the models.
--------------------------------------------------------------------------------
/area_train.py:
--------------------------------------------------------------------------------
import glob
import os
import pickle
import random
import time
import math
import logging
import datetime
import tensorflow as tf

import numpy as np
import librosa
from tqdm import tqdm


def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    # tf.random.set_seed(seed)

tf.enable_eager_execution()
# SEED = [111111, 123456, 0]
SEED = [999999, 987654]
# SEED = [111111, 123456, 0, 999999, 987654]
attention_head = 4
attention_hidden = 32
area_height = 1
area_width = 1

import features
import model as MODEL

Epochs = 100
BATCH_SIZE = 32
learning_rate = 0.0001
T_stride = 2
T_overlap = T_stride / 2
overlapTime = {
    'neutral': 1,
    'happy': 1,
    'sad': 1,
    'angry': 1,
}
FEATURES_TO_USE = 'melspectrogram'  # {'mfcc', 'logfbank', 'fbank', 'spectrogram', 'melspectrogram'}
featuresExist = True
impro_or_script = 'impro'
featuresFileName = 'features_{}_{}.pkl'.format(FEATURES_TO_USE, impro_or_script)
toSaveFeatures = True
WAV_PATH = "E:/Test/IEMOCAP/"
RATE = 16000

LABEL = {
    'neutral': 0,
    'happy': 1,
    'sad': 2,
    'angry': 3,
}

LABEL_DICT1 = {
    '01': 'neutral',
    # '02': 'frustration',
    # '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    # '06': 'fearful',
    '07': 'happy',  # excitement -> happy
    # '08': 'surprised'
}


def process_data(path, t=2, train_overlap=1, val_overlap=1.6, RATE=16000):
    path = path.rstrip('/')
    wav_files = glob.glob(path + '/*.wav')

    n = len(wav_files)
    train_files = []
    valid_files = []
    train_indices = list(np.random.choice(range(n), int(n * 0.8), replace=False))
    valid_indices = list(set(range(n)) - set(train_indices))
    for i in train_indices:
        train_files.append(wav_files[i])
    for i in valid_indices:
        valid_files.append(wav_files[i])

    print("constructing meta dictionary for {}...".format(path))
    train_X, train_y = train_data_process(train_files, LABEL_DICT1, RATE, t, train_overlap)

    val_dict = valid_data_process(valid_files, LABEL_DICT1, RATE, t, val_overlap)

    return train_X, train_y, val_dict


def valid_data_process(valid_files, LABEL_DICT1, RATE, t, val_overlap):
    val_dict = {}
    if (val_overlap >= t):
        val_overlap = t / 2
    for i, wav_file in enumerate(tqdm(valid_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        label = LABEL_DICT1[label]
        wav_data, _ = librosa.load(wav_file, sr=RATE)
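        # Slice the utterance into t-second windows whose starts are spaced
        # (t - val_overlap) seconds apart; clips shorter than t seconds are skipped.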
        X1 = []
        y1 = []
        index = 0
        if (t * RATE >= len(wav_data)):
            continue
        while (index + t * RATE < len(wav_data)):
            X1.append(wav_data[int(index):int(index + t * RATE)])
            y1.append(LABEL[label])
            index += int((t - val_overlap) * RATE)

        X1 = np.array(X1)
        val_dict[i] = {
            'X': X1,
            'y': y1,
            'path': wav_file
        }
    return val_dict


def train_data_process(train_files, LABEL_DICT1, RATE, t, train_overlap):
    meta_dict = {}
    for i, wav_file in enumerate(tqdm(train_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        label = LABEL_DICT1[label]

        wav_data, _ = librosa.load(wav_file, sr=RATE)
        X1 = []
        y1 = []
        index = 0
        if (t * RATE >= len(wav_data)):
            continue

        while (index + t * RATE < len(wav_data)):
            X1.append(wav_data[int(index):int(index + t * RATE)])
            y1.append(LABEL[label])
            assert t - train_overlap > 0
            index += int((t - train_overlap) * RATE / overlapTime[label])

        X1 = np.array(X1)
        meta_dict[i] = {
            'X': X1,
            'y': y1,
            'path': wav_file
        }
    print("building X, y...")
    train_X = []
    train_y = []
    for k in meta_dict:
        train_X.append(meta_dict[k]['X'])
        train_y += meta_dict[k]['y']
    train_X = np.row_stack(train_X)
    train_y = np.array(train_y)
    assert len(train_X) == len(train_y), "X length and y length must match! X shape: {}, y shape: {}".format(
        train_X.shape, train_y.shape)
    return train_X, train_y


def train(SEED, area_width, area_height, AUG=True):
    setup_seed(SEED)
    MODEL_NAME = 'AUG_area{}x{}_seed{}'.format(area_width, area_height, SEED)
    data_dir = '/program/xumingke/IEMOCAP/'
    train_files = []
    train_files2 = []
    valid_files = []
    with open(data_dir + '/IEMOCAP_train_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            train_files.append(data_dir + '/' + line.split('\t')[2])
            if (AUG):
                for i in range(0):
                    train_files.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 1))
                for i in range(1):
                    train_files2.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 5))
    with open(data_dir + '/IEMOCAP_dev_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            train_files.append(data_dir + '/' + line.split('\t')[2])
            if (AUG):
                for i in range(0):
                    train_files.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 1))
                for i in range(1):
                    train_files2.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 5))
    with open(data_dir + '/IEMOCAP_test_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            valid_files.append(data_dir + '/' + line.split('\t')[2])

    train_X, train_y = train_data_process(train_files, LABEL_DICT1, RATE, T_stride, T_overlap)
    train_X2, train_y2 = train_data_process(train_files2, LABEL_DICT1, RATE, T_stride, T_overlap)
    train_y = tf.concat([train_y, train_y2], 0)
    val_dict = valid_data_process(valid_files, LABEL_DICT1, RATE, T_stride, 1.6)
    feature_extractor = features.FeatureExtractor(rate=RATE)

    train_X_features = feature_extractor.get_features(FEATURES_TO_USE, train_X)
    train_X_features2 = feature_extractor.get_features(FEATURES_TO_USE, train_X2)
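    # Shape note (assuming librosa defaults, n_mels=128): each 2-second segment at
    # 16 kHz is 32000 samples, so 'melspectrogram' with n_fft=800 and hop_length=400
    # yields roughly a (128, 81) matrix per segment before the channel axis is added below.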

    valid_features_dict = {}
    for _, i in enumerate(val_dict):
        X1 = feature_extractor.get_features(FEATURES_TO_USE, val_dict[i]['X'])
        valid_features_dict[i] = {
            'X': X1,
            'y': val_dict[i]['y']
        }

    train_X_features = tf.expand_dims(train_X_features, -1)
    train_X_features2 = tf.expand_dims(train_X_features2, -1)
    train_X_features = tf.concat([train_X_features, train_X_features2], 0)
    train_X_features = tf.cast(train_X_features, tf.float32)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (train_X_features, train_y)).shuffle(train_X_features.shape[0]).batch(BATCH_SIZE)

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, decay=1e-6)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    model = MODEL.AACNN(area_height, area_width)

    def train_step(images, labels):
        with tf.GradientTape() as tape:
            predictions = model(images)
            loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)
        train_accuracy(labels, predictions)

    def test_step(images, labels):
        predictions = model(images)
        t_loss = loss_object(labels, predictions)

        test_loss(t_loss)
        test_accuracy(labels, predictions)

    print('training...')
    logging.warning('training seed={}'.format(SEED))
    maxWA = 0
    maxUA = 0
    maxACC = 0
    for epoch in range(Epochs):
        # Reset the metrics at the start of each epoch.
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()
        # tq = tqdm(total=len(train_y))
        for step, (images, labels) in enumerate(train_ds):
            train_step(images, labels)
            # tq.update(BATCH_SIZE)
        # tq.close()
        template = 'Epoch {}, Loss: {}, Accuracy: {}\n'
        print(template.format(epoch + 1,
                              train_loss.result(),
                              train_accuracy.result() * 100,
                              ))
        logging.warning(template.format(epoch + 1,
                                        train_loss.result(),
                                        train_accuracy.result() * 100,
                                        ))

        correct = 0
        label_correct = [0, 0, 0, 0]
        label_total = [0, 0, 0, 0]

        for _, i in enumerate(valid_features_dict):
            x, y = valid_features_dict[i]['X'], valid_features_dict[i]['y']
            x = tf.expand_dims(x, -1)
            x = tf.cast(x, tf.float32)
            y = np.array([y[0]])
            out = model(x)
            out = tf.reduce_mean(out, 0, keepdims=True)

            label_total[y[0]] += 1
            if (test_accuracy(y, out) > 0):
                correct += 1
                label_correct[y[0]] += 1
            test_accuracy.reset_states()

        label_acc = [label_correct[0] / label_total[0],
                     label_correct[1] / label_total[1],
                     label_correct[2] / label_total[2],
                     label_correct[3] / label_total[3]]
        UA = (label_acc[0] + label_acc[1] + label_acc[2] + label_acc[3]) / 4
        if (correct / len(valid_features_dict) > maxWA):
            maxWA = correct / len(valid_features_dict)
        if (UA > maxUA):
            maxUA = UA
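        # WA (weighted accuracy) is utterance-level accuracy over all validation
        # files; UA (unweighted accuracy) is the mean of the four per-class
        # accuracies. A checkpoint is saved whenever WA + UA reaches a new maximum.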
        ACC = (correct / len(valid_features_dict)) + UA
        if (ACC > maxACC):
            print('saving model (WA:{},UA:{})\n'.format(correct / len(valid_features_dict), UA))
            logging.warning('saving model (WA:{},UA:{})\n'.format(correct / len(valid_features_dict), UA))
            model.save_weights('./models/{}'.format(MODEL_NAME))
            maxACC = ACC
        print('label_correct:{}\nUA:{}'.format(label_correct, label_acc))
        print('maxWA:{}\nmaxUA:{}'.format(maxWA, maxUA))
        logging.warning('label_correct:{}\nUA:{}'.format(label_correct, label_acc))
        logging.warning('maxWA:{}\nmaxUA:{}'.format(maxWA, maxUA))

    print('end training on seed:{}'.format(SEED))
    logging.warning('end training on seed:{}'.format(SEED))
    del model

    # model = MODEL.AACNN()
    # model.load_weights('./models/{}'.format(MODEL_NAME))
    #
    # result = []
    # correct = 0
    # for _, i in enumerate(valid_features_dict):
    #     x, y = valid_features_dict[i]['X'], valid_features_dict[i]['y']
    #     x = tf.expand_dims(x, -1)
    #     x = tf.cast(x, tf.float32)
    #     y = np.array([y[0]])
    #     out = model(x)
    #     out = tf.reduce_mean(out, 0, keepdims=True).numpy()
    #     if (np.argmax(out) == y):
    #         correct += 1
    #     result.append(out)
    # print(correct)
    # result = np.array(result)
    # np.save('./test/SERtest_{}.npy'.format(SEED), result)


if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)  # master switch for the log level
    # Create a handler that writes the log to a file.
    log_name = 'area{}x{}_AUG2.log'.format(area_width, area_height)
    logfile = log_name
    fh = logging.FileHandler(logfile, mode='w')
    fh.setLevel(logging.DEBUG)  # log level for the file handler
    # Define the handler's output format.
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    # Attach the handler to the logger.
    logger.addHandler(fh)

    for seed in SEED:
        train(seed, area_width, area_height, True)

--------------------------------------------------------------------------------
/features.py:
--------------------------------------------------------------------------------
import glob
import os
import pickle

import torch

from python_speech_features import logfbank, fbank, sigproc
import numpy as np
import librosa
from tqdm import tqdm


class FeatureExtractor(object):
    def __init__(self, rate):
        self.rate = rate

    def get_features(self, features_to_use, X):
        X_features = None
        accepted_features_to_use = ('logfbank', 'mfcc', 'fbank', 'melspectrogram', 'spectrogram', 'pase')
        if features_to_use not in accepted_features_to_use:
            raise NotImplementedError("{} not in {}!".format(features_to_use, accepted_features_to_use))
        if features_to_use == 'logfbank':
            X_features = self.get_logfbank(X)
        if features_to_use == 'mfcc':
            X_features = self.get_mfcc(X, 26)
        if features_to_use == 'fbank':
            X_features = self.get_fbank(X)
        if features_to_use == 'melspectrogram':
            X_features = self.get_melspectrogram(X)
        if features_to_use == 'spectrogram':
            X_features = self.get_spectrogram(X)
        if features_to_use == 'pase':
            X_features = self.get_Pase(X)
        return X_features
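
    # Usage sketch (illustrative names, not part of the original code):
    #   extractor = FeatureExtractor(rate=16000)
    #   feats = extractor.get_features('melspectrogram', segments)
    # where `segments` is a 2-D np.ndarray of shape (num_segments, num_samples),
    # as produced by train_data_process / valid_data_process in area_train.py.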

    def get_logfbank(self, X):
        def _get_logfbank(x):
            out = logfbank(signal=x, samplerate=self.rate, winlen=0.040, winstep=0.010, nfft=1024, highfreq=4000,
                           nfilt=40)
            return out

        X_features = np.apply_along_axis(_get_logfbank, 1, X)
        return X_features

    def get_mfcc(self, X, n_mfcc=13):
        def _get_mfcc(x):
            mfcc_data = librosa.feature.mfcc(x, sr=self.rate, n_mfcc=n_mfcc)
            # delta = librosa.feature.delta(mfcc_data)
            # delta_delta = librosa.feature.delta(mfcc_data, order=2)
            # mfcc_data = np.expand_dims(mfcc_data, 0)
            # delta = np.expand_dims(delta, 0)
            # delta_delta = np.expand_dims(delta_delta, 0)
            # out = np.concatenate((mfcc_data, delta, delta_delta), 0)
            return mfcc_data

        X_features = np.apply_along_axis(_get_mfcc, 1, X)
        return X_features

    def get_fbank(self, X):
        def _get_fbank(x):
            out, _ = fbank(signal=x, samplerate=self.rate, winlen=0.040, winstep=0.010, nfft=1024)
            return out

        X_features = np.apply_along_axis(_get_fbank, 1, X)
        return X_features

    def get_melspectrogram(self, X):
        def _get_melspectrogram(x):
            mel = librosa.feature.melspectrogram(y=x, sr=self.rate, n_fft=800, hop_length=400)[np.newaxis, :]
            out = np.log10(mel).squeeze()
            return out

        X_features = np.apply_along_axis(_get_melspectrogram, 1, X)
        return X_features

    def get_spectrogram(self, X):
        def _get_spectrogram(x):
            frames = sigproc.framesig(x, 640, 160)
            out = sigproc.logpowspec(frames, NFFT=3198)
            out = out.swapaxes(0, 1)
            return out[:400]

        X_features = np.apply_along_axis(_get_spectrogram, 1, X)
        return X_features

    # def get_Pase(self, X):
    #     pase = wf_builder('PASE/cfg/PASE.cfg')
    #     pase.eval()
    #     pase.load_pretrained('PASE/PASE.ckpt', load_last=True, verbose=True)
    #     # tq = tqdm(total=X.shape[0])
    #     def _get_spectrogram(x):
    #         x = torch.from_numpy(x)
    #         x = x.unsqueeze(0).unsqueeze(0)
    #         y = pase(x).detach().numpy()
    #         # tq.update(1)
    #         return y
    #
    #     X_features = np.apply_along_axis(_get_spectrogram, 1, X)
    #     return X_features

    def get_Pase(self, X):
        return X

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tensorflow.keras.layers as nn
from tensorflow.keras import Model
import numpy as np
import tensor2tensor.layers.area_attention as area_attention


class MACNN(Model):
    def __init__(self, attention_heads=4, attention_size=32, out_size=4):
        super(MACNN, self).__init__()
        self.conv1a = nn.Conv2D(16, (10, 2), padding='same', data_format='channels_last')
        self.conv1b = nn.Conv2D(16, (2, 8), padding='same', data_format='channels_last')
        self.conv2 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')
        self.conv3 = nn.Conv2D(48, (3, 3), padding='same', data_format='channels_last')
        self.conv4 = nn.Conv2D(64, (3, 3), padding='same', data_format='channels_last')
        self.conv5 = nn.Conv2D(80, (3, 3), padding='same', data_format='channels_last')
        self.maxp = nn.MaxPool2D((2, 2))
        self.bn1a = nn.BatchNormalization(3)
        self.bn1b = nn.BatchNormalization(3)
        self.bn2 = nn.BatchNormalization(3)
        self.bn3 = nn.BatchNormalization(3)
        self.bn4 = nn.BatchNormalization(3)
        self.bn5 = nn.BatchNormalization(3)
        self.gap = nn.GlobalAveragePooling2D(data_format='channels_last')
        self.flatten = nn.Flatten(data_format='channels_last')
        self.fc = nn.Dense(out_size, activation='softmax')
        self.attention_query = []
        self.attention_key = []
        self.attention_value = []
        self.attention_heads = attention_heads
        self.attention_size = attention_size
        for i in range(self.attention_heads):
            self.attention_query.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))
            self.attention_key.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))
            self.attention_value.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))

    def call(self, *input):
        x = input[0]
        xa = self.conv1a(x)
        xa = self.bn1a(xa)
        xa = tf.nn.relu(xa)
        xb = self.conv1b(x)
        xb = self.bn1b(xb)
        xb = tf.nn.relu(xb)
        x = tf.concat([xa, xb], 1)
        x = self.conv2(x)
        x = self.bn2(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = tf.nn.relu(x)
        x = self.conv5(x)
        x = self.bn5(x)
        x = tf.nn.relu(x)

        attn = None
        for i in range(self.attention_heads):
            # Q = self.attention_query[i](x)
            # Q = tf.transpose(Q, perm=[0, 3, 1, 2])
            # K = self.attention_key[i](x)
            # K = tf.transpose(K, perm=[0, 3, 2, 1])
            # V = self.attention_value[i](x)
            # V = tf.transpose(V, perm=[0, 3, 1, 2])
            # attention = tf.nn.softmax(tf.matmul(Q, K))
            # attention = tf.matmul(attention, V)
            Q = self.attention_query[i](x)
            K = self.attention_key[i](x)
            V = self.attention_value[i](x)
            attention = tf.nn.softmax(tf.multiply(Q, K))
            attention = tf.multiply(attention, V)
            if (attn is None):
                attn = attention
            else:
                attn = tf.concat([attn, attention], 2)
        x = tf.transpose(attn, perm=[0, 2, 3, 1])
        x = tf.nn.relu(x)
        x = self.gap(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x
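

# AACNN differs from MACNN in the attention stage: instead of the per-head
# multiplicative attention above, it passes the convolutional features to
# tensor2tensor's dot_product_area_attention, where max_area_width /
# max_area_height bound the rectangular areas that keys (mean-pooled) and
# values (sum-pooled) are formed over.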
class AACNN(Model):
    def __init__(self, height=3, width=3, out_size=4):
        super(AACNN, self).__init__()
        self.height = height
        self.width = width
        self.conv1 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')

        self.conv1a = nn.Conv2D(16, (10, 2), padding='same', data_format='channels_last')
        self.conv1b = nn.Conv2D(16, (2, 8), padding='same', data_format='channels_last')
        self.conv2 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')
        self.conv3 = nn.Conv2D(48, (3, 3), padding='same', data_format='channels_last')
        self.conv4 = nn.Conv2D(64, (3, 3), padding='same', data_format='channels_last')
        self.conv5 = nn.Conv2D(80, (3, 3), padding='same', data_format='channels_last')
        self.conv6 = nn.Conv2D(128, (3, 3), padding='same', data_format='channels_last')
        self.maxp = nn.MaxPool2D((2, 2))
        self.bn1a = nn.BatchNormalization(3)
        self.bn1b = nn.BatchNormalization(3)
        self.bn2 = nn.BatchNormalization(3)
        self.bn3 = nn.BatchNormalization(3)
        self.bn4 = nn.BatchNormalization(3)
        self.bn5 = nn.BatchNormalization(3)
        self.bn6 = nn.BatchNormalization(3)
        self.gap = nn.GlobalAveragePooling2D(data_format='channels_last')
        self.flatten = nn.Flatten(data_format='channels_last')
        self.fc = nn.Dense(out_size, activation='softmax')
        self.query = nn.Dense(20)
        self.key = nn.Dense(20)
        self.value = nn.Dense(20)

    def call(self, *input):
        x = input[0]
        xa = self.conv1a(x)
        xa = self.bn1a(xa)
        xa = tf.nn.relu(xa)
        xb = self.conv1b(x)
        xb = self.bn1b(xb)
        xb = tf.nn.relu(xb)
        x = tf.concat([xa, xb], 1)

        # x = input[0]
        # x = self.bn1a(x)
        # x = self.conv1(x)
        # x = tf.nn.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = tf.nn.relu(x)
        x = self.conv5(x)
        x = self.bn5(x)
        x = tf.nn.relu(x)

        # x = self.conv6(x)
        # x = self.bn6(x)
        # x = tf.nn.relu(x)

        q = x
        k = x
        v = x
        bias = None
        dropout_rate = 0.5

        x = area_attention.dot_product_area_attention(
            q, k, v, bias, dropout_rate, None,
            save_weights_to=None,
            dropout_broadcast_dims=None,
            max_area_width=self.width,
            max_area_height=self.height,
            area_key_mode='mean',
            area_value_mode='sum',
            training=True)

        x = self.flatten(x)
        x = self.fc(x)
        return x


if __name__ == '__main__':
    test = np.random.random((4, 40, 40, 1)).astype(np.float32)
    test = tf.convert_to_tensor(test)
    macnn = MACNN()
    y = macnn(test)
    s = tf.Session()
    s.run(tf.global_variables_initializer())  # initialize the model weights before evaluating the graph
    print(s.run(y))
--------------------------------------------------------------------------------
/models/readme.md:
--------------------------------------------------------------------------------
The folder where trained models are saved.
--------------------------------------------------------------------------------
/process_dataset.py:
--------------------------------------------------------------------------------
import glob
import os

import librosa
from tqdm import tqdm
import random
import numpy as np


def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)


LABEL_DICT1 = {
    '01': 'neutral',
    # '02': 'frustration',
    # '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    # '06': 'fearful',
    '07': 'happy',  # excitement -> happy
    # '08': 'surprised'
}

impro_or_script = 'impro'
RATE = 16000
T = 2


def build_test_list(valid_files, LABEL_DICT1, RATE, t):
    testList = []
    for i, wav_file in enumerate(tqdm(valid_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        wav_data, _ = librosa.load(wav_file, sr=RATE)
        if (t * RATE >= len(wav_data)):
            continue
        testList.append(os.path.basename(wav_file))
    return testList


def process_IEMO():
    wavs = glob.glob('e:/test/iemocap/*.wav')
    transes = glob.glob('e:/test/iemo_t/trans/*.txt')
    write_list = []
    for wav in tqdm(wavs):
        wav_name = os.path.basename(wav)
        wav_name_split = wav_name.split('.')[0].split('-')
        if (wav_name_split[2] not in LABEL_DICT1):
            continue
        if ('script' in wav_name):
            txt_name = wav_name_split[0] + '_' + wav_name_split[1] + '_' + wav_name_split[-1].split('_')[0] + '.txt'
        else:
            txt_name = wav_name_split[0] + '_' + wav_name_split[1] + '.txt'
        trans_name = None
        for trans in transes:
            if (os.path.basename(trans) == txt_name):
                trans_name = trans
                break
        if (trans_name is not None):
            f_trans = open(trans_name)
            fr_trans = f_trans.readlines()
            FIND = False
            for l_trans in fr_trans:
                if (l_trans.split(' ')[0] == wav_name_split[0] + '_' + wav_name_split[1] + '_' + wav_name_split[-1]):
                    write_list.append((l_trans.split(' ')[0], l_trans.split(':')[-1].replace('\n', ''), wav_name, wav_name_split[2]))
                    FIND = True
                    break
            if (FIND == False):
                print('Cannot find :' + wav_name)
            f_trans.close()
        else:
            print('Cannot find :' + txt_name)
    with open('IEMOCAP.csv', 'w') as f:
        for wl in write_list:
            for w in range(len(wl)):
                # f.write('\"' + wl[w] + '\"')
                f.write(wl[w])
                if (w < len(wl) - 1):
                    f.write('\t')
                else:
                    f.write('\n')


if __name__ == '__main__':
    SEED = [111111]
    # SEED = [123456, 0, 999999, 987654]
    for seed in SEED:
        setup_seed(seed)
        process_IEMO()
        with open('IEMOCAP.csv', 'r') as f:
            fr = f.readlines()
        n = len(fr)
        trainAndDev_files = []
        train_files = []
        dev_files = []
        test_files = []
        trainAndDev_indices = list(np.random.choice(range(n), int(n * 0.8), replace=False))
        test_indices = list(set(range(n)) - set(trainAndDev_indices))
        for i in trainAndDev_indices:
            trainAndDev_files.append(fr[i])
        for i in test_indices:
            test_files.append(fr[i])
        n = len(trainAndDev_files)
        train_indices = list(np.random.choice(range(n), int(n * 0.875), replace=False))
        dev_indices = list(set(range(n)) - set(train_indices))
        for i in train_indices:
            train_files.append(trainAndDev_files[i])
        for i in dev_indices:
            dev_files.append(trainAndDev_files[i])

        data_dir = 'e:/test/iemocap'
        valid_files = []
        for line in test_files:
            valid_files.append(data_dir + '/' + line.split('\t')[2])
        test_wav = build_test_list(valid_files, LABEL_DICT1, RATE, T)
        bitest = []
        for line in test_files:
            if (line.split('\t')[2] in test_wav):
                bitest.append(line)
        with open('./IEMOCAP_bitest_{}.csv'.format(seed), 'w') as f:
            for line in bitest:
                f.write(line)

        with open('IEMOCAP_train_{}.csv'.format(seed), 'w') as f:
            for l in train_files:
                f.write(l)
        with open('IEMOCAP_dev_{}.csv'.format(seed), 'w') as f:
            for l in dev_files:
                f.write(l)
        with open('IEMOCAP_test_{}.csv'.format(seed), 'w') as f:
            for l in test_files:
                f.write(l)
--------------------------------------------------------------------------------
/vtlpAug.py:
--------------------------------------------------------------------------------
import nlpaug.augmenter.audio as naa
import librosa
import glob
from tqdm import tqdm
import os

wlist = glob.glob(r'../IEMOCAP/*.wav')
targetDir = '../IEMOCAP/'
aug = naa.VtlpAug(16000, zone=(0.0, 1.0), coverage=1, duration=None, fhi=4800, factor=(0.8, 1.2))
for w in tqdm(wlist):
    for i in range(7):
        wav, _ = librosa.load(w, 16000)
        wavAug = aug.augment(wav)
        wavName = os.path.basename(w)
        librosa.output.write_wav(targetDir + wavName + '.' + str(i + 1), wavAug, 16000)
--------------------------------------------------------------------------------
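
Note: the repository ships no standalone inference script; the commented-out block at the end of train() in area_train.py sketches one. Below is a minimal self-contained version of that sketch. The checkpoint name, area sizes, dummy input shape, and feature settings are assumptions and must match whatever was used at training time.

import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

import features
import model as MODEL

RATE = 16000
MODEL_NAME = 'AUG_area1x1_seed999999'  # hypothetical checkpoint name under ./models/

model = MODEL.AACNN(1, 1)  # same area_height / area_width as in training
extractor = features.FeatureExtractor(rate=RATE)

# Build the variables with a dummy forward pass, then restore the trained weights.
model(tf.zeros([1, 128, 81, 1], tf.float32))  # assumed melspectrogram input shape
model.load_weights('./models/{}'.format(MODEL_NAME))


def predict(segments):
    # segments: np.ndarray of shape (num_segments, 2 * RATE) cut from one utterance
    x = extractor.get_features('melspectrogram', segments)
    x = tf.cast(tf.expand_dims(x, -1), tf.float32)
    out = tf.reduce_mean(model(x), 0)  # average the softmax outputs over segments
    return int(np.argmax(out.numpy()))  # 0=neutral, 1=happy, 2=sad, 3=angry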