├── README.md
├── area_train.py
├── features.py
├── model.py
├── models
│   └── readme.md
├── process_dataset.py
└── vtlpAug.py

/README.md:
--------------------------------------------------------------------------------
# area_attention_for_SER
Speech emotion recognition with area attention and VTLP

1. Use process_dataset.py to split the IEMOCAP dataset into train and test sets and generate the CSV file lists.
2. Use vtlpAug.py to generate VTLP-augmented copies of the audio.
3. Use area_train.py to train and save the models.
--------------------------------------------------------------------------------
/area_train.py:
--------------------------------------------------------------------------------
import glob
import os
import pickle
import random
import time
import math
import logging
import datetime
import tensorflow as tf

import numpy as np
import librosa
from tqdm import tqdm


def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    # tf.random.set_seed(seed)

tf.enable_eager_execution()
# SEED = [111111, 123456, 0]
SEED = [999999, 987654]
# SEED = [111111, 123456, 0, 999999, 987654]
attention_head = 4
attention_hidden = 32
area_height = 1
area_width = 1

import features
import model as MODEL

Epochs = 100
BATCH_SIZE = 32
learning_rate = 0.0001
T_stride = 2
T_overlap = T_stride / 2
overlapTime = {
    'neutral': 1,
    'happy': 1,
    'sad': 1,
    'angry': 1,
}
FEATURES_TO_USE = 'melspectrogram'  # {'mfcc', 'logfbank', 'fbank', 'spectrogram', 'melspectrogram'}
featuresExist = True
impro_or_script = 'impro'
featuresFileName = 'features_{}_{}.pkl'.format(FEATURES_TO_USE, impro_or_script)
toSaveFeatures = True
WAV_PATH = "E:/Test/IEMOCAP/"
RATE = 16000

LABEL = {
    'neutral': 0,
    'happy': 1,
    'sad': 2,
    'angry': 3,
}

LABEL_DICT1 = {
    '01': 'neutral',
    # '02': 'frustration',
    # '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    # '06': 'fearful',
    '07': 'happy',  # excitement -> happy
    # '08': 'surprised'
}


def process_data(path, t=2, train_overlap=1, val_overlap=1.6, RATE=16000):
    path = path.rstrip('/')
    wav_files = glob.glob(path + '/*.wav')

    n = len(wav_files)
    train_files = []
    valid_files = []
    train_indices = list(np.random.choice(range(n), int(n * 0.8), replace=False))
    valid_indices = list(set(range(n)) - set(train_indices))
    for i in train_indices:
        train_files.append(wav_files[i])
    for i in valid_indices:
        valid_files.append(wav_files[i])

    print("constructing meta dictionary for {}...".format(path))
    train_X, train_y = train_data_process(train_files, LABEL_DICT1, RATE, t, train_overlap)

    val_dict = valid_data_process(valid_files, LABEL_DICT1, RATE, t, val_overlap)

    return train_X, train_y, val_dict


def valid_data_process(valid_files, LABEL_DICT1, RATE, t, val_overlap):
    val_dict = {}
    if (val_overlap >= t):
        val_overlap = t / 2
    for i, wav_file in enumerate(tqdm(valid_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        label = LABEL_DICT1[label]
        wav_data, _ = librosa.load(wav_file, sr=RATE)
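        # Slice the utterance into t-second windows whose starts are spaced
        # (t - val_overlap) seconds apart; clips shorter than t seconds are skipped.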
        X1 = []
        y1 = []
        index = 0
        if (t * RATE >= len(wav_data)):
            continue
        while (index + t * RATE < len(wav_data)):
            X1.append(wav_data[int(index):int(index + t * RATE)])
            y1.append(LABEL[label])
            index += int((t - val_overlap) * RATE)

        X1 = np.array(X1)
        val_dict[i] = {
            'X': X1,
            'y': y1,
            'path': wav_file
        }
    return val_dict


def train_data_process(train_files, LABEL_DICT1, RATE, t, train_overlap):
    meta_dict = {}
    for i, wav_file in enumerate(tqdm(train_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        label = LABEL_DICT1[label]

        wav_data, _ = librosa.load(wav_file, sr=RATE)
        X1 = []
        y1 = []
        index = 0
        if (t * RATE >= len(wav_data)):
            continue

        while (index + t * RATE < len(wav_data)):
            X1.append(wav_data[int(index):int(index + t * RATE)])
            y1.append(LABEL[label])
            assert t - train_overlap > 0
            index += int((t - train_overlap) * RATE / overlapTime[label])

        X1 = np.array(X1)
        meta_dict[i] = {
            'X': X1,
            'y': y1,
            'path': wav_file
        }
    print("building X, y...")
    train_X = []
    train_y = []
    for k in meta_dict:
        train_X.append(meta_dict[k]['X'])
        train_y += meta_dict[k]['y']
    train_X = np.row_stack(train_X)
    train_y = np.array(train_y)
    assert len(train_X) == len(train_y), "X length and y length must match! X shape: {}, y shape: {}".format(
        train_X.shape, train_y.shape)
    return train_X, train_y


def train(SEED, area_width, area_height, AUG=True):
    setup_seed(SEED)
    MODEL_NAME = 'AUG_area{}x{}_seed{}'.format(area_width, area_height, SEED)
    data_dir = '/program/xumingke/IEMOCAP/'
    train_files = []
    train_files2 = []
    valid_files = []
    with open(data_dir + '/IEMOCAP_train_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            train_files.append(data_dir + '/' + line.split('\t')[2])
            if (AUG):
                for i in range(0):
                    train_files.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 1))
                for i in range(1):
                    train_files2.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 5))
    with open(data_dir + '/IEMOCAP_dev_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            train_files.append(data_dir + '/' + line.split('\t')[2])
            if (AUG):
                for i in range(0):
                    train_files.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 1))
                for i in range(1):
                    train_files2.append(data_dir + '/' + line.split('\t')[2] + '.' + str(i + 5))
    with open(data_dir + '/IEMOCAP_test_{}.csv'.format(SEED)) as f:
        fr = f.readlines()
        for line in fr:
            valid_files.append(data_dir + '/' + line.split('\t')[2])

    train_X, train_y = train_data_process(train_files, LABEL_DICT1, RATE, T_stride, T_overlap)
    train_X2, train_y2 = train_data_process(train_files2, LABEL_DICT1, RATE, T_stride, T_overlap)
    train_y = tf.concat([train_y, train_y2], 0)
    val_dict = valid_data_process(valid_files, LABEL_DICT1, RATE, T_stride, 1.6)
    feature_extractor = features.FeatureExtractor(rate=RATE)

    train_X_features = feature_extractor.get_features(FEATURES_TO_USE, train_X)
    train_X_features2 = feature_extractor.get_features(FEATURES_TO_USE, train_X2)
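    # Shape note (assuming librosa defaults, n_mels=128): each 2-second segment at
    # 16 kHz is 32000 samples, so 'melspectrogram' with n_fft=800 and hop_length=400
    # yields roughly a (128, 81) matrix per segment before the channel axis is added below.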

    valid_features_dict = {}
    for _, i in enumerate(val_dict):
        X1 = feature_extractor.get_features(FEATURES_TO_USE, val_dict[i]['X'])
        valid_features_dict[i] = {
            'X': X1,
            'y': val_dict[i]['y']
        }

    train_X_features = tf.expand_dims(train_X_features, -1)
    train_X_features2 = tf.expand_dims(train_X_features2, -1)
    train_X_features = tf.concat([train_X_features, train_X_features2], 0)
    train_X_features = tf.cast(train_X_features, tf.float32)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (train_X_features, train_y)).shuffle(train_X_features.shape[0]).batch(BATCH_SIZE)

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, decay=1e-6)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    model = MODEL.AACNN(area_height, area_width)

    def train_step(images, labels):
        with tf.GradientTape() as tape:
            predictions = model(images)
            loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)
        train_accuracy(labels, predictions)

    def test_step(images, labels):
        predictions = model(images)
        t_loss = loss_object(labels, predictions)

        test_loss(t_loss)
        test_accuracy(labels, predictions)

    print('training...')
    logging.warning('training seed={}'.format(SEED))
    maxWA = 0
    maxUA = 0
    maxACC = 0
    for epoch in range(Epochs):
        # Reset the metrics at the start of each epoch.
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()
        # tq = tqdm(total=len(train_y))
        for step, (images, labels) in enumerate(train_ds):
            train_step(images, labels)
            # tq.update(BATCH_SIZE)
        # tq.close()
        template = 'Epoch {}, Loss: {}, Accuracy: {}\n'
        print(template.format(epoch + 1,
                              train_loss.result(),
                              train_accuracy.result() * 100,
                              ))
        logging.warning(template.format(epoch + 1,
                                        train_loss.result(),
                                        train_accuracy.result() * 100,
                                        ))

        correct = 0
        label_correct = [0, 0, 0, 0]
        label_total = [0, 0, 0, 0]

        for _, i in enumerate(valid_features_dict):
            x, y = valid_features_dict[i]['X'], valid_features_dict[i]['y']
            x = tf.expand_dims(x, -1)
            x = tf.cast(x, tf.float32)
            y = np.array([y[0]])
            out = model(x)
            out = tf.reduce_mean(out, 0, keepdims=True)

            label_total[y[0]] += 1
            if (test_accuracy(y, out) > 0):
                correct += 1
                label_correct[y[0]] += 1
            test_accuracy.reset_states()

        label_acc = [label_correct[0] / label_total[0],
                     label_correct[1] / label_total[1],
                     label_correct[2] / label_total[2],
                     label_correct[3] / label_total[3]]
        UA = (label_acc[0] + label_acc[1] + label_acc[2] + label_acc[3]) / 4
        if (correct / len(valid_features_dict) > maxWA):
            maxWA = correct / len(valid_features_dict)
        if (UA > maxUA):
            maxUA = UA
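        # WA (weighted accuracy) is utterance-level accuracy over all validation
        # files; UA (unweighted accuracy) is the mean of the four per-class
        # accuracies. A checkpoint is saved whenever WA + UA reaches a new maximum.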
        ACC = (correct / len(valid_features_dict)) + UA
        if (ACC > maxACC):
            print('saving model (WA:{},UA:{})\n'.format(correct / len(valid_features_dict), UA))
            logging.warning('saving model (WA:{},UA:{})\n'.format(correct / len(valid_features_dict), UA))
            model.save_weights('./models/{}'.format(MODEL_NAME))
            maxACC = ACC
        print('label_correct:{}\nUA:{}'.format(label_correct, label_acc))
        print('maxWA:{}\nmaxUA:{}'.format(maxWA, maxUA))
        logging.warning('label_correct:{}\nUA:{}'.format(label_correct, label_acc))
        logging.warning('maxWA:{}\nmaxUA:{}'.format(maxWA, maxUA))

    print('end training on seed:{}'.format(SEED))
    logging.warning('end training on seed:{}'.format(SEED))
    del model

    # model = MODEL.AACNN()
    # model.load_weights('./models/{}'.format(MODEL_NAME))
    #
    # result = []
    # correct = 0
    # for _, i in enumerate(valid_features_dict):
    #     x, y = valid_features_dict[i]['X'], valid_features_dict[i]['y']
    #     x = tf.expand_dims(x, -1)
    #     x = tf.cast(x, tf.float32)
    #     y = np.array([y[0]])
    #     out = model(x)
    #     out = tf.reduce_mean(out, 0, keepdims=True).numpy()
    #     if (np.argmax(out) == y):
    #         correct += 1
    #     result.append(out)
    # print(correct)
    # result = np.array(result)
    # np.save('./test/SERtest_{}.npy'.format(SEED), result)


if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)  # master switch for the log level
    # Create a handler that writes the log to a file.
    log_name = 'area{}x{}_AUG2.log'.format(area_width, area_height)
    logfile = log_name
    fh = logging.FileHandler(logfile, mode='w')
    fh.setLevel(logging.DEBUG)  # log level for the file handler
    # Define the handler's output format.
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    # Attach the handler to the logger.
    logger.addHandler(fh)

    for seed in SEED:
        train(seed, area_width, area_height, True)

--------------------------------------------------------------------------------
/features.py:
--------------------------------------------------------------------------------
import glob
import os
import pickle

import torch

from python_speech_features import logfbank, fbank, sigproc
import numpy as np
import librosa
from tqdm import tqdm


class FeatureExtractor(object):
    def __init__(self, rate):
        self.rate = rate

    def get_features(self, features_to_use, X):
        X_features = None
        accepted_features_to_use = ('logfbank', 'mfcc', 'fbank', 'melspectrogram', 'spectrogram', 'pase')
        if features_to_use not in accepted_features_to_use:
            raise NotImplementedError("{} not in {}!".format(features_to_use, accepted_features_to_use))
        if features_to_use == 'logfbank':
            X_features = self.get_logfbank(X)
        if features_to_use == 'mfcc':
            X_features = self.get_mfcc(X, 26)
        if features_to_use == 'fbank':
            X_features = self.get_fbank(X)
        if features_to_use == 'melspectrogram':
            X_features = self.get_melspectrogram(X)
        if features_to_use == 'spectrogram':
            X_features = self.get_spectrogram(X)
        if features_to_use == 'pase':
            X_features = self.get_Pase(X)
        return X_features
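
    # Usage sketch (illustrative names, not part of the original code):
    #   extractor = FeatureExtractor(rate=16000)
    #   feats = extractor.get_features('melspectrogram', segments)
    # where `segments` is a 2-D np.ndarray of shape (num_segments, num_samples),
    # as produced by train_data_process / valid_data_process in area_train.py.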

    def get_logfbank(self, X):
        def _get_logfbank(x):
            out = logfbank(signal=x, samplerate=self.rate, winlen=0.040, winstep=0.010, nfft=1024, highfreq=4000,
                           nfilt=40)
            return out

        X_features = np.apply_along_axis(_get_logfbank, 1, X)
        return X_features

    def get_mfcc(self, X, n_mfcc=13):
        def _get_mfcc(x):
            mfcc_data = librosa.feature.mfcc(x, sr=self.rate, n_mfcc=n_mfcc)
            # delta = librosa.feature.delta(mfcc_data)
            # delta_delta = librosa.feature.delta(mfcc_data, order=2)
            # mfcc_data = np.expand_dims(mfcc_data, 0)
            # delta = np.expand_dims(delta, 0)
            # delta_delta = np.expand_dims(delta_delta, 0)
            # out = np.concatenate((mfcc_data, delta, delta_delta), 0)
            return mfcc_data

        X_features = np.apply_along_axis(_get_mfcc, 1, X)
        return X_features

    def get_fbank(self, X):
        def _get_fbank(x):
            out, _ = fbank(signal=x, samplerate=self.rate, winlen=0.040, winstep=0.010, nfft=1024)
            return out

        X_features = np.apply_along_axis(_get_fbank, 1, X)
        return X_features

    def get_melspectrogram(self, X):
        def _get_melspectrogram(x):
            mel = librosa.feature.melspectrogram(y=x, sr=self.rate, n_fft=800, hop_length=400)[np.newaxis, :]
            out = np.log10(mel).squeeze()
            return out

        X_features = np.apply_along_axis(_get_melspectrogram, 1, X)
        return X_features

    def get_spectrogram(self, X):
        def _get_spectrogram(x):
            frames = sigproc.framesig(x, 640, 160)
            out = sigproc.logpowspec(frames, NFFT=3198)
            out = out.swapaxes(0, 1)
            return out[:400]

        X_features = np.apply_along_axis(_get_spectrogram, 1, X)
        return X_features

    # def get_Pase(self, X):
    #     pase = wf_builder('PASE/cfg/PASE.cfg')
    #     pase.eval()
    #     pase.load_pretrained('PASE/PASE.ckpt', load_last=True, verbose=True)
    #     # tq = tqdm(total=X.shape[0])
    #     def _get_spectrogram(x):
    #         x = torch.from_numpy(x)
    #         x = x.unsqueeze(0).unsqueeze(0)
    #         y = pase(x).detach().numpy()
    #         # tq.update(1)
    #         return y
    #
    #     X_features = np.apply_along_axis(_get_spectrogram, 1, X)
    #     return X_features

    def get_Pase(self, X):
        return X

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tensorflow.keras.layers as nn
from tensorflow.keras import Model
import numpy as np
import tensor2tensor.layers.area_attention as area_attention


class MACNN(Model):
    def __init__(self, attention_heads=4, attention_size=32, out_size=4):
        super(MACNN, self).__init__()
        self.conv1a = nn.Conv2D(16, (10, 2), padding='same', data_format='channels_last')
        self.conv1b = nn.Conv2D(16, (2, 8), padding='same', data_format='channels_last')
        self.conv2 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')
        self.conv3 = nn.Conv2D(48, (3, 3), padding='same', data_format='channels_last')
        self.conv4 = nn.Conv2D(64, (3, 3), padding='same', data_format='channels_last')
        self.conv5 = nn.Conv2D(80, (3, 3), padding='same', data_format='channels_last')
        self.maxp = nn.MaxPool2D((2, 2))
        self.bn1a = nn.BatchNormalization(3)
        self.bn1b = nn.BatchNormalization(3)
        self.bn2 = nn.BatchNormalization(3)
        self.bn3 = nn.BatchNormalization(3)
        self.bn4 = nn.BatchNormalization(3)
        self.bn5 = nn.BatchNormalization(3)
        self.gap = nn.GlobalAveragePooling2D(data_format='channels_last')
        self.flatten = nn.Flatten(data_format='channels_last')
        self.fc = nn.Dense(out_size, activation='softmax')
        self.attention_query = []
        self.attention_key = []
        self.attention_value = []
        self.attention_heads = attention_heads
        self.attention_size = attention_size
        for i in range(self.attention_heads):
            self.attention_query.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))
            self.attention_key.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))
            self.attention_value.append(nn.Conv2D(self.attention_size, 1, padding='same', data_format='channels_last'))

    def call(self, *input):
        x = input[0]
        xa = self.conv1a(x)
        xa = self.bn1a(xa)
        xa = tf.nn.relu(xa)
        xb = self.conv1b(x)
        xb = self.bn1b(xb)
        xb = tf.nn.relu(xb)
        x = tf.concat([xa, xb], 1)
        x = self.conv2(x)
        x = self.bn2(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = tf.nn.relu(x)
        x = self.conv5(x)
        x = self.bn5(x)
        x = tf.nn.relu(x)

        attn = None
        for i in range(self.attention_heads):
            # Q = self.attention_query[i](x)
            # Q = tf.transpose(Q, perm=[0, 3, 1, 2])
            # K = self.attention_key[i](x)
            # K = tf.transpose(K, perm=[0, 3, 2, 1])
            # V = self.attention_value[i](x)
            # V = tf.transpose(V, perm=[0, 3, 1, 2])
            # attention = tf.nn.softmax(tf.matmul(Q, K))
            # attention = tf.matmul(attention, V)
            Q = self.attention_query[i](x)
            K = self.attention_key[i](x)
            V = self.attention_value[i](x)
            attention = tf.nn.softmax(tf.multiply(Q, K))
            attention = tf.multiply(attention, V)
            if (attn is None):
                attn = attention
            else:
                attn = tf.concat([attn, attention], 2)
        x = tf.transpose(attn, perm=[0, 2, 3, 1])
        x = tf.nn.relu(x)
        x = self.gap(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x
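

# AACNN differs from MACNN in the attention stage: instead of the per-head
# multiplicative attention above, it passes the convolutional features to
# tensor2tensor's dot_product_area_attention, where max_area_width /
# max_area_height bound the rectangular areas that keys (mean-pooled) and
# values (sum-pooled) are formed over.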
class AACNN(Model):
    def __init__(self, height=3, width=3, out_size=4):
        super(AACNN, self).__init__()
        self.height = height
        self.width = width
        self.conv1 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')

        self.conv1a = nn.Conv2D(16, (10, 2), padding='same', data_format='channels_last')
        self.conv1b = nn.Conv2D(16, (2, 8), padding='same', data_format='channels_last')
        self.conv2 = nn.Conv2D(32, (3, 3), padding='same', data_format='channels_last')
        self.conv3 = nn.Conv2D(48, (3, 3), padding='same', data_format='channels_last')
        self.conv4 = nn.Conv2D(64, (3, 3), padding='same', data_format='channels_last')
        self.conv5 = nn.Conv2D(80, (3, 3), padding='same', data_format='channels_last')
        self.conv6 = nn.Conv2D(128, (3, 3), padding='same', data_format='channels_last')
        self.maxp = nn.MaxPool2D((2, 2))
        self.bn1a = nn.BatchNormalization(3)
        self.bn1b = nn.BatchNormalization(3)
        self.bn2 = nn.BatchNormalization(3)
        self.bn3 = nn.BatchNormalization(3)
        self.bn4 = nn.BatchNormalization(3)
        self.bn5 = nn.BatchNormalization(3)
        self.bn6 = nn.BatchNormalization(3)
        self.gap = nn.GlobalAveragePooling2D(data_format='channels_last')
        self.flatten = nn.Flatten(data_format='channels_last')
        self.fc = nn.Dense(out_size, activation='softmax')
        self.query = nn.Dense(20)
        self.key = nn.Dense(20)
        self.value = nn.Dense(20)

    def call(self, *input):
        x = input[0]
        xa = self.conv1a(x)
        xa = self.bn1a(xa)
        xa = tf.nn.relu(xa)
        xb = self.conv1b(x)
        xb = self.bn1b(xb)
        xb = tf.nn.relu(xb)
        x = tf.concat([xa, xb], 1)

        # x = input[0]
        # x = self.bn1a(x)
        # x = self.conv1(x)
        # x = tf.nn.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = tf.nn.relu(x)
        x = self.maxp(x)
        x = self.conv4(x)
        x = self.bn4(x)
        x = tf.nn.relu(x)
        x = self.conv5(x)
        x = self.bn5(x)
        x = tf.nn.relu(x)

        # x = self.conv6(x)
        # x = self.bn6(x)
        # x = tf.nn.relu(x)

        q = x
        k = x
        v = x
        bias = None
        dropout_rate = 0.5

        x = area_attention.dot_product_area_attention(
            q, k, v, bias, dropout_rate, None,
            save_weights_to=None,
            dropout_broadcast_dims=None,
            max_area_width=self.width,
            max_area_height=self.height,
            area_key_mode='mean',
            area_value_mode='sum',
            training=True)

        x = self.flatten(x)
        x = self.fc(x)
        return x


if __name__ == '__main__':
    test = np.random.random((4, 40, 40, 1)).astype(np.float32)
    test = tf.convert_to_tensor(test)
    macnn = MACNN()
    y = macnn(test)
    s = tf.Session()
    s.run(tf.global_variables_initializer())  # initialize the model weights before evaluating the graph
    print(s.run(y))
--------------------------------------------------------------------------------
/models/readme.md:
--------------------------------------------------------------------------------
The folder where trained models are saved.
--------------------------------------------------------------------------------
/process_dataset.py:
--------------------------------------------------------------------------------
import glob
import os

import librosa
from tqdm import tqdm
import random
import numpy as np


def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)


LABEL_DICT1 = {
    '01': 'neutral',
    # '02': 'frustration',
    # '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    # '06': 'fearful',
    '07': 'happy',  # excitement -> happy
    # '08': 'surprised'
}

impro_or_script = 'impro'
RATE = 16000
T = 2


def build_test_list(valid_files, LABEL_DICT1, RATE, t):
    testList = []
    for i, wav_file in enumerate(tqdm(valid_files)):
        label = str(os.path.basename(wav_file).split('-')[2])
        if (label not in LABEL_DICT1):
            continue
        if (impro_or_script != 'all' and (impro_or_script not in wav_file)):
            continue
        wav_data, _ = librosa.load(wav_file, sr=RATE)
        if (t * RATE >= len(wav_data)):
            continue
        testList.append(os.path.basename(wav_file))
    return testList


def process_IEMO():
    wavs = glob.glob('e:/test/iemocap/*.wav')
    transes = glob.glob('e:/test/iemo_t/trans/*.txt')
    write_list = []
    for wav in tqdm(wavs):
        wav_name = os.path.basename(wav)
        wav_name_split = wav_name.split('.')[0].split('-')
        if (wav_name_split[2] not in LABEL_DICT1):
            continue
        if ('script' in wav_name):
            txt_name = wav_name_split[0] + '_' + wav_name_split[1] + '_' + wav_name_split[-1].split('_')[0] + '.txt'
        else:
            txt_name = wav_name_split[0] + '_' + wav_name_split[1] + '.txt'
        trans_name = None
        for trans in transes:
            if (os.path.basename(trans) == txt_name):
                trans_name = trans
                break
        if (trans_name is not None):
            f_trans = open(trans_name)
            fr_trans = f_trans.readlines()
            FIND = False
            for l_trans in fr_trans:
                if (l_trans.split(' ')[0] == wav_name_split[0] + '_' + wav_name_split[1] + '_' + wav_name_split[-1]):
                    write_list.append((l_trans.split(' ')[0], l_trans.split(':')[-1].replace('\n', ''), wav_name, wav_name_split[2]))
                    FIND = True
                    break
            if (FIND == False):
                print('Cannot find :' + wav_name)
            f_trans.close()
        else:
            print('Cannot find :' + txt_name)
    with open('IEMOCAP.csv', 'w') as f:
        for wl in write_list:
            for w in range(len(wl)):
                # f.write('\"' + wl[w] + '\"')
                f.write(wl[w])
                if (w < len(wl) - 1):
                    f.write('\t')
                else:
                    f.write('\n')


if __name__ == '__main__':
    SEED = [111111]
    # SEED = [123456, 0, 999999, 987654]
    for seed in SEED:
        setup_seed(seed)
        process_IEMO()
        with open('IEMOCAP.csv', 'r') as f:
            fr = f.readlines()
        n = len(fr)
        trainAndDev_files = []
        train_files = []
        dev_files = []
        test_files = []
        trainAndDev_indices = list(np.random.choice(range(n), int(n * 0.8), replace=False))
        test_indices = list(set(range(n)) - set(trainAndDev_indices))
        for i in trainAndDev_indices:
            trainAndDev_files.append(fr[i])
        for i in test_indices:
            test_files.append(fr[i])
        n = len(trainAndDev_files)
        train_indices = list(np.random.choice(range(n), int(n * 0.875), replace=False))
        dev_indices = list(set(range(n)) - set(train_indices))
        for i in train_indices:
            train_files.append(trainAndDev_files[i])
        for i in dev_indices:
            dev_files.append(trainAndDev_files[i])

        data_dir = 'e:/test/iemocap'
        valid_files = []
        for line in test_files:
            valid_files.append(data_dir + '/' + line.split('\t')[2])
        test_wav = build_test_list(valid_files, LABEL_DICT1, RATE, T)
        bitest = []
        for line in test_files:
            if (line.split('\t')[2] in test_wav):
                bitest.append(line)
        with open('./IEMOCAP_bitest_{}.csv'.format(seed), 'w') as f:
            for line in bitest:
                f.write(line)

        with open('IEMOCAP_train_{}.csv'.format(seed), 'w') as f:
            for l in train_files:
                f.write(l)
        with open('IEMOCAP_dev_{}.csv'.format(seed), 'w') as f:
            for l in dev_files:
                f.write(l)
        with open('IEMOCAP_test_{}.csv'.format(seed), 'w') as f:
            for l in test_files:
                f.write(l)
--------------------------------------------------------------------------------
/vtlpAug.py:
--------------------------------------------------------------------------------
import nlpaug.augmenter.audio as naa
import librosa
import glob
from tqdm import tqdm
import os

wlist = glob.glob(r'../IEMOCAP/*.wav')
targetDir = '../IEMOCAP/'
aug = naa.VtlpAug(16000, zone=(0.0, 1.0), coverage=1, duration=None, fhi=4800, factor=(0.8, 1.2))
for w in tqdm(wlist):
    for i in range(7):
        wav, _ = librosa.load(w, 16000)
        wavAug = aug.augment(wav)
        wavName = os.path.basename(w)
        librosa.output.write_wav(targetDir + wavName + '.' + str(i + 1), wavAug, 16000)
--------------------------------------------------------------------------------
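
Note: the repository ships no standalone inference script; the commented-out block at the end of train() in area_train.py sketches one. Below is a minimal self-contained version of that sketch. The checkpoint name, area sizes, dummy input shape, and feature settings are assumptions and must match whatever was used at training time.

import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

import features
import model as MODEL

RATE = 16000
MODEL_NAME = 'AUG_area1x1_seed999999'  # hypothetical checkpoint name under ./models/

model = MODEL.AACNN(1, 1)  # same area_height / area_width as in training
extractor = features.FeatureExtractor(rate=RATE)

# Build the variables with a dummy forward pass, then restore the trained weights.
model(tf.zeros([1, 128, 81, 1], tf.float32))  # assumed melspectrogram input shape
model.load_weights('./models/{}'.format(MODEL_NAME))


def predict(segments):
    # segments: np.ndarray of shape (num_segments, 2 * RATE) cut from one utterance
    x = extractor.get_features('melspectrogram', segments)
    x = tf.cast(tf.expand_dims(x, -1), tf.float32)
    out = tf.reduce_mean(model(x), 0)  # average the softmax outputs over segments
    return int(np.argmax(out.numpy()))  # 0=neutral, 1=happy, 2=sad, 3=angry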