├── LICENSE
├── README.md
├── Score_List.csv
├── model.py
├── output
├── requirements.txt
├── test.py
├── train.py
└── utils.py

/LICENSE:
--------------------------------------------------------------------------------
StrengthNet: Deep Learning-based Emotion Strength Assessment for Emotional Speech Synthesis


The MIT License (MIT)

Copyright (c) 2021 Rui Liu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# StrengthNet

Implementation of "Accurate Emotion Strength Assessment for Seen and Unseen Speech Based on Data-Driven Deep Learning" (INTERSPEECH 2022).

https://arxiv.org/abs/2206.07229


## Dependency

Ubuntu 18.04.5 LTS

- GPU: Quadro RTX 6000
- Driver version: 450.80.02
- CUDA version: 11.0

Python 3.5

- tensorflow-gpu 2.0.0b1 (cudnn=7.6.0)
- scipy
- pandas
- matplotlib
- librosa

### Environment set-up

For example,

```
conda create -n strengthnet python=3.5
conda activate strengthnet
pip install -r requirements.txt
conda install cudnn=7.6.0
```

## Usage

1. Run `python utils.py` to extract the `.wav` files to `.h5` features;

2. Run `python train.py` to train the CNN-BLSTM-based StrengthNet.


### Evaluating new samples

1. Put the waveforms you wish to evaluate in a folder, for example `<wav_dir>/`.

2. Run `python test.py --rootdir <wav_dir>/`.

This script evaluates all the `.wav` files in `<wav_dir>/` and writes the results to `<wav_dir>/StrengthNet_result_raw.txt`.

By default, the `output/strengthnet.h5` pretrained model is used.
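You can also score waveforms programmatically. Below is a minimal sketch distilled from `test.py`; the input file name `sample.wav` is a placeholder, and the weights path assumes the default checkpoint location.

```
import numpy as np

import utils
from model import CNN_BLSTM

# Build the model and load pretrained weights (path is an example).
model = CNN_BLSTM().build()
model.load_weights('./output/strengthnet.h5')

# 80-dim mel spectrogram of shape (T, 80), wrapped in a batch axis.
mel = utils.get_melspectrograms('sample.wav')  # placeholder input file
mel = np.reshape(mel, (1, mel.shape[0], utils.n_mels))

strength, frame_scores, emo_class = model.predict(mel, batch_size=1)
print('utterance strength: {:.3f}'.format(strength[0][0]))
```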
## Citation

If you find this work useful in your research, please consider citing:

```
@inproceedings{liu22i_interspeech,
  author={Rui Liu and Berrak Sisman and Björn Schuller and Guanglai Gao and Haizhou Li},
  title={{Accurate Emotion Strength Assessment for Seen and Unseen Speech Based on Data-Driven Deep Learning}},
  year=2022,
  booktitle={Proc. Interspeech 2022},
  pages={5493--5497},
  doi={10.21437/Interspeech.2022-534}
}
```

## Resources

The [ESD corpus](https://github.com/HLTSingapore/Emotional-Speech-Data) is released by the [HLT lab](https://www.eng.nus.edu.sg/ece/hlt/), NUS, Singapore.

The strength scores for the English samples of the ESD corpus are available [here](https://github.com/ttslr/StrengthNet/blob/main/Score_List.csv).
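The score list is a plain two-column CSV pairing a relative wav path with its strength score, as consumed by `utils.read_list` and the training code. A minimal loading sketch, assuming that layout (the pandas column names are our own):

```
import pandas as pd

# Score_List.csv has no header row; the training code splits each
# line on ',' into a relative wav path and a float strength score.
scores = pd.read_csv('Score_List.csv', header=None, names=['path', 'strength'])

# The second path component is the emotion, e.g. '0011/Angry/...'.
scores['emotion'] = scores['path'].str.split('/').str[1]
print(scores.groupby('emotion')['strength'].describe())
```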
## Acknowledgements

MOSNet: [https://github.com/lochenchou/MOSNet](https://github.com/lochenchou/MOSNet)

Relative Attributes: [Relative Attributes](https://github.com/chaitanya100100/Relative-Attributes-Zero-Shot-Learning)


## License

This work is released under the MIT License (see the LICENSE file for details).
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Dense, Dropout, Conv2D
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.constraints import max_norm


class CNN_BLSTM(object):

    def __init__(self):
        print('CNN_BLSTM init')

    def build(self):
        # Input: batch of 80-dim mel spectrograms with a variable number of frames.
        _input = keras.Input(shape=(None, 80))
        n_classes = 4  # emo_label = ['Angry', 'Happy', 'Surprise', 'Sad']

        # Project the 80 mel bins to 257 channels, then add a channel axis for Conv2D.
        _input2 = Dense(257)(_input)
        re_input = layers.Reshape((-1, 257, 1), input_shape=(-1, 257))(_input2)

        # CNN
        conv1 = Conv2D(16, (3, 3), strides=(1, 1), activation='relu', padding='same')(re_input)
        conv1 = Conv2D(16, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv1)
        conv1 = Conv2D(16, (3, 3), strides=(1, 3), activation='relu', padding='same')(conv1)

        conv2 = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv1)
        conv2 = Conv2D(32, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv2)
        conv2 = Conv2D(32, (3, 3), strides=(1, 3), activation='relu', padding='same')(conv2)

        conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv2)
        conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv3)
        conv3 = Conv2D(64, (3, 3), strides=(1, 3), activation='relu', padding='same')(conv3)

        conv4 = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv3)
        conv4 = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same')(conv4)
        conv4 = Conv2D(128, (3, 3), strides=(1, 3), activation='relu', padding='same')(conv4)
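        # Frequency-axis downsampling, worked out for clarity: each block's
        # final conv uses stride (1, 3) with 'same' padding, so the 257
        # frequency bins shrink as
        #   ceil(257/3) = 86 -> ceil(86/3) = 29 -> ceil(29/3) = 10 -> ceil(10/3) = 4,
        # which is why the reshape below flattens each frame to 4 * 128 features.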
        re_shape = layers.Reshape((-1, 4 * 128), input_shape=(-1, 4, 128))(conv4)

        # BLSTM over the frame sequence
        blstm1 = Bidirectional(
            LSTM(128, return_sequences=True, dropout=0.3,
                 recurrent_dropout=0.3, recurrent_constraint=max_norm(0.00001)),
            merge_mode='concat')(re_shape)

        # DNN
        flatten = TimeDistributed(layers.Flatten())(blstm1)

        dense1 = TimeDistributed(Dense(128, activation='relu'))(flatten)
        dense1 = Dropout(0.3)(dense1)

        # Emotion classification branch: a second BLSTM over the CNN features.
        blstm2 = Bidirectional(
            LSTM(128, dropout=0.3,
                 recurrent_dropout=0.3, recurrent_constraint=max_norm(0.00001)),
            merge_mode='concat')(re_shape)
        dense2 = Dense(n_classes, activation='softmax', name='class')(Dropout(0.3)(blstm2))

        # Frame-level strength scores, averaged into the utterance-level score.
        frame_score = TimeDistributed(Dense(1), name='frame')(dense1)

        average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score)

        # Output head predicting the emotion class
        out_class = dense2

        model = Model(inputs=_input, outputs=[average_score, frame_score, out_class])

        return model
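
# Illustrative shape check (a sketch with random input, not invoked by the
# training or test scripts): the three heads return the utterance score,
# the per-frame scores, and the softmax emotion posterior.
if __name__ == '__main__':
    import numpy as np

    model = CNN_BLSTM().build()
    dummy = np.random.rand(2, 100, 80).astype(np.float32)  # (batch, frames, mels)
    avg, frame, emo = model.predict(dummy, batch_size=2)
    print(avg.shape, frame.shape, emo.shape)  # -> (2, 1) (2, 100, 1) (2, 4)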
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
tensorflow-gpu==2.0.0-beta1
tqdm
scipy
pandas
matplotlib
librosa
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import argparse
import fnmatch
import os

import numpy as np
from tqdm import tqdm

import tensorflow as tf
from model import CNN_BLSTM

import utils


def find_files(root_dir, query="*.wav", include_root_dir=True):
    """Find files recursively.

    Args:
        root_dir (str): Root directory to search.
        query (str): Filename pattern to match.
        include_root_dir (bool): If False, the root_dir prefix is stripped.

    Returns:
        list: List of found filenames.

    """
    files = []
    for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
        for filename in fnmatch.filter(filenames, query):
            files.append(os.path.join(root, filename))
    if not include_root_dir:
        files = [file_.replace(root_dir + "/", "") for file_ in files]

    return files


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate custom waveform files using pretrained StrengthNet.")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="rootdir of the waveforms to be evaluated")
    parser.add_argument("--pretrained_model", default="./output/strengthnet.h5", type=str,
                        help="pretrained model file")
    args = parser.parse_args()

    #### tensorflow & gpu settings ####

    # 0 = all messages are logged (default behavior)
    # 1 = INFO messages are not printed
    # 2 = INFO and WARNING messages are not printed
    # 3 = INFO, WARNING, and ERROR messages are not printed
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

    tf.debugging.set_log_device_placement(False)
    # set memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    ###################################

    # find waveform files
    wavfiles = sorted(find_files(args.rootdir, "*.wav"))

    # init model
    print("Loading model weights")
    StrengthNet = CNN_BLSTM()
    model = StrengthNet.build()
    model.load_weights(args.pretrained_model)

    # evaluation
    print("Start evaluating {} waveforms...".format(len(wavfiles)))
    results = []

    for wavfile in tqdm(wavfiles):

        # spectrogram
        mel_sgram = utils.get_melspectrograms(wavfile)
        timestep = mel_sgram.shape[0]
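        # The model was built with input shape (None, 80), so the single
        # utterance is wrapped in a batch axis -> (1, timestep, n_mels).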
        mel_sgram = np.reshape(mel_sgram, (1, timestep, utils.n_mels))

        # make prediction
        Strength_score, Frame_score, emo_class = model.predict(mel_sgram, verbose=0, batch_size=1)

        # write to list (Strength_score has shape (1, 1), so index the scalar out)
        result = wavfile + " {:.3f}".format(Strength_score[0][0])
        results.append(result)

    # write final raw result
    resultrawpath = os.path.join(args.rootdir, "StrengthNet_result_raw.txt")
    with open(resultrawpath, "w") as outfile:
        outfile.write("\n".join(sorted(results)))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.stats
from tqdm import tqdm

import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras

from model import CNN_BLSTM
import utils


parser = argparse.ArgumentParser()
parser.add_argument("--epoch", type=int, default=1000, help="number of epochs")
parser.add_argument("--batch_size", type=int, default=64, help="batch size")

args = parser.parse_args()


print('epochs: {}\nbatch_size: {}'.format(args.epoch, args.batch_size))

# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

tf.debugging.set_log_device_placement(False)
# set memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


# set dir
OUTPUT_DIR = './output'

DATA_DIR = '../ESD/en/'
BIN_DIR = '../StrengthNet/training_data_en/'
list_file = '../ESD/en/Score_List.csv'


EPOCHS = args.epoch
BATCH_SIZE = args.batch_size


NUM_TEST = 1000
NUM_VALID = 3000


emo_label = ['Angry', 'Happy', 'Surprise', 'Sad']


if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

strength_list = utils.read_list(list_file)
random.shuffle(strength_list)

train_list = strength_list[0:-(NUM_TEST + NUM_VALID)]
random.shuffle(train_list)
valid_list = strength_list[-(NUM_TEST + NUM_VALID):-NUM_TEST]
test_list = strength_list[-NUM_TEST:]

NUM_TRAIN = len(train_list)

print('{} for training; {} for valid; {} for testing'.format(NUM_TRAIN, NUM_VALID, NUM_TEST))


# init model
StrengthNet = CNN_BLSTM()
model = StrengthNet.build()
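
# StrengthNet is trained multi-task: the 'avg' head predicts the
# utterance-level strength, the 'frame' head predicts per-frame strength,
# and the 'class' head predicts the emotion category, so the compile step
# below attaches one loss per named output.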
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss={'avg': 'mae',
          'frame': 'mae',
          'class': 'categorical_crossentropy'}
)

CALLBACKS = [
    keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(OUTPUT_DIR, 'strengthnet.h5'),
        save_best_only=True,
        monitor='val_loss',
        verbose=1),
    keras.callbacks.TensorBoard(
        log_dir=os.path.join(OUTPUT_DIR, 'tensorboard.log'),
        update_freq='epoch'),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta=0,
        patience=30,
        verbose=1)
]

# data generator
train_data = utils.data_generator(train_list, BIN_DIR, frame=True, batch_size=BATCH_SIZE)
valid_data = utils.data_generator(valid_list, BIN_DIR, frame=True, batch_size=BATCH_SIZE)

tr_steps = int(NUM_TRAIN / BATCH_SIZE)
val_steps = int(NUM_VALID / BATCH_SIZE)


# start fitting model
hist = model.fit_generator(train_data,
                           steps_per_epoch=tr_steps,
                           epochs=EPOCHS,
                           callbacks=CALLBACKS,
                           validation_data=valid_data,
                           validation_steps=val_steps,
                           verbose=1)


# plot testing result
model.load_weights(os.path.join(OUTPUT_DIR, 'strengthnet.h5'))  # Load the best model

print('testing...')
Strength_Predict = np.zeros([len(test_list), ])
Strength_true = np.zeros([len(test_list), ])
class_predict = np.zeros([len(test_list), ])
class_true = np.zeros([len(test_list), ])
df = pd.DataFrame(columns=['audio', 'true_strength', 'predict_strength', 'true_class', 'predict_class'])


for i in tqdm(range(len(test_list))):

    filepath = test_list[i].split(',')
    filename = filepath[0]

    _feat = utils.read(os.path.join(BIN_DIR, filename + '.h5'))
    _mel = _feat['mel_sgram']

    strength = float(filepath[1])
    class_true[i] = emo_label.index(filename.split('/')[1])

    [Average_score, Frame_score, emo_class] = model.predict(_mel, verbose=0, batch_size=1)
    Strength_Predict[i] = Average_score
    Strength_true[i] = strength
    class_predict[i] = np.argmax(emo_class, 1)
    df = df.append({'audio': filepath[0],
                    'true_strength': Strength_true[i],
                    'predict_strength': Strength_Predict[i],
                    'true_class': class_true[i],
                    'predict_class': class_predict[i]},
                   ignore_index=True)


plt.style.use('seaborn-deep')
x = df['true_strength']
y = df['predict_strength']
bins = np.linspace(0, 1, 40)
plt.figure(2)
parameters = {'xtick.labelsize': 16,
              'ytick.labelsize': 16}
plt.rcParams.update(parameters)
plt.hist([x, y], bins, label=['true_strength', 'predict_strength'])
plt.legend(loc='upper left', prop={'size': 14})
plt.xlabel('Strength', fontsize=17)
plt.ylabel('Number', fontsize=17)
plt.savefig('./output/StrengthNet_distribution.png', bbox_inches='tight', dpi=150)


SER_ACC = accuracy_score(class_true, class_predict)
print('[UTTERANCE] SER test accuracy = %f' % SER_ACC)

MAE = np.mean(np.abs(Strength_true - Strength_Predict))
print('[UTTERANCE] MAE = %f' % MAE)
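
# Optional extra metrics (an illustrative sketch; the script above reports
# only accuracy and MAE): scipy.stats is already imported, so linear (LCC)
# and Spearman rank (SRCC) correlation between true and predicted strength
# can be reported as well.
LCC = np.corrcoef(Strength_true, Strength_Predict)[0][1]
SRCC = scipy.stats.spearmanr(Strength_true, Strength_Predict)[0]
print('[UTTERANCE] LCC = %f, SRCC = %f' % (LCC, SRCC))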
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import random
from os.path import join

import h5py
import librosa
import numpy as np
import scipy.signal
from tqdm import tqdm

from tensorflow.keras.utils import to_categorical


FS = 16000
FFT_SIZE = 512
HOP_LENGTH = 256
WIN_LENGTH = 512
n_mels = 80

# dir
DATA_DIR = '../ESD/en/'
BIN_DIR = '../StrengthNet/training_data_en/'

# Strength scores for the English subset (speakers 0011-0020) of the ESD data.
list_file = 'Score_List.csv'

emo_label = ['Angry', 'Happy', 'Surprise', 'Sad']


def get_melspectrograms(sound_file, fs=FS, fft_size=FFT_SIZE):
    # Load the sound file and compute the magnitude spectrogram.
    y, _ = librosa.load(sound_file, sr=fs)
    linear = librosa.stft(y=y,
                          n_fft=fft_size,
                          hop_length=HOP_LENGTH,
                          win_length=WIN_LENGTH,
                          window=scipy.signal.hamming,
                          )
    mag = np.abs(linear)  # (1+n_fft/2, T)

    # Apply an 80-band mel filterbank.
    mel_basis = librosa.filters.mel(fs, fft_size, n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)
    # returned shape: (T, n_mels)
    return np.transpose(mel.astype(np.float32))


def read_list(filelist):
    Path = []
    with open(filelist, 'r') as f:
        for line in f:
            Path.append(line.rstrip('\n'))
    return Path


def read(file_path):

    data_file = h5py.File(file_path, 'r')
    mel_sgram = np.array(data_file['mel_sgram'][:])

    timestep = mel_sgram.shape[0]
    mel_sgram = np.reshape(mel_sgram, (1, timestep, n_mels))

    return {
        'mel_sgram': mel_sgram,
    }


def pad(array, reference_shape):

    # Zero-pad `array` up to `reference_shape` (top-left aligned).
    result = np.zeros(reference_shape)
    result[:array.shape[0], :array.shape[1], :array.shape[2]] = array

    return result


def data_generator(file_list, bin_root, frame=False, batch_size=1):
    index = 0
    while True:

        filename = [file_list[index + x].split(',')[0] for x in range(batch_size)]

        for i in range(len(filename)):
            all_feat = read(join(bin_root, filename[i] + '.h5'))
            sgram = all_feat['mel_sgram']

            # the very first feat
            if i == 0:
                feat = sgram
            else:
                if sgram.shape[1] > feat.shape[1]:
                    # extend all feats already in feat
                    ref_shape = [feat.shape[0], sgram.shape[1], feat.shape[2]]
                    feat = pad(feat, ref_shape)
                    feat = np.append(feat, sgram, axis=0)
                elif sgram.shape[1] < feat.shape[1]:
                    # extend sgram to feat.shape[1]
                    ref_shape = [sgram.shape[0], feat.shape[1], feat.shape[2]]
                    sgram = pad(sgram, ref_shape)
                    feat = np.append(feat, sgram, axis=0)
                else:
                    # same timestep, append directly
                    feat = np.append(feat, sgram, axis=0)

        strength = [float(file_list[x + index].split(',')[1]) for x in range(batch_size)]
        strength = np.asarray(strength).reshape([batch_size])
        frame_strength = np.array([strength[i] * np.ones([feat.shape[1], 1]) for i in range(batch_size)])
        # multi-task target: emotion class parsed from the path, one-hot encoded
        emo_class = [emo_label.index(str(file_list[x + index].split(',')[0].split('/')[1])) for x in range(batch_size)]
        emo_target = to_categorical(emo_class, num_classes=4)
        index += batch_size
        if index + batch_size >= len(file_list):
            index = 0
            random.shuffle(file_list)

        if frame:
            yield feat, [strength, frame_strength, emo_target]
        else:
            yield feat, [strength, emo_target]
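
# Illustrative data_generator output shapes (a sketch assuming features were
# already extracted and batch_size=4; `train_list` is a placeholder name):
#   gen = data_generator(train_list, BIN_DIR, frame=True, batch_size=4)
#   feat, (strength, frame_strength, emo_target) = next(gen)
#   feat.shape            -> (4, max_T, 80)   zero-padded to the longest clip
#   strength.shape        -> (4,)
#   frame_strength.shape  -> (4, max_T, 1)
#   emo_target.shape      -> (4, 4)           one-hot over emo_label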
def extract_to_h5():
    audio_dir = DATA_DIR
    output_dir = BIN_DIR

    print('audio dir: {}'.format(audio_dir))
    print('output_dir: {}'.format(output_dir))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


    files = []
    with open(list_file, 'r') as f:
        for line in f:
            files.append(line.split(',')[0])
            out_dir1 = output_dir + '/' + line.split(',')[0].split('/')[0]
            if not os.path.exists(out_dir1):
                os.makedirs(out_dir1)

            out_dir2 = output_dir + '/' + line.split(',')[0].split('/')[0] + '/' + line.split(',')[0].split('/')[1]
            if not os.path.exists(out_dir2):
                os.makedirs(out_dir2)

    print('start extracting .wav to .h5, {} files found...'.format(len(files)))

    for i in tqdm(range(len(files))):
        f = files[i]

        # set audio file path
        audio_file = join(audio_dir, f)

        # Mel-spectrogram
        mel = get_melspectrograms(audio_file)


        with h5py.File(join(output_dir, '{}.h5'.format(f)), 'w') as hf:
            hf.create_dataset('mel_sgram', data=mel)


if __name__ == '__main__':

    extract_to_h5()
--------------------------------------------------------------------------------