├── config.py ├── pred_probas └── .gitkeep ├── trained_models └── .gitkeep ├── lstm_classifier ├── __init__.py ├── s2e │ ├── __init__.py │ ├── config.py │ ├── utils.py │ └── lstm_classifier.py ├── t2e │ ├── __init__.py │ ├── predict_probas.py │ ├── config.py │ ├── create_vocab.py │ ├── lstm_classifier.py │ └── utils.py └── combined │ ├── __init__.py │ ├── config.py │ ├── predict_probas.py │ ├── utils.py │ └── lstm_classifier.py ├── .gitignore ├── requirements.txt ├── LICENSE ├── src ├── build_audio_vectors.py ├── prepare_data.py ├── extract_emotion_labels.py ├── extract_audio_features.py └── train_sentence_classifiers_sklearn.py ├── 2_build_audio_vectors.ipynb ├── README.md ├── main.py ├── 1_extract_emotion_labels.ipynb └── 4_prepare_data.ipynb /config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pred_probas/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trained_models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/combined/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *checkpoint* 2 | *.pyc 3 | *.csv 4 | *.txt 5 | *.wav 6 | data/* 7 | *.pkl 8 | *.zip 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | xgboost==0.82 2 | flask 3 | numpy==1.16.2 4 | scikit-learn==0.20.3 5 | scipy==1.4.1 6 | pandas 7 | librosa 8 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/config.py: -------------------------------------------------------------------------------- 1 | model_config = { 2 | 'gpu': 0, 3 | 'bidirectional': False, 4 | 'input_dim': 8, 5 | 'hidden_dim': 50, 6 | 'output_dim': 6, # number of classes 7 | 'dropout': 0.2, 8 | 'learning_rate': 0.01, 9 | 'batch_size': 1567, # carefully chosen 10 | 'n_epochs': 55000, 11 | 'n_layers': 2, 12 | 'model_code': 'basic_lstm' 13 | } 14 | -------------------------------------------------------------------------------- /lstm_classifier/combined/config.py: -------------------------------------------------------------------------------- 1 | model_config = { 2 | 'gpu': 1, 3 | 'n_layers': 2, 4 | 'dropout': 0.2, 5 | 'output_dim': 6, # number of classes 6 | 'hidden_dim': 256, 7 | 'input_dim': 2472, 8 | 'batch_size': 200, # carefully chosen 9 | 'n_epochs': 55000, 10 | 
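# n_epochs is an upper bound: lstm_classifier.py evaluates after every epoch and only checkpoints models that improve test accuracy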
'learning_rate': 0.001, 11 | 'bidirectional': True, 12 | 'model_code': 'bi_lstm' 13 | } 14 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/predict_probas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from lstm_classifier import LSTMClassifier 6 | from config import model_config as config 7 | from utils import load_data 8 | 9 | 10 | # Load test data 11 | test_pairs = load_data(test=True) 12 | inputs, lengths, targets = test_pairs 13 | 14 | # Load pretrained model 15 | model = LSTMClassifier(config) 16 | checkpoint = torch.load('runs/{}-best_model.pth'.format(config['model_code']), 17 | map_location='cpu') 18 | model.load_state_dict(checkpoint['model']) 19 | 20 | with torch.no_grad(): 21 | # Predict 22 | predict_probas = model(inputs, lengths).cpu().numpy() 23 | 24 | with open('../../pred_probas/text_lstm_classifier.pkl', 'wb') as f: 25 | pickle.dump(predict_probas, f) 26 | -------------------------------------------------------------------------------- /lstm_classifier/combined/predict_probas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from lstm_classifier import LSTMClassifier 6 | from config import model_config as config 7 | from utils import load_data 8 | 9 | 10 | # Load test data 11 | test_pairs = load_data(test=True) 12 | inputs, targets = test_pairs 13 | inputs = inputs.unsqueeze(0) 14 | 15 | # Load pretrained model 16 | model = LSTMClassifier(config) 17 | checkpoint = torch.load('runs/{}-best_model.pth'.format(config['model_code']), 18 | map_location='cpu') 19 | model.load_state_dict(checkpoint['model']) 20 | 21 | with torch.no_grad(): 22 | # Predict 23 | predict_probas = model(inputs).cpu().numpy() 24 | 25 | with open('../../pred_probas/combined_lstm_classifier.pkl', 'wb') as f: 26 | pickle.dump(predict_probas, f) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Gaurav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import gensim 4 | from create_vocab import Vocabulary, create_vocab 5 | 6 | model_config = { 7 | 'gpu': 1, 8 | '<PAD>': 0, 9 | '<SOS>': 1, 10 | '<EOS>': 2, 11 | '<UNK>': 3, 12 | 'n_layers': 2, 13 | 'dropout': 0.2, 14 | 'output_dim': 6, # number of classes 15 | 'hidden_dim': 500, 16 | 'n_epochs': 45000, 17 | 'batch_size': 128, # carefully chosen 18 | 'embedding_dim': 200, # 50/100/200/300 19 | 'bidirectional': True, 20 | 'learning_rate': 0.0001, 21 | 'model_code': 'bi_lstm_2_layer', 22 | 'max_sequence_length': 20, 23 | 'embeddings_dir': 'embeddings/' 24 | } 25 | 26 | 27 | from utils import generate_word_embeddings 28 | 29 | 30 | def set_dynamic_hparams(): 31 | try: 32 | with open('vocab.pkl', 'rb') as f: 33 | vocab = pickle.load(f) 34 | except FileNotFoundError as e: 35 | vocab = create_vocab() 36 | generate_word_embeddings(vocab) 37 | 38 | model_config['vocab_size'] = vocab.size 39 | model_config['vocab_path'] = 'vocab.pkl' 40 | return model_config 41 | 42 | 43 | model_config = set_dynamic_hparams() 44 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/create_vocab.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | class Vocabulary(object): 7 | def __init__(self): 8 | self.word2index = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3} 9 | self.word2count = {} 10 | self.index2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'} 11 | self.size = 4 # Count PAD, SOS, EOS and UNK 12 | 13 | def add_sentence(self, sentence): 14 | for word in sentence.split(): 15 | self.add_word(word) 16 | 17 | def add_word(self, word): 18 | if word not in self.word2index: 19 | self.word2index[word] = self.size 20 | self.word2count[word] = 1 21 | self.index2word[self.size] = word 22 | self.size += 1 23 | else: 24 | self.word2count[word] += 1 25 | 26 | 27 | def create_vocab(file_dir='../../data/t2e/'): 28 | print('Loading corpus...') 29 | texts = [] 30 | for mode in ['train', 'test']: 31 | texts += list(pd.read_csv('{}text_{}.csv'.format(file_dir, mode))['transcription']) 32 | 33 | print("Building vocab...") 34 | vocab = Vocabulary() 35 | 36 | for text in texts: 37 | vocab.add_sentence(text) 38 | 39 | print("Total words in vocab: {}".format(vocab.size)) 40 | with open('vocab.pkl', 'wb') as f: 41 | pickle.dump(vocab, f) 42 | 43 | print('Generating word embeddings') 44 | return vocab 45 | 46 | 47 | if __name__ == '__main__': 48 | create_vocab() 49 | -------------------------------------------------------------------------------- /src/build_audio_vectors.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script should be run AFTER extract_emotion_labels.py. It uses 3 | the csv file created in the previous step to split the original wav files into 4 | multiple smaller frames, each piece containing an emotion.
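For each labelled segment, the start and end times from the csv are converted to sample indices at the chosen sampling rate before slicing the session-level wav vector.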
5 | 6 | Run this script from root as python src/build_audio_vectors.py 7 | """ 8 | 9 | import os 10 | import math 11 | import pickle 12 | import librosa 13 | import pandas as pd 14 | from tqdm import tqdm 15 | 16 | 17 | def process_session(iemocap_dir, labels_df, sr, sess): 18 | """ 19 | saves audio_vectors dict in a pickle file which contains vectors 20 | for audio files in session `sess` 21 | 22 | process_session: Str pd.DataFrame Nat Int -> None 23 | """ 24 | audio_vectors = {} 25 | wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess) 26 | orig_wav_files = os.listdir(wav_file_path) 27 | for orig_wav_file in tqdm(orig_wav_files): 28 | try: 29 | orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, 30 | sr=sr) 31 | orig_wav_file, file_format = orig_wav_file.split('.') 32 | for index, row in labels_df[labels_df['wav_file'].str.contains( 33 | orig_wav_file)].iterrows(): 34 | start_time, end_time, truncated_wav_file_name, = \ 35 | row['start_time'], row['end_time'], row['wav_file'] 36 | start_frame = math.floor(start_time * sr) 37 | end_frame = math.floor(end_time * sr) 38 | truncated_wav_vector = orig_wav_vector[start_frame:end_frame+1] 39 | audio_vectors[truncated_wav_file_name] = truncated_wav_vector 40 | except Exception as e: 41 | print('An exception occured for {}'.format(orig_wav_file)) 42 | with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f: 43 | pickle.dump(audio_vectors, f) 44 | 45 | 46 | def main(): 47 | sampling_rate = 44100 48 | iemocap_dir = 'data/IEMOCAP_full_release/' 49 | labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv') 50 | for sess in range(1, 6): 51 | # Note that compiling this way will take too much time So you might 52 | # consider parallelizing this process 53 | process_session(iemocap_dir, labels_df, sampling_rate, sess) 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /src/prepare_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script preprocesses data and prepares data to be actually used in training 3 | """ 4 | import re 5 | import os 6 | import pickle 7 | import unicodedata 8 | import pandas as pd 9 | from sklearn.preprocessing import MinMaxScaler 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | def unicodeToAscii(s): 14 | return ''.join( 15 | c for c in unicodedata.normalize('NFD', s) 16 | if unicodedata.category(c) != 'Mn' 17 | ) 18 | 19 | 20 | def normalizeString(s): 21 | """ 22 | Lowercase, trim, and remove non-letter characters 23 | """ 24 | s = unicodeToAscii(s.lower().strip()) 25 | s = re.sub(r"([.!?])", r" \1", s) 26 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 27 | return s 28 | 29 | 30 | def transcribe_sessions(): 31 | file2transcriptions = {} 32 | useful_regex = re.compile(r'^(\w+)', re.IGNORECASE) 33 | transcript_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/' 34 | for sess in range(1, 6): 35 | transcript_path = transcript_path.format(sess) 36 | for f in os.listdir(transcript_path): 37 | with open('{}{}'.format(transcript_path, f), 'r') as f: 38 | all_lines = f.readlines() 39 | 40 | for l in all_lines: 41 | audio_code = useful_regex.match(l).group() 42 | transcription = l.split(':')[-1].strip() 43 | # assuming that all the keys would be unique and hence no `try` 44 | file2transcriptions[audio_code] = transcription 45 | with open('data/t2e/audiocode2text.pkl', 'wb') as file: 46 | 
pickle.dump(file2transcriptions, file) 47 | return file2transcriptions 48 | 49 | 50 | def prepare_text_data(audiocode2text): 51 | # Prepare text data 52 | text_train = pd.DataFrame() 53 | text_train['wav_file'] = x_train['wav_file'] 54 | text_train['label'] = x_train['label'] 55 | text_train['transcription'] = [normalizeString(audiocode2text[code]) 56 | for code in x_train['wav_file']] 57 | 58 | text_test = pd.DataFrame() 59 | text_test['wav_file'] = x_test['wav_file'] 60 | text_test['label'] = x_test['label'] 61 | text_test['transcription'] = [normalizeString(audiocode2text[code]) 62 | for code in x_test['wav_file']] 63 | 64 | text_train.to_csv('data/t2e/text_train.csv', index=False) 65 | text_test.to_csv('data/t2e/text_test.csv', index=False) 66 | 67 | print(text_train.shape, text_test.shape) 68 | 69 | 70 | def main(): 71 | prepare_text_data(transcribe_sessions()) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | from config import model_config as config 5 | 6 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 7 | 8 | import itertools 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | def load_data(batched=True, test=False, file_dir='../../data/s2e/'): 13 | bs = config['batch_size'] 14 | ftype = 'test' if test else 'train' 15 | df = pd.read_csv('{}modified_df_{}.csv'.format(file_dir, ftype)) 16 | # 0th index in label, rest all are features 17 | data = (np.array(df[df.columns[1:]]), np.array(df[df.columns[0]])) 18 | if test or not batched: 19 | return [torch.FloatTensor(data[0]), torch.LongTensor(data[1])] 20 | data = list(zip(data[0], data[1])) 21 | n_iters = len(data) // bs 22 | batches = [] 23 | for i in range(1, n_iters + 1): 24 | input_batch = [] 25 | output_batch = [] 26 | for e in data[bs * (i-1):bs * i]: 27 | input_batch.append(e[0]) 28 | output_batch.append(e[1]) 29 | batches.append([torch.FloatTensor(input_batch), 30 | torch.LongTensor(output_batch)]) 31 | return batches 32 | 33 | 34 | def evaluate(targets, predictions): 35 | performance = { 36 | 'acc': accuracy_score(targets, predictions), 37 | 'f1': f1_score(targets, predictions, average='macro'), 38 | 'precision': precision_score(targets, predictions, average='macro'), 39 | 'recall': recall_score(targets, predictions, average='macro')} 40 | return performance 41 | 42 | 43 | def plot_confusion_matrix(targets, predictions, classes, 44 | normalize=False, 45 | title='Confusion matrix', 46 | cmap=plt.cm.Blues): 47 | """ 48 | This function prints and plots the confusion matrix. 49 | Normalization can be applied by setting `normalize=True`. 50 | """ 51 | # plt.figure(figsize=(8,8)) 52 | cm = confusion_matrix(targets, predictions) 53 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 54 | plt.title(title) 55 | plt.colorbar() 56 | tick_marks = np.arange(len(classes)) 57 | plt.xticks(tick_marks, classes, rotation=45) 58 | plt.yticks(tick_marks, classes) 59 | 60 | if normalize: 61 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 62 | print("Normalized confusion matrix") 63 | else: 64 | print('Confusion matrix, without normalization') 65 | 66 | print(cm) 67 | 68 | thresh = cm.max() / 2. 
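    # cells whose count exceeds half of the maximum are drawn with white text in the loop below; lighter cells keep black text for readability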
69 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 70 | plt.text(j, i, cm[i, j], 71 | horizontalalignment="center", 72 | color="white" if cm[i, j] > thresh else "black") 73 | 74 | plt.tight_layout() 75 | plt.ylabel('True label') 76 | plt.xlabel('Predicted label') 77 | -------------------------------------------------------------------------------- /lstm_classifier/combined/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from config import model_config as config 6 | 7 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 8 | 9 | import itertools 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def load_data(batched=True, test=False, file_dir='../../data/combined/combined_features.pkl'): 14 | bs = config['batch_size'] 15 | ftype = 'test' if test else 'train' 16 | 17 | with open('{}'.format(file_dir), 'rb') as f: 18 | features = pickle.load(f) 19 | 20 | x = features['x_{}'.format(ftype)] 21 | y = features['y_{}'.format(ftype)] 22 | data = (x, y) 23 | if test or not batched: 24 | return [torch.FloatTensor(data[0]), torch.LongTensor(data[1])] 25 | data = list(zip(data[0], data[1])) 26 | n_iters = len(data) // bs 27 | batches = [] 28 | for i in range(1, n_iters + 1): 29 | input_batch = [] 30 | output_batch = [] 31 | for e in data[bs * (i-1):bs * i]: 32 | input_batch.append(e[0]) 33 | output_batch.append(e[1]) 34 | batches.append([torch.FloatTensor(input_batch), 35 | torch.LongTensor(output_batch)]) 36 | return batches 37 | 38 | 39 | def evaluate(targets, predictions): 40 | performance = { 41 | 'acc': accuracy_score(targets, predictions), 42 | 'f1': f1_score(targets, predictions, average='macro'), 43 | 'precision': precision_score(targets, predictions, average='macro'), 44 | 'recall': recall_score(targets, predictions, average='macro')} 45 | return performance 46 | 47 | 48 | def plot_confusion_matrix(targets, predictions, classes, 49 | normalize=False, 50 | title='Confusion matrix', 51 | cmap=plt.cm.Blues): 52 | """ 53 | This function prints and plots the confusion matrix. 54 | Normalization can be applied by setting `normalize=True`. 55 | """ 56 | # plt.figure(figsize=(8,8)) 57 | cm = confusion_matrix(targets, predictions) 58 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 59 | plt.title(title) 60 | plt.colorbar() 61 | tick_marks = np.arange(len(classes)) 62 | plt.xticks(tick_marks, classes, rotation=45) 63 | plt.yticks(tick_marks, classes) 64 | 65 | if normalize: 66 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 67 | print("Normalized confusion matrix") 68 | else: 69 | print('Confusion matrix, without normalization') 70 | 71 | print(cm) 72 | 73 | thresh = cm.max() / 2. 
74 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 75 | plt.text(j, i, cm[i, j], 76 | horizontalalignment="center", 77 | color="white" if cm[i, j] > thresh else "black") 78 | 79 | plt.tight_layout() 80 | plt.ylabel('True label') 81 | plt.xlabel('Predicted label') 82 | -------------------------------------------------------------------------------- /src/extract_emotion_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script parses the dataset, extracts the labels and stores them in one place 3 | 4 | Run this script from root as python src/extract_emotion_labels.py 5 | """ 6 | 7 | import re 8 | import os 9 | import pandas as pd 10 | 11 | 12 | def extract_info(): 13 | """ 14 | returns info_dict containing important info from the IEMOCAP dataset 15 | such as start time, end time, emotion labels etc. 16 | 17 | extract_info: None -> Dict 18 | """ 19 | info_dict = {'start_times': [], 'end_times': [], 'wav_file_names': [], 20 | 'emotions': [], 'vals': [], 'acts': [], 'doms': []} 21 | 22 | # regex used to identify useful info in the dataset files 23 | info_line = re.compile(r'\[.+\]\n', re.IGNORECASE) 24 | for sess in range(1, 6): 25 | emo_evaluation_dir = 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess) 26 | # Only include the session files 27 | evaluation_files = [l for l in os.listdir(emo_evaluation_dir) 28 | if 'Ses' in l] 29 | for file in evaluation_files: 30 | with open(emo_evaluation_dir + file) as f: 31 | content = f.read() 32 | # grab the important stuff 33 | info_lines = re.findall(info_line, content) 34 | for line in info_lines[1:]: # skipping the first header line 35 | # Refer to the dataset to see what `line` looks like 36 | start_end_time, wav_file_name, emotion, val_act_dom = \ 37 | line.strip().split('\t') 38 | start_time, end_time = start_end_time[1:-1].split('-') 39 | val, act, dom = val_act_dom[1:-1].split(',') 40 | val, act, dom = float(val), float(act), float(dom) 41 | start_time, end_time = float(start_time), float(end_time) 42 | info_dict['start_times'].append(start_time) 43 | info_dict['end_times'].append(end_time) 44 | info_dict['wav_file_names'].append(wav_file_name) 45 | info_dict['emotions'].append(emotion) 46 | info_dict['vals'].append(val) 47 | info_dict['acts'].append(act) 48 | info_dict['doms'].append(dom) 49 | return info_dict 50 | 51 | 52 | def compile_dataset(info_dict): 53 | """ 54 | creates a csv file from info_dict which will serve as the dataset 55 | 56 | compile_dataset: Dict -> None 57 | """ 58 | df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom']) 59 | 60 | df_iemocap['start_time'] = info_dict['start_times'] 61 | df_iemocap['end_time'] = info_dict['end_times'] 62 | df_iemocap['wav_file'] = info_dict['wav_file_names'] 63 | df_iemocap['emotion'] = info_dict['emotions'] 64 | df_iemocap['val'] = info_dict['vals'] 65 | df_iemocap['act'] = info_dict['acts'] 66 | df_iemocap['dom'] = info_dict['doms'] 67 | # Finally, save to a file 68 | df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False) 69 | 70 | 71 | def main(): 72 | compile_dataset(extract_info()) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /src/extract_audio_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script extracts features from existing audio vectors 3 | """ 4 | 5 | import os 6 |
import math 7 | import random 8 | import pickle 9 | import librosa 10 | import numpy as np 11 | import pandas as pd 12 | from tqdm import tqdm 13 | 14 | 15 | def add_session_data(df_features, labels_df, emotion_dict, audio_vectors_path): 16 | audio_vectors = pickle.load(open(audio_vectors_path, 'rb')) 17 | for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains( 18 | 'Ses0{}'.format(sess))].iterrows()): 19 | try: 20 | wav_file_name = row['wav_file'] 21 | label = emotion_dict[row['emotion']] 22 | y = audio_vectors[wav_file_name] 23 | 24 | feature_list = [wav_file_name, label] # wav_file, label 25 | sig_mean = np.mean(abs(y)) 26 | feature_list.append(sig_mean) # sig_mean 27 | feature_list.append(np.std(y)) # sig_std 28 | 29 | rmse = librosa.feature.rmse(y + 0.0001)[0] 30 | feature_list.append(np.mean(rmse)) # rmse_mean 31 | feature_list.append(np.std(rmse)) # rmse_std 32 | 33 | silence = 0 34 | for e in rmse: 35 | if e <= 0.4 * np.mean(rmse): 36 | silence += 1 37 | silence /= float(len(rmse)) 38 | feature_list.append(silence) # silence 39 | 40 | y_harmonic = librosa.effects.hpss(y)[0] 41 | feature_list.append(np.mean(y_harmonic) * 1000) # harmonic (scaled by 1000) 42 | 43 | # based on the pitch detection algorithm mentioned here: 44 | # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001 45 | cl = 0.45 * sig_mean 46 | center_clipped = [] 47 | for s in y: 48 | if s >= cl: 49 | center_clipped.append(s - cl) 50 | elif s <= -cl: 51 | center_clipped.append(s + cl) 52 | elif np.abs(s) < cl: 53 | center_clipped.append(0) 54 | auto_corrs = librosa.core.autocorrelate(np.array(center_clipped)) 55 | feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs)) # auto_corr_max (scaled by 1000) 56 | feature_list.append(np.std(auto_corrs)) # auto_corr_std 57 | 58 | df_features = df_features.append(pd.DataFrame(feature_list, index=columns).transpose(), ignore_index=True) 59 | except Exception as e: 60 | print('Some exception occured: {}'.format(e)) 61 | 62 | 63 | def main(): 64 | emotion_dict = {'ang': 0, 'hap': 1, 'exc': 2, 'sad': 3, 'fru': 4, 'fea': 5, 65 | 'sur': 6, 'neu': 7, 'xxx': 8, 'oth': 8} 66 | 67 | data_dir = 'data/pre-processed/' 68 | labels_path = '{}df_iemocap.csv'.format(data_dir) 69 | audio_vectors_path = '{}audio_vectors_'.format(data_dir) 70 | columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 71 | 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 72 | 'auto_corr_std'] 73 | df_features = pd.DataFrame(columns=columns) 74 | labels_df = pd.read_csv(labels_path) 75 | for sess in range(1, 6): 76 | add_session_data(df_features, labels_df, emotion_dict, 77 | '{}{}.pkl'.format(audio_vectors_path, sess)) 78 | df_features.to_csv('data/pre-processed/audio_features.csv', index=False) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import pickle 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import optim 7 | import torch.nn.functional as F 8 | from utils import load_data, evaluate, plot_confusion_matrix 9 | 10 | from config import model_config as config 11 | 12 | 13 | class LSTMClassifier(nn.Module): 14 | """docstring for LSTMClassifier""" 15 | def __init__(self, config): 16 | super(LSTMClassifier, self).__init__() 17 | self.n_layers = config['n_layers'] 18 | self.input_dim = config['input_dim'] 19 | 
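# for the s2e model, input_dim is 8: the hand-crafted audio features produced by src/extract_audio_features.py (signal mean/std, rmse stats, silence ratio, harmonic mean, autocorrelation stats)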
self.hidden_dim = config['hidden_dim'] 20 | self.output_dim = config['output_dim'] 21 | self.bidirectional = config['bidirectional'] 22 | self.dropout = config['dropout'] if self.n_layers > 1 else 0 23 | 24 | self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, bias=True, 25 | num_layers=2, dropout=self.dropout, 26 | bidirectional=self.bidirectional) 27 | self.out = nn.Linear(self.hidden_dim, self.output_dim) 28 | self.softmax = F.softmax 29 | 30 | def forward(self, input_seq): 31 | # input_seq =. [1, batch_size, input_size] 32 | rnn_output, (hidden, _) = self.rnn(input_seq) 33 | if self.bidirectional: # sum outputs from the two directions 34 | rnn_output = rnn_output[:, :, :self.hidden_dim] +\ 35 | rnn_output[:, :, self.hidden_dim:] 36 | class_scores = F.softmax(self.out(rnn_output[0]), dim=1) 37 | return class_scores 38 | 39 | 40 | if __name__ == '__main__': 41 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 42 | 43 | device = 'cuda:{}'.format(config['gpu']) if \ 44 | torch.cuda.is_available() else 'cpu' 45 | 46 | model = LSTMClassifier(config) 47 | model = model.to(device) 48 | criterion = nn.CrossEntropyLoss() 49 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 50 | 51 | train_batches = load_data() 52 | test_pairs = load_data(test=True) 53 | 54 | best_acc = 0 55 | for epoch in range(config['n_epochs']): 56 | losses = [] 57 | for batch in train_batches: 58 | inputs = batch[0].unsqueeze(0) # frame in format as expected by model 59 | targets = batch[1] 60 | inputs = inputs.to(device) 61 | targets = targets.to(device) 62 | 63 | model.zero_grad() 64 | optimizer.zero_grad() 65 | 66 | predictions = model(inputs) 67 | predictions = predictions.to(device) 68 | 69 | loss = criterion(predictions, targets) 70 | loss.backward() 71 | optimizer.step() 72 | losses.append(loss.item()) 73 | 74 | # evaluate 75 | with torch.no_grad(): 76 | inputs = test_pairs[0].unsqueeze(0) 77 | targets = test_pairs[1] 78 | 79 | inputs = inputs.to(device) 80 | targets = targets.to(device) 81 | 82 | predictions = torch.argmax(model(inputs), dim=1) # take argmax to get class id 83 | predictions = predictions.to(device) 84 | 85 | # evaluate on cpu 86 | targets = np.array(targets.cpu()) 87 | predictions = np.array(predictions.cpu()) 88 | 89 | # Get results 90 | # plot_confusion_matrix(targets, predictions, 91 | # classes=emotion_dict.keys()) 92 | performance = evaluate(targets, predictions) 93 | if performance['acc'] > best_acc: 94 | best_acc = performance['acc'] 95 | print(performance) 96 | # save model and results 97 | torch.save({ 98 | 'model': model.state_dict(), 99 | 'optimizer': optimizer.state_dict() 100 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 101 | 102 | with open('results/{}-best_performance.pkl'.format(config['model_code']), 'wb') as f: 103 | pickle.dump(performance, f) 104 | -------------------------------------------------------------------------------- /lstm_classifier/combined/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import pickle 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import optim 7 | import torch.nn.functional as F 8 | from utils import load_data, evaluate, plot_confusion_matrix 9 | 10 | from config import model_config as config 11 | 12 | 13 | class LSTMClassifier(nn.Module): 14 | """docstring for LSTMClassifier""" 15 | def __init__(self, config): 16 | super(LSTMClassifier, self).__init__() 17 | self.n_layers = 
config['n_layers'] 18 | self.dropout = config['dropout'] if self.n_layers > 1 else 0 19 | self.input_dim = config['input_dim'] 20 | self.hidden_dim = config['hidden_dim'] 21 | self.output_dim = config['output_dim'] 22 | self.bidirectional = config['bidirectional'] 23 | 24 | self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, bias=True, 25 | num_layers=self.n_layers, dropout=self.dropout, 26 | bidirectional=self.bidirectional) 27 | self.out = nn.Linear(self.hidden_dim, self.output_dim) 28 | self.softmax = F.softmax 29 | 30 | def forward(self, input_seq): 31 | # input_seq =. [1, batch_size, input_size] 32 | rnn_output, (hidden, _) = self.rnn(input_seq) 33 | if self.bidirectional: # sum outputs from the two directions 34 | rnn_output = rnn_output[:, :, :self.hidden_dim] +\ 35 | rnn_output[:, :, self.hidden_dim:] 36 | class_scores = F.softmax(self.out(rnn_output[0]), dim=1) 37 | return class_scores 38 | 39 | 40 | if __name__ == '__main__': 41 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 42 | 43 | device = 'cuda:{}'.format(config['gpu']) if \ 44 | torch.cuda.is_available() else 'cpu' 45 | 46 | model = LSTMClassifier(config) 47 | model = model.to(device) 48 | criterion = nn.CrossEntropyLoss() 49 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 50 | 51 | train_batches = load_data() 52 | test_pairs = load_data(test=True) 53 | 54 | best_acc = 0 55 | for epoch in range(config['n_epochs']): 56 | losses = [] 57 | for batch in train_batches: 58 | inputs = batch[0].unsqueeze(0) # frame in format as expected by model 59 | targets = batch[1] 60 | inputs = inputs.to(device) 61 | targets = targets.to(device) 62 | 63 | model.zero_grad() 64 | optimizer.zero_grad() 65 | 66 | predictions = model(inputs) 67 | predictions = predictions.to(device) 68 | 69 | loss = criterion(predictions, targets) 70 | loss.backward() 71 | optimizer.step() 72 | losses.append(loss.item()) 73 | 74 | # evaluate 75 | with torch.no_grad(): 76 | inputs = test_pairs[0].unsqueeze(0) 77 | targets = test_pairs[1] 78 | 79 | inputs = inputs.to(device) 80 | targets = targets.to(device) 81 | 82 | predictions = torch.argmax(model(inputs), dim=1) # take argmax to get class id 83 | predictions = predictions.to(device) 84 | 85 | # evaluate on cpu 86 | targets = np.array(targets.cpu()) 87 | predictions = np.array(predictions.cpu()) 88 | 89 | # Get results 90 | # plot_confusion_matrix(targets, predictions, 91 | # classes=emotion_dict.keys()) 92 | performance = evaluate(targets, predictions) 93 | if performance['acc'] > best_acc: 94 | print(performance) 95 | best_acc = performance['acc'] 96 | # save model and results 97 | torch.save({ 98 | 'model': model.state_dict(), 99 | 'optimizer': optimizer.state_dict() 100 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 101 | 102 | with open('results/{}-best_performance.pkl'.format(config['model_code']), 'wb') as f: 103 | pickle.dump(performance, f) 104 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import torch.nn as nn 5 | from torch import optim 6 | import torch.nn.functional as F 7 | 8 | from config import model_config as config 9 | from utils import load_data, evaluate, load_word_embeddings, plot_confusion_matrix 10 | 11 | 12 | class LSTMClassifier(nn.Module): 13 | """docstring for LSTMClassifier""" 14 | def __init__(self, 
config): 15 | super(LSTMClassifier, self).__init__() 16 | self.dropout = config['dropout'] 17 | self.n_layers = config['n_layers'] 18 | self.hidden_dim = config['hidden_dim'] 19 | self.output_dim = config['output_dim'] 20 | self.vocab_size = config['vocab_size'] 21 | self.embedding_dim = config['embedding_dim'] 22 | self.bidirectional = config['bidirectional'] 23 | 24 | self.embedding = nn.Embedding.from_pretrained( 25 | load_word_embeddings(), freeze=False) 26 | 27 | self.rnn = nn.LSTM(self.embedding_dim, self.hidden_dim, bias=True, 28 | num_layers=self.n_layers, dropout=self.dropout, 29 | bidirectional=self.bidirectional) 30 | self.n_directions = 2 if self.bidirectional else 1 31 | self.out = nn.Linear(self.n_directions * self.hidden_dim, self.output_dim) 32 | self.softmax = F.softmax 33 | 34 | def forward(self, input_seq, input_lengths): 35 | max_seq_len, bs = input_seq.size() 36 | # input_seq =. [max_seq_len, batch_size] 37 | embedded = self.embedding(input_seq) 38 | 39 | rnn_output, (hidden, _) = self.rnn(embedded) 40 | rnn_output = torch.cat((rnn_output[-1, :, :self.hidden_dim], 41 | rnn_output[0, :, self.hidden_dim:]), dim=1) 42 | # sum hidden states 43 | class_scores = F.softmax(self.out(rnn_output), dim=1) 44 | return class_scores 45 | 46 | 47 | if __name__ == '__main__': 48 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 49 | 50 | device = 'cuda:{}'.format(config['gpu']) if \ 51 | torch.cuda.is_available() else 'cpu' 52 | 53 | model = LSTMClassifier(config) 54 | model = model.to(device) 55 | criterion = nn.NLLLoss() 56 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 57 | 58 | train_batches = load_data() 59 | test_batch = load_data(test=True) 60 | 61 | best_acc = 0 62 | for epoch in range(config['n_epochs']): 63 | losses = [] 64 | for batch in train_batches: 65 | inputs, input_lengths, targets = batch 66 | inputs = inputs.to(device) 67 | input_lengths = input_lengths.to(device) 68 | targets = targets.to(device) 69 | 70 | model.zero_grad() 71 | optimizer.zero_grad() 72 | 73 | predictions = model(inputs, input_lengths) 74 | predictions = predictions.to(device) 75 | 76 | loss = criterion(predictions, targets) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | losses.append(loss.item()) 81 | 82 | # evaluate 83 | with torch.no_grad(): 84 | inputs, lengths, targets = test_batch 85 | 86 | inputs = inputs.to(device) 87 | lengths = lengths.to(device) 88 | targets = targets.to(device) 89 | 90 | predictions = torch.argmax(model(inputs, lengths), dim=1) # take argmax to get class id 91 | predictions = predictions.to(device) 92 | 93 | # evaluate on cpu 94 | targets = np.array(targets.cpu()) 95 | predictions = np.array(predictions.cpu()) 96 | 97 | # Get results 98 | # plot_confusion_matrix(targets, predictions, 99 | # classes=emotion_dict.keys()) 100 | performance = evaluate(targets, predictions) 101 | if performance['acc'] > best_acc: 102 | best_acc = performance['acc'] 103 | # save model and results 104 | torch.save({ 105 | 'model': model.state_dict(), 106 | 'optimizer': optimizer.state_dict() 107 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 108 | 109 | with open('results/{}-best_performance.pkl'.format( 110 | config['model_code']), 'wb') as f: 111 | pickle.dump(performance, f) 112 | -------------------------------------------------------------------------------- /2_build_audio_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Build Audio Vectors\n", 8 | "Now that the labels have been extracted, we'll use the compiled csv (df_iemocap.csv) to split the original wav files into multiple frames" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Try for one file first\n", 18 | "import librosa\n", 19 | "import os\n", 20 | "import soundfile as sf\n", 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import matplotlib.style as ms\n", 24 | "from tqdm import tqdm\n", 25 | "import pickle\n", 26 | "\n", 27 | "import IPython.display\n", 28 | "import librosa.display\n", 29 | "ms.use('seaborn-muted')\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "(array([ 0.42572615, 0.48587543, 0.37312022, ..., -0.31514615,\n", 42 | " -0.16263676, 0. ], dtype=float32), 44100)" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "file_path = 'data/IEMOCAP_full_release/Session1/dialog/wav/Ses01F_impro01.wav'\n", 52 | "\n", 53 | "y, sr = librosa.load(file_path, sr=44100)\n", 54 | "y, sr" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Loop through all the files" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import pandas as pd\n", 71 | "import math\n", 72 | "\n", 73 | "labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv')\n", 74 | "iemocap_dir = 'data/IEMOCAP_full_release/'" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "The following cells take some time until completely executed" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "100%|██████████| 31/31 [05:11<00:00, 8.83s/it]\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "sr = 44100\n", 99 | "audio_vectors = {}\n", 100 | "for sess in [5]: # using one session due to memory constraint, can replace [5] with range(1, 6)\n", 101 | " wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)\n", 102 | " orig_wav_files = os.listdir(wav_file_path)\n", 103 | " for orig_wav_file in tqdm(orig_wav_files):\n", 104 | " try:\n", 105 | " orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)\n", 106 | " orig_wav_file, file_format = orig_wav_file.split('.')\n", 107 | " for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():\n", 108 | " start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row['end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']\n", 109 | " start_frame = math.floor(start_time * sr)\n", 110 | " end_frame = math.floor(end_time * sr)\n", 111 | " truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]\n", 112 | " audio_vectors[truncated_wav_file_name] = truncated_wav_vector\n", 113 | " except:\n", 114 | " print('An exception occured for {}'.format(orig_wav_file))\n", 115 | " with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f:\n", 116 | " pickle.dump(audio_vectors, f)" 117 | ] 118 | }, 119 | { 120 | 
"cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.9" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal Speech Emotion Recognition and Ambiguity Resolution 2 | 3 | ## Overview 4 | Identifying emotion from speech is a non-trivial task pertaining to the ambiguous definition of emotion itself. In this work, we build light-weight multimodal machine learning models and compare it against the heavier and less interpretable deep learning counterparts. For both types of models, we use hand-crafted features from a given audio signal. Our experiments show that the light-weight models are comparable to the deep learning baselines and even outperform them in some cases, achieving state-of-the-art performance on the IEMOCAP dataset. 5 | 6 | The hand-crafted feature vectors obtained are used to train two types of models: 7 | 8 | 1. ML-based: Logistic Regression, SVMs, Random Forest, eXtreme Gradient Boosting and Multinomial Naive-Bayes. 9 | 2. DL-based: Multi-Layer Perceptron, LSTM Classifier 10 | 11 | This project was carried as a course project for the course CS 698 - Computational Audio taught by [Prof. Richard Mann](https://cs.uwaterloo.ca/~mannr/) at the University of Waterloo. For a more detailed explanation, please check the [report](https://arxiv.org/abs/1904.06022). 12 | 13 | ## Datasets 14 | The [IEMOCAP](https://link.springer.com/content/pdf/10.1007%2Fs10579-008-9076-6.pdf) dataset was used for all the experiments in this work. Please refer to the [report](https://arxiv.org/abs/1904.06022) for a detailed explanation of pre-processing steps applied to the dataset. 15 | 16 | ## Requirements 17 | All the experiments have been tested using the following libraries: 18 | - xgboost==0.82 19 | - torch==1.0.1.post2 20 | - scikit-learn==0.20.3 21 | - numpy==1.16.2 22 | - jupyter==1.0.0 23 | - pandas==0.24.1 24 | - librosa==0.7.0 25 | 26 | To avoid conflicts, it is recommended to setup a new python virtual environment to install these libraries. Once the env is setup, run `pip install -r requirements.txt` to install the dependencies. 27 | 28 | ## Instructions to run the code 29 | 1. Clone this repository by running `git clone git@github.com:Demfier/multimodal-speech-emotion-recognition`. 30 | 2. Go to the root directory of this project by running `cd multimodal-speech-emotion-recognition/` in your terminal. 31 | 3. Start a jupyter notebook by running `jupyter notebook` from the root of this project. 32 | 4. Run `1_extract_emotion_labels.ipynb` to extract labels from transriptions and compile other required data into a csv. 33 | 5. Run `2_build_audio_vectors.ipynb` to build vectors from the original wav files and save into a pickle file 34 | 6. Run `3_extract_audio_features.ipynb` to extract 8-dimensional audio feature vectors for the audio vectors 35 | 7. 
Run `4_prepare_data.ipynb` to preprocess and prepare audio + video data for experiments 36 | 8. It is recommended to train `LSTMClassifier` before running any other experiments for easy comparsion with other models later on: 37 | - Change `config.py` for any of the experiment settings. For instance, if you want to train a speech2emotion classifier, make necessary changes to `lstm_classifier/s2e/config.py`. Similar procedure follows for training text2emotion (`t2e`) and text+speech2emotion (`combined`) classifiers. 38 | - Run `python lstm_classifier.py` from `lstm_classifier/{exp_mode}` to train an LSTM classifier for the respective experiment mode (possible values of `exp_mode: s2e/t2e/combined`) 39 | 9. Run `5_audio_classification.ipynb` to train ML classifiers for audio 40 | 10. Run `5.1_sentence_classification.ipynb` to train ML classifiers for text 41 | 11. Run `5.2_combined_classification.ipynb` to train ML classifiers for audio+text 42 | 43 | **Note:** Make sure to include correct model paths in the notebooks as not everything is relative right now and it needs some refactoring 44 | 45 | **UPDATE**: You can access the preprocessed data files here to skip the steps 4-7: [https://www.dropbox.com/scl/fo/jdzz2y9nngw9rxsbz9vyj/h?rlkey=bji7zcqclusagzfwa7alm59hx&dl=0](https://www.dropbox.com/scl/fo/jdzz2y9nngw9rxsbz9vyj/h?rlkey=bji7zcqclusagzfwa7alm59hx&dl=0) 46 | 47 | ## Results 48 | Accuracy, F-score, Precision and Recall has been reported for the different experiments. 49 | 50 | **Audio** 51 | 52 | Models | Accuracy | F1 | Precision | Recall 53 | ---|---|---|---|--- 54 | RF | 56.0 | **56.0** | 57.2 | **57.3** 55 | XGB | 55.6 | **56.0** | 56.9 | 56.8 56 | SVM | 33.7 | 15.2 | 17.4 | 21.5 57 | MNB | 31.3 | 9.1 | 19.6 | 17.2 58 | LR | 33.4 | 14.9 | 17.8 | 20.9 59 | MLP | 41.0 | 36.5 | 42.2 | 35.9 60 | LSTM | 43.6 | 43.4 | 53.2 | 40.6 61 | ARE (4-class) | 56.3 | - | 54.6 | - 62 | E1 (4-class) | 56.2 | 45.9 | **67.6** | 48.9 63 | **E1** | **56.6** | 55.7 | 57.3 | **57.3** 64 | 65 | E1: Ensemble (RF + XGB + MLP) 66 | 67 | **Text** 68 | 69 | Models | Accuracy | F1 | Precision | Recall 70 | ---|---|---|---|--- 71 | RF | 62.2 | 60.8 | 65.0 | 62.0 72 | XGB | 56.9 | 55.0 | 70.3 | 51.8 73 | SVM | 62.1 | 61.7 | 62.5 | **63.5** 74 | MNB | 61.9 | 62.1 | **71.8** | 58.6 75 | LR | 64.2 | 64.3 | 69.5 | 62.3 76 | MLP | 60.6 | 61.5 | 62.4 | 63.0 77 | LSTM | 63.1 | 62.5 | 65.3 | 62.8 78 | TRE (4-class) | **65.5** | - | 63.5 | - 79 | E1 (4-class) | 63.1 | 61.4 | **67.7** | 59.0 80 | **E2** | 64.9 | **66.0** | 71.4 | 63.2 81 | 82 | E2: Ensemble (RF + XGB + MLP + MNB + LR) 83 | E1: Ensemble (RF + XGB + MLP) 84 | 85 | **Audio + Text** 86 | 87 | Models | Accuracy | F1 | Precision | Recall 88 | ---|---|---|---|--- 89 | RF | 65.3 | 65.8 | 69.3 | 65.5 90 | XGB | 62.2 | 63.1 | 67.9 | 61.7 91 | SVM | 63.4 | 63.8 | 63.1 | 65.6 92 | MNB | 60.5 | 60.3 | 70.3 | 57.1 93 | MLP | 66.1 | 68.1 | 68.0 | 69.6 94 | LR | 63.2 | 63.7 | 66.9 | 62.3 95 | LSTM | 64.2 | 64.7 | 66.1 | 65.0 96 | MDRE (4-class) | **75.3** | - | 71.8 | - 97 | E1 (4-class) | 70.3 | 67.5 | **73.2** | 65.5 98 | **E2** | 70.1 | **71.8** | 72.9 | **71.5** 99 | 100 | For more details, please refer to the [report](https://arxiv.org/abs/1904.06022) 101 | 102 | ## Citation 103 | If you find this work useful, please cite: 104 | 105 | ``` 106 | @article{sahu2019multimodal, 107 | title={Multimodal Speech Emotion Recognition and Ambiguity Resolution}, 108 | author={Sahu, Gaurav}, 109 | journal={arXiv preprint arXiv:1904.06022}, 110 | year={2019} 111 | } 112 | ``` 113 | 
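
## Example: fusing saved prediction probabilities

The `predict_probas.py` scripts under `lstm_classifier/t2e` and `lstm_classifier/combined` dump class-probability arrays (one row per test utterance, six columns) into `pred_probas/`. The sketch below shows one way such files could be combined; the equal-weight averaging and the `y_test.pkl` labels file are illustrative assumptions rather than the exact ensembling performed in the notebooks, and it assumes both arrays follow the same test-set ordering.

```python
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Arrays of shape [n_test_samples, n_classes] written by the predict_probas.py scripts
with open('pred_probas/text_lstm_classifier.pkl', 'rb') as f:
    text_probas = pickle.load(f)
with open('pred_probas/combined_lstm_classifier.pkl', 'rb') as f:
    combined_probas = pickle.load(f)

# Hypothetical pickle holding the matching test labels; adapt the path to
# wherever your test targets actually live.
with open('pred_probas/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

# Simple late fusion: average the class probabilities and take the argmax
fused_preds = np.argmax((text_probas + combined_probas) / 2.0, axis=1)
print('acc:', accuracy_score(y_test, fused_preds))
print('f1 :', f1_score(y_test, fused_preds, average='macro'))
```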
-------------------------------------------------------------------------------- /lstm_classifier/t2e/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pickle 4 | import gensim 5 | import numpy as np 6 | import pandas as pd 7 | from config import model_config as config 8 | from gensim.scripts.glove2word2vec import glove2word2vec 9 | 10 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 11 | 12 | import itertools 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | def generate_word_embeddings(vocab): 17 | if not os.path.exists('{}gensim.glove.6B.{}d.txt'.format( 18 | config['embeddings_dir'], config['embedding_dim'])): 19 | glove2word2vec(glove_input_file='{}glove.6B.{}d.txt'.format( 20 | config['embeddings_dir'], config['embedding_dim']), 21 | word2vec_output_file='{}gensim.glove.6B.{}d.txt'.format( 22 | config['embeddings_dir'], config['embedding_dim'])) 23 | 24 | embeddings_all = gensim.models.KeyedVectors.load_word2vec_format( 25 | '{}gensim.glove.6B.{}d.txt'.format(config['embeddings_dir'], 26 | config['embedding_dim'])) 27 | print('Loaded original embeddings') 28 | 29 | # initialize word embeddings matrix 30 | combined_word_embeddings = np.zeros((vocab.size, 31 | config['embedding_dim'])) 32 | for index, word in vocab.index2word.items(): 33 | try: 34 | if index < 4: # deal with special tokens 35 | combined_word_embeddings[index] = np.random.normal( 36 | size=(config['embedding_dim'], )) 37 | continue 38 | combined_word_embeddings[index] = embeddings_all[word] 39 | except KeyError as e: 40 | print('KeyError triggered for {}'.format(word)) 41 | combined_word_embeddings[index] = np.random.normal( 42 | size=(config['embedding_dim'], )) 43 | print('Created combined + filtered embeddings.') 44 | with open('{}saved_{}d_word_embeddings.pkl'.format( 45 | config['embeddings_dir'], config['embedding_dim']), 'wb') as f: 46 | pickle.dump(combined_word_embeddings, f) 47 | combined_word_embeddings = torch.from_numpy(combined_word_embeddings).float() 48 | return combined_word_embeddings 49 | 50 | 51 | def load_word_embeddings(): 52 | with open('{}saved_{}d_word_embeddings.pkl'.format( 53 | config['embeddings_dir'], config['embedding_dim']), 'rb') as f: 54 | combined_word_embeddings = pickle.load(f) 55 | return torch.from_numpy(combined_word_embeddings).float() 56 | 57 | 58 | def zero_padding(l, fillvalue=config['<PAD>']): 59 | return list(itertools.zip_longest(*l, fillvalue=fillvalue)) 60 | 61 | 62 | def binary_matrix(l, value=config['<PAD>']): 63 | m = [] 64 | for i, seq in enumerate(l): 65 | m.append([]) 66 | for token in seq: 67 | if token == 0: 68 | m[i].append(0) 69 | else: 70 | m[i].append(1) 71 | return m 72 | 73 | 74 | # Returns padded input sequence tensor and lengths 75 | def input_var(l, vocab): 76 | indexes_batch = [indexes_from_sentence(vocab, sentence) for sentence in l] 77 | for idx, indexes in enumerate(indexes_batch): 78 | indexes_batch[idx] = indexes_batch[idx] + [config['<EOS>']] 79 | lengths = torch.tensor([len(indexes) for indexes in indexes_batch]) 80 | pad_list = zero_padding(indexes_batch) 81 | pad_var = torch.LongTensor(pad_list) 82 | return pad_var, lengths 83 | 84 | 85 | def indexes_from_sentence(vocab, sentence): 86 | indexes = [] 87 | for word in sentence.strip().split(): 88 | try: 89 | indexes.append(vocab.word2index[word]) 90 | except KeyError as e: 91 | indexes.append(config['<UNK>']) 92 | return indexes[:config['max_sequence_length']] 93 | 94 | 95 | def
load_data(batched=True, test=False, file_dir='../../data/t2e/'): 96 | # Load vocab 97 | with open(config['vocab_path'], 'rb') as f: 98 | vocab = pickle.load(f) 99 | 100 | bs = config['batch_size'] 101 | ftype = 'test' if test else 'train' 102 | 103 | df = pd.read_csv('{}text_{}.csv'.format(file_dir, ftype)) 104 | data = (np.array(list(df['transcription'])), np.array(df['label'])) 105 | 106 | data = list(zip(data[0], data[1])) 107 | data.sort(key=lambda x: len(x[0].split()), reverse=True) 108 | 109 | n_iters = len(data) // bs 110 | 111 | if test: 112 | input_batch = [] 113 | output_batch = [] 114 | for e in data: 115 | input_batch.append(e[0]) 116 | output_batch.append(e[1]) 117 | inp, lengths = input_var(input_batch, vocab) 118 | return [inp, lengths, torch.LongTensor(output_batch)] 119 | 120 | batches = [] 121 | for i in range(1, n_iters + 1): 122 | input_batch = [] 123 | output_batch = [] 124 | for e in data[bs * (i-1):bs * i]: 125 | input_batch.append(e[0]) 126 | output_batch.append(e[1]) 127 | inp, lengths = input_var(input_batch, vocab) 128 | batches.append([inp, lengths, 129 | torch.LongTensor(output_batch)]) 130 | return batches 131 | 132 | 133 | def evaluate(targets, predictions): 134 | performance = { 135 | 'acc': accuracy_score(targets, predictions), 136 | 'f1': f1_score(targets, predictions, average='macro'), 137 | 'precision': precision_score(targets, predictions, average='macro'), 138 | 'recall': recall_score(targets, predictions, average='macro')} 139 | return performance 140 | 141 | 142 | def plot_confusion_matrix(targets, predictions, classes, 143 | normalize=False, 144 | title='Confusion matrix', 145 | cmap=plt.cm.Blues): 146 | """ 147 | This function prints and plots the confusion matrix. 148 | Normalization can be applied by setting `normalize=True`. 149 | """ 150 | # plt.figure(figsize=(8,8)) 151 | cm = confusion_matrix(targets, predictions) 152 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 153 | plt.title(title) 154 | plt.colorbar() 155 | tick_marks = np.arange(len(classes)) 156 | plt.xticks(tick_marks, classes, rotation=45) 157 | plt.yticks(tick_marks, classes) 158 | 159 | if normalize: 160 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 161 | print("Normalized confusion matrix") 162 | else: 163 | print('Confusion matrix, without normalization') 164 | 165 | print(cm) 166 | 167 | thresh = cm.max() / 2. 
168 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 169 | plt.text(j, i, cm[i, j], 170 | horizontalalignment="center", 171 | color="white" if cm[i, j] > thresh else "black") 172 | 173 | plt.tight_layout() 174 | plt.ylabel('True label') 175 | plt.xlabel('Predicted label') 176 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import math 4 | import random 5 | import pickle 6 | import librosa 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import soundfile as sf 11 | 12 | 13 | # Part 1: Extract Audio Labels 14 | def extract_audio_labels(): 15 | info_line = re.compile(r'\[.+\]\n', re.IGNORECASE) 16 | 17 | start_times, end_times, wav_file_names, emotions, vals, acts, doms = \ 18 | [], [], [], [], [], [], [] 19 | 20 | for sess in range(1, 6): 21 | emo_evaluation_dir = \ 22 | 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess) 23 | evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l] 24 | for file in evaluation_files: 25 | with open(emo_evaluation_dir + file) as f: 26 | content = f.read() 27 | info_lines = re.findall(info_line, content) 28 | for line in info_lines[1:]: # the first line is a header 29 | start_end_time, wav_file_name, emotion, val_act_dom = \ 30 | line.strip().split('\t') 31 | start_time, end_time = start_end_time[1:-1].split('-') 32 | val, act, dom = val_act_dom[1:-1].split(',') 33 | val, act, dom = float(val), float(act), float(dom) 34 | start_time, end_time = float(start_time), float(end_time) 35 | start_times.append(start_time) 36 | end_times.append(end_time) 37 | wav_file_names.append(wav_file_name) 38 | emotions.append(emotion) 39 | vals.append(val) 40 | acts.append(act) 41 | doms.append(dom) 42 | 43 | df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 44 | 'emotion', 'val', 'act', 'dom']) 45 | 46 | df_iemocap['start_time'] = start_times 47 | df_iemocap['end_time'] = end_times 48 | df_iemocap['wav_file'] = wav_file_names 49 | df_iemocap['emotion'] = emotions 50 | df_iemocap['val'] = vals 51 | df_iemocap['act'] = acts 52 | df_iemocap['dom'] = doms 53 | 54 | df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False) 55 | 56 | 57 | # Part 2: Build Audio Vectors 58 | def build_audio_vectors(): 59 | labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv') 60 | iemocap_dir = 'data/IEMOCAP_full_release/' 61 | 62 | sr = 44100 63 | audio_vectors = {} 64 | for sess in range(1, 6): # using one session due to memory constraint, can replace [5] with range(1, 6) 65 | wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess) 66 | orig_wav_files = os.listdir(wav_file_path) 67 | for orig_wav_file in tqdm(orig_wav_files): 68 | try: 69 | orig_wav_vector, _sr = librosa.load( 70 | wav_file_path + orig_wav_file, sr=sr) 71 | orig_wav_file, file_format = orig_wav_file.split('.') 72 | for index, row in labels_df[labels_df['wav_file'].str.contains( 73 | orig_wav_file)].iterrows(): 74 | start_time, end_time, truncated_wav_file_name, emotion,\ 75 | val, act, dom = row['start_time'], row['end_time'],\ 76 | row['wav_file'], row['emotion'], row['val'],\ 77 | row['act'], row['dom'] 78 | start_frame = math.floor(start_time * sr) 79 | end_frame = math.floor(end_time * sr) 80 | truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1] 81 | audio_vectors[truncated_wav_file_name] = truncated_wav_vector 82 | 
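                    # keyed by the truncated utterance name (e.g. Ses01F_impro01_F000) so the clips can be joined with the labels in df_iemocap.csv later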
except Exception as e: 83 | print('An exception occurred for {}: {}'.format(orig_wav_file, e)) 84 | with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f: 85 | pickle.dump(audio_vectors, f) 86 | 87 | 88 | # Part 3: Extract Audio Features 89 | def extract_audio_features(): 90 | data_dir = 'data/pre-processed/' 91 | labels_df_path = '{}df_iemocap.csv'.format(data_dir) 92 | audio_vectors_path = '{}audio_vectors_1.pkl'.format(data_dir) 93 | labels_df = pd.read_csv(labels_df_path) 94 | audio_vectors = pickle.load(open(audio_vectors_path, 'rb')) 95 | 96 | columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 97 | 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std'] 98 | df_features = pd.DataFrame(columns=columns) 99 | 100 | emotion_dict = {'ang': 0, 101 | 'hap': 1, 102 | 'exc': 2, 103 | 'sad': 3, 104 | 'fru': 4, 105 | 'fea': 5, 106 | 'sur': 6, 107 | 'neu': 7, 108 | 'xxx': 8, 109 | 'oth': 8} 110 | 111 | data_dir = 'data/pre-processed/' 112 | labels_path = '{}df_iemocap.csv'.format(data_dir) 113 | audio_vectors_path = '{}audio_vectors_'.format(data_dir) 114 | labels_df = pd.read_csv(labels_path) 115 | 116 | for sess in (range(1, 6)): 117 | audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb')) 118 | for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))].iterrows()): 119 | try: 120 | wav_file_name = row['wav_file'] 121 | label = emotion_dict[row['emotion']] 122 | y = audio_vectors[wav_file_name] 123 | 124 | feature_list = [wav_file_name, label] # wav_file, label 125 | sig_mean = np.mean(abs(y)) 126 | feature_list.append(sig_mean) # sig_mean 127 | feature_list.append(np.std(y)) # sig_std 128 | 129 | rmse = librosa.feature.rmse(y + 0.0001)[0] 130 | feature_list.append(np.mean(rmse)) # rmse_mean 131 | feature_list.append(np.std(rmse)) # rmse_std 132 | 133 | silence = 0 134 | for e in rmse: 135 | if e <= 0.4 * np.mean(rmse): 136 | silence += 1 137 | silence /= float(len(rmse)) 138 | feature_list.append(silence) # silence 139 | 140 | y_harmonic = librosa.effects.hpss(y)[0] 141 | feature_list.append(np.mean(y_harmonic) * 1000) # harmonic (scaled by 1000) 142 | 143 | # based on the pitch detection algorithm mentioned here: 144 | # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001 145 | cl = 0.45 * sig_mean 146 | center_clipped = [] 147 | for s in y: 148 | if s >= cl: 149 | center_clipped.append(s - cl) 150 | elif s <= -cl: 151 | center_clipped.append(s + cl) 152 | elif np.abs(s) < cl: 153 | center_clipped.append(0) 154 | auto_corrs = librosa.core.autocorrelate(np.array(center_clipped)) 155 | feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs)) # auto_corr_max (scaled by 1000) 156 | feature_list.append(np.std(auto_corrs)) # auto_corr_std 157 | 158 | df_features = df_features.append(pd.DataFrame( 159 | feature_list, index=columns).transpose(), 160 | ignore_index=True) 161 | except Exception as e: 162 | print('Some exception occurred: {}'.format(e)) 163 | 164 | df_features.to_csv('data/pre-processed/audio_features.csv', index=False) 165 | 166 | 167 | def main(): 168 | print('Part 1: Extract Audio Labels') 169 | extract_audio_labels() 170 | print('Part 2: Build Audio Vectors') 171 | build_audio_vectors() 172 | print('Part 3: Extract Audio Features') 173 | extract_audio_features() 174 | 175 | 176 | if __name__ == '__main__': 177 | main() 178 | -------------------------------------------------------------------------------- /1_extract_emotion_labels.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extract labels from the evaluation files\n", 8 | "\n", 9 | "Test for one file first" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import re\n", 19 | "\n", 20 | "# first test with one file\n", 21 | "file_path = 'data/IEMOCAP_full_release/Session1/dialog/EmoEvaluation/Ses01F_impro01.txt'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 8, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "useful_regex = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "with open(file_path) as f:\n", 40 | " file_content = f.read()\n", 41 | " \n", 42 | "info_lines = re.findall(useful_regex, file_content)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 20, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "['[6.2901 - 8.2357]', 'Ses01F_impro01_F000', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 55 | "['[10.0100 - 11.3925]', 'Ses01F_impro01_F001', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 56 | "['[14.8872 - 18.0175]', 'Ses01F_impro01_F002', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 57 | "['[19.2900 - 20.7875]', 'Ses01F_impro01_F003', 'xxx', '[2.5000, 3.0000, 3.0000]']\n", 58 | "['[21.3257 - 24.7400]', 'Ses01F_impro01_F004', 'xxx', '[2.5000, 3.0000, 2.5000]']\n", 59 | "['[27.4600 - 31.4900]', 'Ses01F_impro01_F005', 'neu', '[2.5000, 3.5000, 2.0000]']\n", 60 | "['[38.9650 - 43.5900]', 'Ses01F_impro01_F006', 'fru', '[2.0000, 3.5000, 3.5000]']\n", 61 | "['[46.5800 - 52.1900]', 'Ses01F_impro01_F007', 'fru', '[2.5000, 3.5000, 3.5000]']\n", 62 | "['[56.1600 - 58.8225]', 'Ses01F_impro01_F008', 'fru', '[2.0000, 3.5000, 3.5000]']\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "for l in info_lines[1:10]:\n", 68 | " print(l.strip().split('\\t'))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Compile all the information in a single file" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 64, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import re\n", 85 | "import os\n", 86 | "\n", 87 | "\n", 88 | "info_line = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)\n", 89 | "\n", 90 | "start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []\n", 91 | "\n", 92 | "for sess in range(1, 6):\n", 93 | " emo_evaluation_dir = 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)\n", 94 | " evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]\n", 95 | " for file in evaluation_files:\n", 96 | " with open(emo_evaluation_dir + file) as f:\n", 97 | " content = f.read()\n", 98 | " info_lines = re.findall(info_line, content)\n", 99 | " for line in info_lines[1:]: # the first line is a header\n", 100 | " start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\\t')\n", 101 | " start_time, end_time = start_end_time[1:-1].split('-')\n", 102 | " val, act, dom = val_act_dom[1:-1].split(',')\n", 103 | " val, act, dom = float(val), float(act), float(dom)\n", 104 | " start_time, end_time = float(start_time), float(end_time)\n", 105 | " 
start_times.append(start_time)\n", 106 | " end_times.append(end_time)\n", 107 | " wav_file_names.append(wav_file_name)\n", 108 | " emotions.append(emotion)\n", 109 | " vals.append(val)\n", 110 | " acts.append(act)\n", 111 | " doms.append(dom)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 68, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
start_timeend_timewav_fileemotionvalactdom
10034358.10365.26Ses05F_impro05_M049fru2.53.54.5
10035365.30370.53Ses05F_impro05_M050neu2.53.54.0
10036371.63374.16Ses05F_impro05_M051neu3.02.52.5
10037375.10385.14Ses05F_impro05_M052neu3.53.03.5
10038386.39388.27Ses05F_impro05_M053neu4.02.53.0
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " start_time end_time wav_file emotion val act dom\n", 206 | "10034 358.10 365.26 Ses05F_impro05_M049 fru 2.5 3.5 4.5\n", 207 | "10035 365.30 370.53 Ses05F_impro05_M050 neu 2.5 3.5 4.0\n", 208 | "10036 371.63 374.16 Ses05F_impro05_M051 neu 3.0 2.5 2.5\n", 209 | "10037 375.10 385.14 Ses05F_impro05_M052 neu 3.5 3.0 3.5\n", 210 | "10038 386.39 388.27 Ses05F_impro05_M053 neu 4.0 2.5 3.0" 211 | ] 212 | }, 213 | "execution_count": 68, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "import pandas as pd\n", 220 | "\n", 221 | "df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])\n", 222 | "\n", 223 | "df_iemocap['start_time'] = start_times\n", 224 | "df_iemocap['end_time'] = end_times\n", 225 | "df_iemocap['wav_file'] = wav_file_names\n", 226 | "df_iemocap['emotion'] = emotions\n", 227 | "df_iemocap['val'] = vals\n", 228 | "df_iemocap['act'] = acts\n", 229 | "df_iemocap['dom'] = doms\n", 230 | "\n", 231 | "df_iemocap.tail()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 72, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False)" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.6.9" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /src/train_sentence_classifiers_sklearn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trains different classifiers available in sklearn for sentence classification 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import pickle 8 | 9 | import itertools 10 | import xgboost as xgb 11 | from sklearn.svm import LinearSVC 12 | from sklearn.naive_bayes import MultinomialNB 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.ensemble import RandomForestClassifier 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.feature_selection import SelectFromModel 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | 20 | from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer 21 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score 22 | 23 | import seaborn as sns 24 | import matplotlib.pyplot as plt 25 | 26 | EMOTION_DICT = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 27 | EMO_KEYS = list(['ang', 'hap', 'sad', 'fea', 'sur', 'neu']) 28 | 29 | 30 | def train_tfidf_vectors(df): 31 | tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', 32 | encoding='latin-1', ngram_range=(1, 2), 33 | stop_words='english') 34 | return tfidf.fit_transform(df.transcription).toarray() 35 | 36 | 37 | def create_train_test_split(features, labels, test_size=0.2): 38 | return train_test_split(features, labels, test_size=0.20) 39 | 40 | 41 | def plot_confusion_matrix(cm, classes, 42 | 
normalize=False, 43 | title='Confusion matrix', 44 | cmap=plt.cm.Blues): 45 | """ 46 | This function prints and plots the confusion matrix. 47 | Normalization can be applied by setting `normalize=True`. 48 | """ 49 | # plt.figure(figsize=(8,8)) 50 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 51 | plt.title(title) 52 | plt.colorbar() 53 | tick_marks = np.arange(len(classes)) 54 | plt.xticks(tick_marks, classes, rotation=45) 55 | plt.yticks(tick_marks, classes) 56 | 57 | if normalize: 58 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 59 | print("Normalized confusion matrix") 60 | else: 61 | print('Confusion matrix, without normalization') 62 | 63 | print(cm) 64 | 65 | thresh = cm.max() / 2. 66 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 67 | plt.text(j, i, cm[i, j], 68 | horizontalalignment="center", 69 | color="white" if cm[i, j] > thresh else "black") 70 | 71 | plt.tight_layout() 72 | plt.ylabel('True label') 73 | plt.xlabel('Predicted label') 74 | 75 | 76 | def one_hot_encoder(true_labels, num_records, num_classes): 77 | temp = np.array(true_labels[:num_records]) 78 | true_labels = np.zeros((num_records, num_classes)) 79 | true_labels[np.arange(num_records), temp] = 1 80 | return true_labels 81 | 82 | 83 | def display_results(y_test, pred_probs, cm=True): 84 | pred = np.argmax(pred_probs, axis=-1) 85 | one_hot_true = one_hot_encoder(y_test, len(pred), len(EMOTION_DICT)) 86 | print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred))) 87 | print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro'))) 88 | print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro'))) 89 | print('Test Set Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro'))) 90 | if cm: 91 | plot_confusion_matrix(confusion_matrix(y_test, pred), classes=EMO_KEYS) 92 | 93 | 94 | def model_random_forest_classifier(x_train, y_train, x_test, y_test): 95 | rf_classifier = RandomForestClassifier(n_estimators=6000, 96 | min_samples_split=25) 97 | rf_classifier.fit(x_train, y_train) 98 | 99 | # Predict 100 | pred_probs = rf_classifier.predict_proba(x_test) 101 | 102 | # Results 103 | display_results(y_test, pred_probs) 104 | 105 | with open('pred_probas/text_rf_classifier.pkl', 'wb') as f: 106 | pickle.dump(pred_probs, f) 107 | 108 | 109 | def model_xgb_classifier(x_train, y_train, x_test, y_test): 110 | xgb_classifier = xgb.XGBClassifier(max_depth=7, learning_rate=0.008, 111 | objective='multi:softprob', 112 | n_estimators=600, sub_sample=0.8, 113 | num_class=len(EMOTION_DICT), 114 | booster='gbtree', n_jobs=4) 115 | xgb_classifier.fit(x_train, y_train) 116 | 117 | # Predict 118 | pred_probs = xgb_classifier.predict_proba(x_test) 119 | 120 | # Results 121 | display_results(y_test, pred_probs) 122 | 123 | with open('pred_probas/text_xgb_classifier.pkl', 'wb') as f: 124 | pickle.dump(pred_probs, f) 125 | 126 | 127 | def model_svc_classifier(x_train, y_train, x_test, y_test): 128 | svc_classifier = LinearSVC() 129 | 130 | svc_classifier.fit(x_train, y_train) 131 | 132 | # Predict 133 | pred = svc_classifier.predict(x_test) 134 | 135 | # Results 136 | one_hot_true = one_hot_encoder(y_test, len(pred), len(EMOTION_DICT)) 137 | print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred))) 138 | print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro'))) 139 | print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro'))) 140 | print('Test Set 
Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro'))) 141 | plot_confusion_matrix(confusion_matrix(y_test, pred), classes=EMOTION_DICT.keys()) 142 | 143 | with open('pred_probas/text_svc_classifier_model.pkl', 'wb') as f: 144 | pickle.dump(svc_classifier, f) 145 | 146 | 147 | def model_multinomial_naive_bayes_classifier(x_train, y_train, x_test, y_test): 148 | mnb_classifier = MultinomialNB() 149 | 150 | mnb_classifier.fit(x_train, y_train) 151 | 152 | # Predict 153 | pred_probs = mnb_classifier.predict_proba(x_test) 154 | 155 | # Results 156 | display_results(y_test, pred_probs) 157 | 158 | with open('pred_probas/text_mnb_classifier.pkl', 'wb') as f: 159 | pickle.dump(pred_probs, f) 160 | 161 | 162 | def model_mlp_classifier(x_train, y_train, x_test, y_test): 163 | mlp_classifier = MLPClassifier(hidden_layer_sizes=(500, ), 164 | activation='relu', solver='adam', 165 | alpha=0.0001, batch_size='auto', 166 | learning_rate='adaptive', 167 | learning_rate_init=0.01, power_t=0.5, 168 | max_iter=1000, shuffle=True, 169 | random_state=None, tol=0.0001, 170 | verbose=False, warm_start=True, 171 | momentum=0.8, nesterovs_momentum=True, 172 | early_stopping=False, 173 | validation_fraction=0.1, 174 | beta_1=0.9, beta_2=0.999, epsilon=1e-08) 175 | 176 | mlp_classifier.fit(x_train, y_train) 177 | 178 | # Predict 179 | pred_probs = mlp_classifier.predict_proba(x_test) 180 | 181 | # Results 182 | display_results(y_test, pred_probs) 183 | 184 | with open('pred_probas/text_mlp_classifier.pkl', 'wb') as f: 185 | pickle.dump(pred_probs, f) 186 | 187 | 188 | def model_lr_classifier(x_train, y_train, x_test, y_test): 189 | lr_classifier = LogisticRegression(solver='lbfgs', 190 | multi_class='multinomial', 191 | max_iter=1000) 192 | 193 | lr_classifier.fit(x_train, y_train) 194 | 195 | # Predict 196 | pred_probs = lr_classifier.predict_proba(x_test) 197 | 198 | # Results 199 | display_results(y_test, pred_probs) 200 | 201 | with open('pred_probas/text_lr_classifier.pkl', 'wb') as f: 202 | pickle.dump(pred_probs, f) 203 | 204 | 205 | def model_ensemble_of_classifiers(y_test): 206 | # Load predicted probabilities 207 | with open('pred_probas/text_rf_classifier.pkl', 'rb') as f: 208 | rf_pred_probs = pickle.load(f) 209 | 210 | with open('pred_probas/text_xgb_classifier.pkl', 'rb') as f: 211 | xgb_pred_probs = pickle.load(f) 212 | 213 | with open('pred_probas/text_svc_classifier_model.pkl', 'rb') as f: 214 | svc_preds = pickle.load(f) 215 | 216 | with open('pred_probas/text_mnb_classifier.pkl', 'rb') as f: 217 | mnb_pred_probs = pickle.load(f) 218 | 219 | with open('pred_probas/text_mlp_classifier.pkl', 'rb') as f: 220 | mlp_pred_probs = pickle.load(f) 221 | 222 | with open('pred_probas/text_lr_classifier.pkl', 'rb') as f: 223 | lr_pred_probs = pickle.load(f) 224 | 225 | # Average of the predicted probabilities 226 | ensemble_pred_probs = (xgb_pred_probs + 227 | mlp_pred_probs + 228 | rf_pred_probs + 229 | mnb_pred_probs + 230 | lr_pred_probs)/5.0 231 | 232 | # Show metrics 233 | display_results(y_test, ensemble_pred_probs) 234 | 235 | 236 | def load_data(): 237 | df = pd.read_csv('data/t2e/text_train.csv') 238 | df = df.append(pd.read_csv('data/t2e/text_test.csv')) 239 | features = train_tfidf_vectors(df) 240 | labels = df.label 241 | return features, labels 242 | 243 | 244 | def main(): 245 | x_train, x_test, y_train, y_test = create_train_test_split(*load_data()) 246 | model_random_forest_classifier(x_train, y_train, x_test, y_test) 247 | model_xgb_classifier(x_train, y_train, x_test, y_test)
248 | model_svc_classifier(x_train, y_train, x_test, y_test) 249 | model_multinomial_naive_bayes_classifier(x_train, y_train, x_test, y_test) 250 | model_mlp_classifier(x_train, y_train, x_test, y_test) 251 | model_lr_classifier(x_train, y_train, x_test, y_test) 252 | model_ensemble_of_classifiers(y_test) 253 | 254 | 255 | if __name__ == '__main__': 256 | main() 257 | -------------------------------------------------------------------------------- /4_prepare_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Build Speech data files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.preprocessing import MinMaxScaler\n", 19 | "from IPython.display import display\n", 20 | "\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "(7527, 10)\n" 34 | ] 35 | }, 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00070.0036710.0057390.0044340.0036400.018692-0.0081430.0231790.133057
1Ses01F_script02_2_F00170.0063650.0111550.0079130.0078500.444444-0.0171200.0945780.213759
6Ses01F_script02_2_F00600.0396590.0679390.0499300.0460500.345018-0.0046053.4417049.317455
7Ses01F_script02_2_F00740.0144780.0269410.0183840.0196870.422764-0.0118500.5682611.928247
8Ses01F_script02_2_F00800.0252710.0549580.0315710.0449580.470019-0.0051202.5293999.210082
\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 141 | "0 Ses01F_script02_2_F000 7 0.003671 0.005739 0.004434 0.003640 \n", 142 | "1 Ses01F_script02_2_F001 7 0.006365 0.011155 0.007913 0.007850 \n", 143 | "6 Ses01F_script02_2_F006 0 0.039659 0.067939 0.049930 0.046050 \n", 144 | "7 Ses01F_script02_2_F007 4 0.014478 0.026941 0.018384 0.019687 \n", 145 | "8 Ses01F_script02_2_F008 0 0.025271 0.054958 0.031571 0.044958 \n", 146 | "\n", 147 | " silence harmonic auto_corr_max auto_corr_std \n", 148 | "0 0.018692 -0.008143 0.023179 0.133057 \n", 149 | "1 0.444444 -0.017120 0.094578 0.213759 \n", 150 | "6 0.345018 -0.004605 3.441704 9.317455 \n", 151 | "7 0.422764 -0.011850 0.568261 1.928247 \n", 152 | "8 0.470019 -0.005120 2.529399 9.210082 " 153 | ] 154 | }, 155 | "metadata": {}, 156 | "output_type": "display_data" 157 | }, 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00050.0036710.0057390.0044340.0036400.018692-0.0081430.0231790.133057
1Ses01F_script02_2_F00150.0063650.0111550.0079130.0078500.444444-0.0171200.0945780.213759
6Ses01F_script02_2_F00600.0396590.0679390.0499300.0460500.345018-0.0046053.4417049.317455
7Ses01F_script02_2_F00720.0144780.0269410.0183840.0196870.422764-0.0118500.5682611.928247
8Ses01F_script02_2_F00800.0252710.0549580.0315710.0449580.470019-0.0051202.5293999.210082
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 263 | "0 Ses01F_script02_2_F000 5 0.003671 0.005739 0.004434 0.003640 \n", 264 | "1 Ses01F_script02_2_F001 5 0.006365 0.011155 0.007913 0.007850 \n", 265 | "6 Ses01F_script02_2_F006 0 0.039659 0.067939 0.049930 0.046050 \n", 266 | "7 Ses01F_script02_2_F007 2 0.014478 0.026941 0.018384 0.019687 \n", 267 | "8 Ses01F_script02_2_F008 0 0.025271 0.054958 0.031571 0.044958 \n", 268 | "\n", 269 | " silence harmonic auto_corr_max auto_corr_std \n", 270 | "0 0.018692 -0.008143 0.023179 0.133057 \n", 271 | "1 0.444444 -0.017120 0.094578 0.213759 \n", 272 | "6 0.345018 -0.004605 3.441704 9.317455 \n", 273 | "7 0.422764 -0.011850 0.568261 1.928247 \n", 274 | "8 0.470019 -0.005120 2.529399 9.210082 " 275 | ] 276 | }, 277 | "execution_count": 2, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "df = pd.read_csv('data/audio_features.csv')\n", 284 | "df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]\n", 285 | "print(df.shape)\n", 286 | "display(df.head())\n", 287 | "\n", 288 | "# change 7 to 2\n", 289 | "df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})\n", 290 | "df.head()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 3, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "df.to_csv('data/no_sample_df.csv')\n", 300 | "\n", 301 | "# oversample fear\n", 302 | "fear_df = df[df['label']==3]\n", 303 | "for i in range(30):\n", 304 | " df = df.append(fear_df)\n", 305 | "\n", 306 | "sur_df = df[df['label']==4]\n", 307 | "for i in range(10):\n", 308 | " df = df.append(sur_df)\n", 309 | " \n", 310 | "df.to_csv('data/modified_df.csv')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 4, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/html": [ 321 | "
\n", 322 | "\n", 335 | "\n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00050.0108470.0132900.0107150.0193860.0243130.1686250.0002770.000468
1Ses01F_script02_2_F00150.0203060.0277020.0207740.0424890.5781120.1668680.0011410.000753
6Ses01F_script02_2_F00600.1372060.1788220.1422710.2520960.4487830.1693170.0416440.032933
7Ses01F_script02_2_F00720.0487930.0697130.0510510.1074390.5499110.1678990.0068730.006814
8Ses01F_script02_2_F00800.0866860.1442760.0891840.2461000.6113790.1692160.0306040.032553
\n", 419 | "
" 420 | ], 421 | "text/plain": [ 422 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 423 | "0 Ses01F_script02_2_F000 5 0.010847 0.013290 0.010715 0.019386 \n", 424 | "1 Ses01F_script02_2_F001 5 0.020306 0.027702 0.020774 0.042489 \n", 425 | "6 Ses01F_script02_2_F006 0 0.137206 0.178822 0.142271 0.252096 \n", 426 | "7 Ses01F_script02_2_F007 2 0.048793 0.069713 0.051051 0.107439 \n", 427 | "8 Ses01F_script02_2_F008 0 0.086686 0.144276 0.089184 0.246100 \n", 428 | "\n", 429 | " silence harmonic auto_corr_max auto_corr_std \n", 430 | "0 0.024313 0.168625 0.000277 0.000468 \n", 431 | "1 0.578112 0.166868 0.001141 0.000753 \n", 432 | "6 0.448783 0.169317 0.041644 0.032933 \n", 433 | "7 0.549911 0.167899 0.006873 0.006814 \n", 434 | "8 0.611379 0.169216 0.030604 0.032553 " 435 | ] 436 | }, 437 | "execution_count": 4, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "emotion_dict = {'ang': 0,\n", 444 | " 'hap': 1,\n", 445 | " 'sad': 2,\n", 446 | " 'neu': 3,}\n", 447 | "\n", 448 | "# emotion_dict = {'ang': 0,\n", 449 | "# 'hap': 1,\n", 450 | "# 'exc': 2,\n", 451 | "# 'sad': 3,\n", 452 | "# 'fru': 4,\n", 453 | "# 'fea': 5,\n", 454 | "# 'sur': 6,\n", 455 | "# 'neu': 7,\n", 456 | "# 'xxx': 8,\n", 457 | "# 'oth': 8}\n", 458 | "\n", 459 | "scalar = MinMaxScaler()\n", 460 | "df[df.columns[2:]] = scalar.fit_transform(df[df.columns[2:]])\n", 461 | "df.head()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 5, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "name": "stdout", 471 | "output_type": "stream", 472 | "text": [ 473 | "(7837, 10) (1960, 10)\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "x_train, x_test = train_test_split(df, test_size=0.20)\n", 479 | "\n", 480 | "x_train.to_csv('data/s2e/audio_train.csv', index=False)\n", 481 | "x_test.to_csv('data/s2e/audio_test.csv', index=False)\n", 482 | "\n", 483 | "print(x_train.shape, x_test.shape)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "## Define preprocessing functions for text" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 6, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "import unicodedata\n", 500 | "\n", 501 | "def unicodeToAscii(s):\n", 502 | " return ''.join(\n", 503 | " c for c in unicodedata.normalize('NFD', s)\n", 504 | " if unicodedata.category(c) != 'Mn'\n", 505 | " )\n", 506 | "\n", 507 | "# Lowercase, trim, and remove non-letter characters\n", 508 | "def normalizeString(s):\n", 509 | " s = unicodeToAscii(s.lower().strip())\n", 510 | " s = re.sub(r\"([.!?])\", r\" \\1\", s)\n", 511 | " s = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", s)\n", 512 | " return s" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "## Build Text data files" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 7, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "10087" 531 | ] 532 | }, 533 | "execution_count": 7, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "import re\n", 540 | "import os\n", 541 | "import pickle\n", 542 | "\n", 543 | "useful_regex = re.compile(r'^(\\w+)', re.IGNORECASE)\n", 544 | "\n", 545 | "file2transcriptions = {}\n", 546 | "\n", 547 | "for sess in range(1, 6):\n", 548 | " transcripts_path = 
'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)\n", 549 | " transcript_files = os.listdir(transcripts_path)\n", 550 | " for f in transcript_files:\n", 551 | " with open('{}{}'.format(transcripts_path, f), 'r') as f:\n", 552 | " all_lines = f.readlines()\n", 553 | "\n", 554 | " for l in all_lines:\n", 555 | " audio_code = useful_regex.match(l).group()\n", 556 | " transcription = l.split(':')[-1].strip()\n", 557 | " # assuming that all the keys would be unique and hence no `try`\n", 558 | " file2transcriptions[audio_code] = transcription\n", 559 | "# save dict\n", 560 | "with open('data/t2e/audiocode2text.pkl', 'wb') as file:\n", 561 | " pickle.dump(file2transcriptions, file)\n", 562 | "len(file2transcriptions)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 8, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "audiocode2text = pickle.load(open('data/t2e/audiocode2text.pkl', 'rb'))" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 9, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "(7837, 3) (1960, 3)\n" 584 | ] 585 | } 586 | ], 587 | "source": [ 588 | "# Prepare text data\n", 589 | "text_train = pd.DataFrame()\n", 590 | "text_train['wav_file'] = x_train['wav_file']\n", 591 | "text_train['label'] = x_train['label']\n", 592 | "text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]\n", 593 | "\n", 594 | "text_test = pd.DataFrame()\n", 595 | "text_test['wav_file'] = x_test['wav_file']\n", 596 | "text_test['label'] = x_test['label']\n", 597 | "text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]\n", 598 | "\n", 599 | "text_train.to_csv('data/t2e/text_train.csv', index=False)\n", 600 | "text_test.to_csv('data/t2e/text_test.csv', index=False)\n", 601 | "\n", 602 | "print(text_train.shape, text_test.shape)" 603 | ] 604 | } 605 | ], 606 | "metadata": { 607 | "kernelspec": { 608 | "display_name": "Python 3", 609 | "language": "python", 610 | "name": "python3" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 3 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython3", 622 | "version": "3.6.9" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 2 627 | } 628 | --------------------------------------------------------------------------------
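The least obvious part of extract_audio_features in main.py is the pair of hand-crafted features computed at the end of the loop: the silence ratio (fraction of low-energy RMS frames) and the maximum and spread of the autocorrelation of the center-clipped signal, following the linked pitch-detection write-up. Below is a minimal, self-contained sketch of just those two computations on a synthetic tone. It is an illustration only: the input signal is made up, and it calls librosa.feature.rms, the current name of the RMS helper that main.py invokes as librosa.feature.rmse.

import numpy as np
import librosa

# A 1-second synthetic "utterance": a 220 Hz tone with a near-silent lead-in,
# standing in for one of the truncated IEMOCAP wav vectors.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = 0.5 * np.sin(2 * np.pi * 220 * t)
y[: sr // 4] *= 0.01

# Frame-wise RMS energy (main.py calls the older librosa.feature.rmse).
rmse = librosa.feature.rms(y=y + 0.0001)[0]

# Silence ratio: fraction of frames whose energy is below 40% of the mean energy.
silence = float(np.sum(rmse <= 0.4 * np.mean(rmse))) / len(rmse)

# Center clipping before autocorrelation, as in the referenced pitch-detection
# method: samples inside the +/- cl band are zeroed, the rest are shifted toward zero.
sig_mean = np.mean(np.abs(y))
cl = 0.45 * sig_mean
center_clipped = np.where(y >= cl, y - cl,
                          np.where(y <= -cl, y + cl, 0.0))

auto_corrs = librosa.autocorrelate(center_clipped)
auto_corr_max = 1000 * np.max(auto_corrs) / len(auto_corrs)  # scaled as in main.py
auto_corr_std = np.std(auto_corrs)

print('silence ratio: {0:.3f}'.format(silence))
print('auto_corr_max: {0:.3f}'.format(auto_corr_max))
print('auto_corr_std: {0:.3f}'.format(auto_corr_std))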
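The ensemble step in src/train_sentence_classifiers_sklearn.py is likewise simpler than it may look: the probability-based classifiers pickle their (n_samples x n_classes) prediction matrices into pred_probas/, and model_ensemble_of_classifiers averages five of those matrices element-wise before taking an argmax per row. The following is a small sketch of that soft-voting average with randomly generated stand-ins for the pickled arrays; the shapes and values are illustrative, not the repository's actual outputs.

import numpy as np
from sklearn.metrics import accuracy_score

np.random.seed(0)

# Stand-ins for the pickled (n_samples x n_classes) probability matrices
# produced by the individual text classifiers (rf, xgb, mnb, mlp, lr).
n_samples, n_classes = 10, 6
y_test = np.random.randint(0, n_classes, size=n_samples)
model_probas = [np.random.dirichlet(np.ones(n_classes), size=n_samples)
                for _ in range(5)]

# The ensemble is a plain average of the probability matrices,
# followed by an argmax over the class axis.
ensemble_probas = np.mean(model_probas, axis=0)
pred = np.argmax(ensemble_probas, axis=-1)

print('ensemble accuracy on dummy labels: {0:.3f}'.format(
    accuracy_score(y_test, pred)))

Averaging class probabilities (soft voting) rather than majority-voting on hard labels lets a confident minority of models outweigh an uncertain majority, which is why the script stores full probability matrices instead of predicted labels.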