├── config.py ├── pred_probas └── .gitkeep ├── trained_models └── .gitkeep ├── lstm_classifier ├── __init__.py ├── s2e │ ├── __init__.py │ ├── config.py │ ├── utils.py │ └── lstm_classifier.py ├── t2e │ ├── __init__.py │ ├── predict_probas.py │ ├── config.py │ ├── create_vocab.py │ ├── lstm_classifier.py │ └── utils.py └── combined │ ├── __init__.py │ ├── config.py │ ├── predict_probas.py │ ├── utils.py │ └── lstm_classifier.py ├── .gitignore ├── requirements.txt ├── LICENSE ├── src ├── build_audio_vectors.py ├── prepare_data.py ├── extract_emotion_labels.py ├── extract_audio_features.py └── train_sentence_classifiers_sklearn.py ├── 2_build_audio_vectors.ipynb ├── README.md ├── main.py ├── 1_extract_emotion_labels.ipynb └── 4_prepare_data.ipynb /config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pred_probas/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trained_models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lstm_classifier/combined/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *checkpoint* 2 | *.pyc 3 | *.csv 4 | *.txt 5 | *.wav 6 | data/* 7 | *.pkl 8 | *.zip 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | xgboost==0.82 2 | flask 3 | numpy==1.16.2 4 | scikit-learn==0.20.3 5 | scipy==1.4.1 6 | pandas 7 | librosa 8 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/config.py: -------------------------------------------------------------------------------- 1 | model_config = { 2 | 'gpu': 0, 3 | 'bidirectional': False, 4 | 'input_dim': 8, 5 | 'hidden_dim': 50, 6 | 'output_dim': 6, # number of classes 7 | 'dropout': 0.2, 8 | 'learning_rate': 0.01, 9 | 'batch_size': 1567, # carefully chosen 10 | 'n_epochs': 55000, 11 | 'n_layers': 2, 12 | 'model_code': 'basic_lstm' 13 | } 14 | -------------------------------------------------------------------------------- /lstm_classifier/combined/config.py: -------------------------------------------------------------------------------- 1 | model_config = { 2 | 'gpu': 1, 3 | 'n_layers': 2, 4 | 'dropout': 0.2, 5 | 'output_dim': 6, # number of classes 6 | 'hidden_dim': 256, 7 | 'input_dim': 2472, 8 | 'batch_size': 200, # carefully chosen 9 | 'n_epochs': 55000, 10 | 
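# n_epochs is an upper bound: lstm_classifier.py evaluates after every epoch and only checkpoints models that improve test accuracy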
'learning_rate': 0.001, 11 | 'bidirectional': True, 12 | 'model_code': 'bi_lstm' 13 | } 14 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/predict_probas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from lstm_classifier import LSTMClassifier 6 | from config import model_config as config 7 | from utils import load_data 8 | 9 | 10 | # Load test data 11 | test_pairs = load_data(test=True) 12 | inputs, lengths, targets = test_pairs 13 | 14 | # Load pretrained model 15 | model = LSTMClassifier(config) 16 | checkpoint = torch.load('runs/{}-best_model.pth'.format(config['model_code']), 17 | map_location='cpu') 18 | model.load_state_dict(checkpoint['model']) 19 | 20 | with torch.no_grad(): 21 | # Predict 22 | predict_probas = model(inputs, lengths).cpu().numpy() 23 | 24 | with open('../../pred_probas/text_lstm_classifier.pkl', 'wb') as f: 25 | pickle.dump(predict_probas, f) 26 | -------------------------------------------------------------------------------- /lstm_classifier/combined/predict_probas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from lstm_classifier import LSTMClassifier 6 | from config import model_config as config 7 | from utils import load_data 8 | 9 | 10 | # Load test data 11 | test_pairs = load_data(test=True) 12 | inputs, targets = test_pairs 13 | inputs = inputs.unsqueeze(0) 14 | 15 | # Load pretrained model 16 | model = LSTMClassifier(config) 17 | checkpoint = torch.load('runs/{}-best_model.pth'.format(config['model_code']), 18 | map_location='cpu') 19 | model.load_state_dict(checkpoint['model']) 20 | 21 | with torch.no_grad(): 22 | # Predict 23 | predict_probas = model(inputs).cpu().numpy() 24 | 25 | with open('../../pred_probas/combined_lstm_classifier.pkl', 'wb') as f: 26 | pickle.dump(predict_probas, f) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Gaurav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import gensim 4 | from create_vocab import Vocabulary, create_vocab 5 | 6 | model_config = { 7 | 'gpu': 1, 8 | '<PAD>': 0, 9 | '<SOS>': 1, 10 | '<EOS>': 2, 11 | '<UNK>': 3, 12 | 'n_layers': 2, 13 | 'dropout': 0.2, 14 | 'output_dim': 6, # number of classes 15 | 'hidden_dim': 500, 16 | 'n_epochs': 45000, 17 | 'batch_size': 128, # carefully chosen 18 | 'embedding_dim': 200, # 50/100/200/300 19 | 'bidirectional': True, 20 | 'learning_rate': 0.0001, 21 | 'model_code': 'bi_lstm_2_layer', 22 | 'max_sequence_length': 20, 23 | 'embeddings_dir': 'embeddings/' 24 | } 25 | 26 | 27 | from utils import generate_word_embeddings 28 | 29 | 30 | def set_dynamic_hparams(): 31 | try: 32 | with open('vocab.pkl', 'rb') as f: 33 | vocab = pickle.load(f) 34 | except FileNotFoundError as e: 35 | vocab = create_vocab() 36 | generate_word_embeddings(vocab) 37 | 38 | model_config['vocab_size'] = vocab.size 39 | model_config['vocab_path'] = 'vocab.pkl' 40 | return model_config 41 | 42 | 43 | model_config = set_dynamic_hparams() 44 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/create_vocab.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | class Vocabulary(object): 7 | def __init__(self): 8 | self.word2index = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3} 9 | self.word2count = {} 10 | self.index2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'} 11 | self.size = 4 # Count PAD, SOS, EOS and UNK 12 | 13 | def add_sentence(self, sentence): 14 | for word in sentence.split(): 15 | self.add_word(word) 16 | 17 | def add_word(self, word): 18 | if word not in self.word2index: 19 | self.word2index[word] = self.size 20 | self.word2count[word] = 1 21 | self.index2word[self.size] = word 22 | self.size += 1 23 | else: 24 | self.word2count[word] += 1 25 | 26 | 27 | def create_vocab(file_dir='../../data/t2e/'): 28 | print('Loading corpus...') 29 | texts = [] 30 | for mode in ['train', 'test']: 31 | texts += list(pd.read_csv('{}text_{}.csv'.format(file_dir, mode))['transcription']) 32 | 33 | print("Building vocab...") 34 | vocab = Vocabulary() 35 | 36 | for text in texts: 37 | vocab.add_sentence(text) 38 | 39 | print("Total words in vocab: {}".format(vocab.size)) 40 | with open('vocab.pkl', 'wb') as f: 41 | pickle.dump(vocab, f) 42 | 43 | print('Generating word embeddings') 44 | return vocab 45 | 46 | 47 | if __name__ == '__main__': 48 | create_vocab() 49 | -------------------------------------------------------------------------------- /src/build_audio_vectors.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script should be run AFTER extract_emotion_labels.py. It uses 3 | the csv file created in the previous step to split the original wav files into 4 | multiple smaller frames, each piece containing an emotion.
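For each labelled segment, the start and end times from the csv are converted to sample indices at the chosen sampling rate before slicing the session-level wav vector.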
5 | 6 | Run this script from root as python src/build_audio_vectors.py 7 | """ 8 | 9 | import os 10 | import math 11 | import pickle 12 | import librosa 13 | import pandas as pd 14 | from tqdm import tqdm 15 | 16 | 17 | def process_session(iemocap_dir, labels_df, sr, sess): 18 | """ 19 | saves audio_vectors dict in a pickle file which contains vectors 20 | for audio files in session `sess` 21 | 22 | process_session: Str pd.DataFrame Nat Int -> None 23 | """ 24 | audio_vectors = {} 25 | wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess) 26 | orig_wav_files = os.listdir(wav_file_path) 27 | for orig_wav_file in tqdm(orig_wav_files): 28 | try: 29 | orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, 30 | sr=sr) 31 | orig_wav_file, file_format = orig_wav_file.split('.') 32 | for index, row in labels_df[labels_df['wav_file'].str.contains( 33 | orig_wav_file)].iterrows(): 34 | start_time, end_time, truncated_wav_file_name, = \ 35 | row['start_time'], row['end_time'], row['wav_file'] 36 | start_frame = math.floor(start_time * sr) 37 | end_frame = math.floor(end_time * sr) 38 | truncated_wav_vector = orig_wav_vector[start_frame:end_frame+1] 39 | audio_vectors[truncated_wav_file_name] = truncated_wav_vector 40 | except Exception as e: 41 | print('An exception occured for {}'.format(orig_wav_file)) 42 | with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f: 43 | pickle.dump(audio_vectors, f) 44 | 45 | 46 | def main(): 47 | sampling_rate = 44100 48 | iemocap_dir = 'data/IEMOCAP_full_release/' 49 | labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv') 50 | for sess in range(1, 6): 51 | # Note that compiling this way will take too much time So you might 52 | # consider parallelizing this process 53 | process_session(iemocap_dir, labels_df, sampling_rate, sess) 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /src/prepare_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script preprocesses data and prepares data to be actually used in training 3 | """ 4 | import re 5 | import os 6 | import pickle 7 | import unicodedata 8 | import pandas as pd 9 | from sklearn.preprocessing import MinMaxScaler 10 | from sklearn.model_selection import train_test_split 11 | 12 | 13 | def unicodeToAscii(s): 14 | return ''.join( 15 | c for c in unicodedata.normalize('NFD', s) 16 | if unicodedata.category(c) != 'Mn' 17 | ) 18 | 19 | 20 | def normalizeString(s): 21 | """ 22 | Lowercase, trim, and remove non-letter characters 23 | """ 24 | s = unicodeToAscii(s.lower().strip()) 25 | s = re.sub(r"([.!?])", r" \1", s) 26 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 27 | return s 28 | 29 | 30 | def transcribe_sessions(): 31 | file2transcriptions = {} 32 | useful_regex = re.compile(r'^(\w+)', re.IGNORECASE) 33 | transcript_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/' 34 | for sess in range(1, 6): 35 | transcript_path = transcript_path.format(sess) 36 | for f in os.listdir(transcript_path): 37 | with open('{}{}'.format(transcript_path, f), 'r') as f: 38 | all_lines = f.readlines() 39 | 40 | for l in all_lines: 41 | audio_code = useful_regex.match(l).group() 42 | transcription = l.split(':')[-1].strip() 43 | # assuming that all the keys would be unique and hence no `try` 44 | file2transcriptions[audio_code] = transcription 45 | with open('data/t2e/audiocode2text.pkl', 'wb') as file: 46 | 
pickle.dump(file2transcriptions, file) 47 | return file2transcriptions 48 | 49 | 50 | def prepare_text_data(audiocode2text): 51 | # Prepare text data 52 | text_train = pd.DataFrame() 53 | text_train['wav_file'] = x_train['wav_file'] 54 | text_train['label'] = x_train['label'] 55 | text_train['transcription'] = [normalizeString(audiocode2text[code]) 56 | for code in x_train['wav_file']] 57 | 58 | text_test = pd.DataFrame() 59 | text_test['wav_file'] = x_test['wav_file'] 60 | text_test['label'] = x_test['label'] 61 | text_test['transcription'] = [normalizeString(audiocode2text[code]) 62 | for code in x_test['wav_file']] 63 | 64 | text_train.to_csv('data/t2e/text_train.csv', index=False) 65 | text_test.to_csv('data/t2e/text_test.csv', index=False) 66 | 67 | print(text_train.shape, text_test.shape) 68 | 69 | 70 | def main(): 71 | prepare_text_data(transcribe_sessions()) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | from config import model_config as config 5 | 6 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 7 | 8 | import itertools 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | def load_data(batched=True, test=False, file_dir='../../data/s2e/'): 13 | bs = config['batch_size'] 14 | ftype = 'test' if test else 'train' 15 | df = pd.read_csv('{}modified_df_{}.csv'.format(file_dir, ftype)) 16 | # 0th index in label, rest all are features 17 | data = (np.array(df[df.columns[1:]]), np.array(df[df.columns[0]])) 18 | if test or not batched: 19 | return [torch.FloatTensor(data[0]), torch.LongTensor(data[1])] 20 | data = list(zip(data[0], data[1])) 21 | n_iters = len(data) // bs 22 | batches = [] 23 | for i in range(1, n_iters + 1): 24 | input_batch = [] 25 | output_batch = [] 26 | for e in data[bs * (i-1):bs * i]: 27 | input_batch.append(e[0]) 28 | output_batch.append(e[1]) 29 | batches.append([torch.FloatTensor(input_batch), 30 | torch.LongTensor(output_batch)]) 31 | return batches 32 | 33 | 34 | def evaluate(targets, predictions): 35 | performance = { 36 | 'acc': accuracy_score(targets, predictions), 37 | 'f1': f1_score(targets, predictions, average='macro'), 38 | 'precision': precision_score(targets, predictions, average='macro'), 39 | 'recall': recall_score(targets, predictions, average='macro')} 40 | return performance 41 | 42 | 43 | def plot_confusion_matrix(targets, predictions, classes, 44 | normalize=False, 45 | title='Confusion matrix', 46 | cmap=plt.cm.Blues): 47 | """ 48 | This function prints and plots the confusion matrix. 49 | Normalization can be applied by setting `normalize=True`. 50 | """ 51 | # plt.figure(figsize=(8,8)) 52 | cm = confusion_matrix(targets, predictions) 53 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 54 | plt.title(title) 55 | plt.colorbar() 56 | tick_marks = np.arange(len(classes)) 57 | plt.xticks(tick_marks, classes, rotation=45) 58 | plt.yticks(tick_marks, classes) 59 | 60 | if normalize: 61 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 62 | print("Normalized confusion matrix") 63 | else: 64 | print('Confusion matrix, without normalization') 65 | 66 | print(cm) 67 | 68 | thresh = cm.max() / 2. 
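    # cells whose count exceeds half of the maximum are drawn with white text in the loop below; lighter cells keep black text for readability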
69 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 70 | plt.text(j, i, cm[i, j], 71 | horizontalalignment="center", 72 | color="white" if cm[i, j] > thresh else "black") 73 | 74 | plt.tight_layout() 75 | plt.ylabel('True label') 76 | plt.xlabel('Predicted label') 77 | -------------------------------------------------------------------------------- /lstm_classifier/combined/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import pandas as pd 5 | from config import model_config as config 6 | 7 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 8 | 9 | import itertools 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def load_data(batched=True, test=False, file_dir='../../data/combined/combined_features.pkl'): 14 | bs = config['batch_size'] 15 | ftype = 'test' if test else 'train' 16 | 17 | with open('{}'.format(file_dir), 'rb') as f: 18 | features = pickle.load(f) 19 | 20 | x = features['x_{}'.format(ftype)] 21 | y = features['y_{}'.format(ftype)] 22 | data = (x, y) 23 | if test or not batched: 24 | return [torch.FloatTensor(data[0]), torch.LongTensor(data[1])] 25 | data = list(zip(data[0], data[1])) 26 | n_iters = len(data) // bs 27 | batches = [] 28 | for i in range(1, n_iters + 1): 29 | input_batch = [] 30 | output_batch = [] 31 | for e in data[bs * (i-1):bs * i]: 32 | input_batch.append(e[0]) 33 | output_batch.append(e[1]) 34 | batches.append([torch.FloatTensor(input_batch), 35 | torch.LongTensor(output_batch)]) 36 | return batches 37 | 38 | 39 | def evaluate(targets, predictions): 40 | performance = { 41 | 'acc': accuracy_score(targets, predictions), 42 | 'f1': f1_score(targets, predictions, average='macro'), 43 | 'precision': precision_score(targets, predictions, average='macro'), 44 | 'recall': recall_score(targets, predictions, average='macro')} 45 | return performance 46 | 47 | 48 | def plot_confusion_matrix(targets, predictions, classes, 49 | normalize=False, 50 | title='Confusion matrix', 51 | cmap=plt.cm.Blues): 52 | """ 53 | This function prints and plots the confusion matrix. 54 | Normalization can be applied by setting `normalize=True`. 55 | """ 56 | # plt.figure(figsize=(8,8)) 57 | cm = confusion_matrix(targets, predictions) 58 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 59 | plt.title(title) 60 | plt.colorbar() 61 | tick_marks = np.arange(len(classes)) 62 | plt.xticks(tick_marks, classes, rotation=45) 63 | plt.yticks(tick_marks, classes) 64 | 65 | if normalize: 66 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 67 | print("Normalized confusion matrix") 68 | else: 69 | print('Confusion matrix, without normalization') 70 | 71 | print(cm) 72 | 73 | thresh = cm.max() / 2. 
74 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 75 | plt.text(j, i, cm[i, j], 76 | horizontalalignment="center", 77 | color="white" if cm[i, j] > thresh else "black") 78 | 79 | plt.tight_layout() 80 | plt.ylabel('True label') 81 | plt.xlabel('Predicted label') 82 | -------------------------------------------------------------------------------- /src/extract_emotion_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script parses the dataset, extracts the labels and stores them in one place 3 | 4 | Run this script from root as python src/extract_emotion_labels.py 5 | """ 6 | 7 | import re 8 | import os 9 | import pandas as pd 10 | 11 | 12 | def extract_info(): 13 | """ 14 | returns info_dict containing important info from the IEMOCAP dataset 15 | such as start time, end time, emotion labels etc. 16 | 17 | extract_info: None -> Dict 18 | """ 19 | info_dict = {'start_times': [], 'end_times': [], 'wav_file_names': [], 20 | 'emotions': [], 'vals': [], 'acts': [], 'doms': []} 21 | 22 | # regex used to identify useful info in the dataset files 23 | info_line = re.compile(r'\[.+\]\n', re.IGNORECASE) 24 | for sess in range(1, 6): 25 | emo_evaluation_dir = 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess) 26 | # Only include the session files 27 | evaluation_files = [l for l in os.listdir(emo_evaluation_dir) 28 | if 'Ses' in l] 29 | for file in evaluation_files: 30 | with open(emo_evaluation_dir + file) as f: 31 | content = f.read() 32 | # grab the important stuff 33 | info_lines = re.findall(info_line, content) 34 | for line in info_lines[1:]: # skipping the first header line 35 | # Refer to the dataset to see what `line` looks like 36 | start_end_time, wav_file_name, emotion, val_act_dom = \ 37 | line.strip().split('\t') 38 | start_time, end_time = start_end_time[1:-1].split('-') 39 | val, act, dom = val_act_dom[1:-1].split(',') 40 | val, act, dom = float(val), float(act), float(dom) 41 | start_time, end_time = float(start_time), float(end_time) 42 | info_dict['start_times'].append(start_time) 43 | info_dict['end_times'].append(end_time) 44 | info_dict['wav_file_names'].append(wav_file_name) 45 | info_dict['emotions'].append(emotion) 46 | info_dict['vals'].append(val) 47 | info_dict['acts'].append(act) 48 | info_dict['doms'].append(dom) 49 | return info_dict 50 | 51 | 52 | def compile_dataset(info_dict): 53 | """ 54 | creates a csv file from info_dict which will serve as the dataset 55 | 56 | compile_dataset: Dict -> None 57 | """ 58 | df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom']) 59 | 60 | df_iemocap['start_time'] = info_dict['start_times'] 61 | df_iemocap['end_time'] = info_dict['end_times'] 62 | df_iemocap['wav_file'] = info_dict['wav_file_names'] 63 | df_iemocap['emotion'] = info_dict['emotions'] 64 | df_iemocap['val'] = info_dict['vals'] 65 | df_iemocap['act'] = info_dict['acts'] 66 | df_iemocap['dom'] = info_dict['doms'] 67 | # Finally, save to a file 68 | df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False) 69 | 70 | 71 | def main(): 72 | compile_dataset(extract_info()) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /src/extract_audio_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script extracts features from existing audio vectors 3 | """ 4 | 5 | import os 6 |
import math 7 | import random 8 | import pickle 9 | import librosa 10 | import numpy as np 11 | import pandas as pd 12 | from tqdm import tqdm 13 | 14 | 15 | def add_session_data(df_features, labels_df, emotion_dict, audio_vectors_path): 16 | audio_vectors = pickle.load(open(audio_vectors_path, 'rb')) 17 | for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains( 18 | 'Ses0{}'.format(sess))].iterrows()): 19 | try: 20 | wav_file_name = row['wav_file'] 21 | label = emotion_dict[row['emotion']] 22 | y = audio_vectors[wav_file_name] 23 | 24 | feature_list = [wav_file_name, label] # wav_file, label 25 | sig_mean = np.mean(abs(y)) 26 | feature_list.append(sig_mean) # sig_mean 27 | feature_list.append(np.std(y)) # sig_std 28 | 29 | rmse = librosa.feature.rmse(y + 0.0001)[0] 30 | feature_list.append(np.mean(rmse)) # rmse_mean 31 | feature_list.append(np.std(rmse)) # rmse_std 32 | 33 | silence = 0 34 | for e in rmse: 35 | if e <= 0.4 * np.mean(rmse): 36 | silence += 1 37 | silence /= float(len(rmse)) 38 | feature_list.append(silence) # silence 39 | 40 | y_harmonic = librosa.effects.hpss(y)[0] 41 | feature_list.append(np.mean(y_harmonic) * 1000) # harmonic (scaled by 1000) 42 | 43 | # based on the pitch detection algorithm mentioned here: 44 | # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001 45 | cl = 0.45 * sig_mean 46 | center_clipped = [] 47 | for s in y: 48 | if s >= cl: 49 | center_clipped.append(s - cl) 50 | elif s <= -cl: 51 | center_clipped.append(s + cl) 52 | elif np.abs(s) < cl: 53 | center_clipped.append(0) 54 | auto_corrs = librosa.core.autocorrelate(np.array(center_clipped)) 55 | feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs)) # auto_corr_max (scaled by 1000) 56 | feature_list.append(np.std(auto_corrs)) # auto_corr_std 57 | 58 | df_features = df_features.append(pd.DataFrame(feature_list, index=columns).transpose(), ignore_index=True) 59 | except Exception as e: 60 | print('Some exception occured: {}'.format(e)) 61 | 62 | 63 | def main(): 64 | emotion_dict = {'ang': 0, 'hap': 1, 'exc': 2, 'sad': 3, 'fru': 4, 'fea': 5, 65 | 'sur': 6, 'neu': 7, 'xxx': 8, 'oth': 8} 66 | 67 | data_dir = 'data/pre-processed/' 68 | labels_path = '{}df_iemocap.csv'.format(data_dir) 69 | audio_vectors_path = '{}audio_vectors_'.format(data_dir) 70 | columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 71 | 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 72 | 'auto_corr_std'] 73 | df_features = pd.DataFrame(columns=columns) 74 | labels_df = pd.read_csv(labels_path) 75 | for sess in range(1, 6): 76 | add_session_data(df_features, labels_df, emotion_dict, 77 | '{}{}.pkl'.format(audio_vectors_path, sess)) 78 | df_features.to_csv('data/pre-processed/audio_features.csv', index=False) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /lstm_classifier/s2e/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import pickle 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import optim 7 | import torch.nn.functional as F 8 | from utils import load_data, evaluate, plot_confusion_matrix 9 | 10 | from config import model_config as config 11 | 12 | 13 | class LSTMClassifier(nn.Module): 14 | """docstring for LSTMClassifier""" 15 | def __init__(self, config): 16 | super(LSTMClassifier, self).__init__() 17 | self.n_layers = config['n_layers'] 18 | self.input_dim = config['input_dim'] 19 | 
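# for the s2e model, input_dim is 8: the hand-crafted audio features produced by src/extract_audio_features.py (signal mean/std, rmse stats, silence ratio, harmonic mean, autocorrelation stats)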
self.hidden_dim = config['hidden_dim'] 20 | self.output_dim = config['output_dim'] 21 | self.bidirectional = config['bidirectional'] 22 | self.dropout = config['dropout'] if self.n_layers > 1 else 0 23 | 24 | self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, bias=True, 25 | num_layers=2, dropout=self.dropout, 26 | bidirectional=self.bidirectional) 27 | self.out = nn.Linear(self.hidden_dim, self.output_dim) 28 | self.softmax = F.softmax 29 | 30 | def forward(self, input_seq): 31 | # input_seq =. [1, batch_size, input_size] 32 | rnn_output, (hidden, _) = self.rnn(input_seq) 33 | if self.bidirectional: # sum outputs from the two directions 34 | rnn_output = rnn_output[:, :, :self.hidden_dim] +\ 35 | rnn_output[:, :, self.hidden_dim:] 36 | class_scores = F.softmax(self.out(rnn_output[0]), dim=1) 37 | return class_scores 38 | 39 | 40 | if __name__ == '__main__': 41 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 42 | 43 | device = 'cuda:{}'.format(config['gpu']) if \ 44 | torch.cuda.is_available() else 'cpu' 45 | 46 | model = LSTMClassifier(config) 47 | model = model.to(device) 48 | criterion = nn.CrossEntropyLoss() 49 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 50 | 51 | train_batches = load_data() 52 | test_pairs = load_data(test=True) 53 | 54 | best_acc = 0 55 | for epoch in range(config['n_epochs']): 56 | losses = [] 57 | for batch in train_batches: 58 | inputs = batch[0].unsqueeze(0) # frame in format as expected by model 59 | targets = batch[1] 60 | inputs = inputs.to(device) 61 | targets = targets.to(device) 62 | 63 | model.zero_grad() 64 | optimizer.zero_grad() 65 | 66 | predictions = model(inputs) 67 | predictions = predictions.to(device) 68 | 69 | loss = criterion(predictions, targets) 70 | loss.backward() 71 | optimizer.step() 72 | losses.append(loss.item()) 73 | 74 | # evaluate 75 | with torch.no_grad(): 76 | inputs = test_pairs[0].unsqueeze(0) 77 | targets = test_pairs[1] 78 | 79 | inputs = inputs.to(device) 80 | targets = targets.to(device) 81 | 82 | predictions = torch.argmax(model(inputs), dim=1) # take argmax to get class id 83 | predictions = predictions.to(device) 84 | 85 | # evaluate on cpu 86 | targets = np.array(targets.cpu()) 87 | predictions = np.array(predictions.cpu()) 88 | 89 | # Get results 90 | # plot_confusion_matrix(targets, predictions, 91 | # classes=emotion_dict.keys()) 92 | performance = evaluate(targets, predictions) 93 | if performance['acc'] > best_acc: 94 | best_acc = performance['acc'] 95 | print(performance) 96 | # save model and results 97 | torch.save({ 98 | 'model': model.state_dict(), 99 | 'optimizer': optimizer.state_dict() 100 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 101 | 102 | with open('results/{}-best_performance.pkl'.format(config['model_code']), 'wb') as f: 103 | pickle.dump(performance, f) 104 | -------------------------------------------------------------------------------- /lstm_classifier/combined/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import pickle 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import optim 7 | import torch.nn.functional as F 8 | from utils import load_data, evaluate, plot_confusion_matrix 9 | 10 | from config import model_config as config 11 | 12 | 13 | class LSTMClassifier(nn.Module): 14 | """docstring for LSTMClassifier""" 15 | def __init__(self, config): 16 | super(LSTMClassifier, self).__init__() 17 | self.n_layers = 
config['n_layers'] 18 | self.dropout = config['dropout'] if self.n_layers > 1 else 0 19 | self.input_dim = config['input_dim'] 20 | self.hidden_dim = config['hidden_dim'] 21 | self.output_dim = config['output_dim'] 22 | self.bidirectional = config['bidirectional'] 23 | 24 | self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, bias=True, 25 | num_layers=self.n_layers, dropout=self.dropout, 26 | bidirectional=self.bidirectional) 27 | self.out = nn.Linear(self.hidden_dim, self.output_dim) 28 | self.softmax = F.softmax 29 | 30 | def forward(self, input_seq): 31 | # input_seq =. [1, batch_size, input_size] 32 | rnn_output, (hidden, _) = self.rnn(input_seq) 33 | if self.bidirectional: # sum outputs from the two directions 34 | rnn_output = rnn_output[:, :, :self.hidden_dim] +\ 35 | rnn_output[:, :, self.hidden_dim:] 36 | class_scores = F.softmax(self.out(rnn_output[0]), dim=1) 37 | return class_scores 38 | 39 | 40 | if __name__ == '__main__': 41 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 42 | 43 | device = 'cuda:{}'.format(config['gpu']) if \ 44 | torch.cuda.is_available() else 'cpu' 45 | 46 | model = LSTMClassifier(config) 47 | model = model.to(device) 48 | criterion = nn.CrossEntropyLoss() 49 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 50 | 51 | train_batches = load_data() 52 | test_pairs = load_data(test=True) 53 | 54 | best_acc = 0 55 | for epoch in range(config['n_epochs']): 56 | losses = [] 57 | for batch in train_batches: 58 | inputs = batch[0].unsqueeze(0) # frame in format as expected by model 59 | targets = batch[1] 60 | inputs = inputs.to(device) 61 | targets = targets.to(device) 62 | 63 | model.zero_grad() 64 | optimizer.zero_grad() 65 | 66 | predictions = model(inputs) 67 | predictions = predictions.to(device) 68 | 69 | loss = criterion(predictions, targets) 70 | loss.backward() 71 | optimizer.step() 72 | losses.append(loss.item()) 73 | 74 | # evaluate 75 | with torch.no_grad(): 76 | inputs = test_pairs[0].unsqueeze(0) 77 | targets = test_pairs[1] 78 | 79 | inputs = inputs.to(device) 80 | targets = targets.to(device) 81 | 82 | predictions = torch.argmax(model(inputs), dim=1) # take argmax to get class id 83 | predictions = predictions.to(device) 84 | 85 | # evaluate on cpu 86 | targets = np.array(targets.cpu()) 87 | predictions = np.array(predictions.cpu()) 88 | 89 | # Get results 90 | # plot_confusion_matrix(targets, predictions, 91 | # classes=emotion_dict.keys()) 92 | performance = evaluate(targets, predictions) 93 | if performance['acc'] > best_acc: 94 | print(performance) 95 | best_acc = performance['acc'] 96 | # save model and results 97 | torch.save({ 98 | 'model': model.state_dict(), 99 | 'optimizer': optimizer.state_dict() 100 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 101 | 102 | with open('results/{}-best_performance.pkl'.format(config['model_code']), 'wb') as f: 103 | pickle.dump(performance, f) 104 | -------------------------------------------------------------------------------- /lstm_classifier/t2e/lstm_classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | import torch.nn as nn 5 | from torch import optim 6 | import torch.nn.functional as F 7 | 8 | from config import model_config as config 9 | from utils import load_data, evaluate, load_word_embeddings, plot_confusion_matrix 10 | 11 | 12 | class LSTMClassifier(nn.Module): 13 | """docstring for LSTMClassifier""" 14 | def __init__(self, 
config): 15 | super(LSTMClassifier, self).__init__() 16 | self.dropout = config['dropout'] 17 | self.n_layers = config['n_layers'] 18 | self.hidden_dim = config['hidden_dim'] 19 | self.output_dim = config['output_dim'] 20 | self.vocab_size = config['vocab_size'] 21 | self.embedding_dim = config['embedding_dim'] 22 | self.bidirectional = config['bidirectional'] 23 | 24 | self.embedding = nn.Embedding.from_pretrained( 25 | load_word_embeddings(), freeze=False) 26 | 27 | self.rnn = nn.LSTM(self.embedding_dim, self.hidden_dim, bias=True, 28 | num_layers=self.n_layers, dropout=self.dropout, 29 | bidirectional=self.bidirectional) 30 | self.n_directions = 2 if self.bidirectional else 1 31 | self.out = nn.Linear(self.n_directions * self.hidden_dim, self.output_dim) 32 | self.softmax = F.softmax 33 | 34 | def forward(self, input_seq, input_lengths): 35 | max_seq_len, bs = input_seq.size() 36 | # input_seq =. [max_seq_len, batch_size] 37 | embedded = self.embedding(input_seq) 38 | 39 | rnn_output, (hidden, _) = self.rnn(embedded) 40 | rnn_output = torch.cat((rnn_output[-1, :, :self.hidden_dim], 41 | rnn_output[0, :, self.hidden_dim:]), dim=1) 42 | # sum hidden states 43 | class_scores = F.softmax(self.out(rnn_output), dim=1) 44 | return class_scores 45 | 46 | 47 | if __name__ == '__main__': 48 | emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 49 | 50 | device = 'cuda:{}'.format(config['gpu']) if \ 51 | torch.cuda.is_available() else 'cpu' 52 | 53 | model = LSTMClassifier(config) 54 | model = model.to(device) 55 | criterion = nn.NLLLoss() 56 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 57 | 58 | train_batches = load_data() 59 | test_batch = load_data(test=True) 60 | 61 | best_acc = 0 62 | for epoch in range(config['n_epochs']): 63 | losses = [] 64 | for batch in train_batches: 65 | inputs, input_lengths, targets = batch 66 | inputs = inputs.to(device) 67 | input_lengths = input_lengths.to(device) 68 | targets = targets.to(device) 69 | 70 | model.zero_grad() 71 | optimizer.zero_grad() 72 | 73 | predictions = model(inputs, input_lengths) 74 | predictions = predictions.to(device) 75 | 76 | loss = criterion(predictions, targets) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | losses.append(loss.item()) 81 | 82 | # evaluate 83 | with torch.no_grad(): 84 | inputs, lengths, targets = test_batch 85 | 86 | inputs = inputs.to(device) 87 | lengths = lengths.to(device) 88 | targets = targets.to(device) 89 | 90 | predictions = torch.argmax(model(inputs, lengths), dim=1) # take argmax to get class id 91 | predictions = predictions.to(device) 92 | 93 | # evaluate on cpu 94 | targets = np.array(targets.cpu()) 95 | predictions = np.array(predictions.cpu()) 96 | 97 | # Get results 98 | # plot_confusion_matrix(targets, predictions, 99 | # classes=emotion_dict.keys()) 100 | performance = evaluate(targets, predictions) 101 | if performance['acc'] > best_acc: 102 | best_acc = performance['acc'] 103 | # save model and results 104 | torch.save({ 105 | 'model': model.state_dict(), 106 | 'optimizer': optimizer.state_dict() 107 | }, 'runs/{}-best_model.pth'.format(config['model_code'])) 108 | 109 | with open('results/{}-best_performance.pkl'.format( 110 | config['model_code']), 'wb') as f: 111 | pickle.dump(performance, f) 112 | -------------------------------------------------------------------------------- /2_build_audio_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Build Audio Vectors\n", 8 | "Now that the labels have been extracted, we'll use the compiled csv (df_iemocap.csv) to split the original wav files into multiple frames" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Try for one file first\n", 18 | "import librosa\n", 19 | "import os\n", 20 | "import soundfile as sf\n", 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import matplotlib.style as ms\n", 24 | "from tqdm import tqdm\n", 25 | "import pickle\n", 26 | "\n", 27 | "import IPython.display\n", 28 | "import librosa.display\n", 29 | "ms.use('seaborn-muted')\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "(array([ 0.42572615, 0.48587543, 0.37312022, ..., -0.31514615,\n", 42 | " -0.16263676, 0. ], dtype=float32), 44100)" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "file_path = 'data/IEMOCAP_full_release/Session1/dialog/wav/Ses01F_impro01.wav'\n", 52 | "\n", 53 | "y, sr = librosa.load(file_path, sr=44100)\n", 54 | "y, sr" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Loop through all the files" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import pandas as pd\n", 71 | "import math\n", 72 | "\n", 73 | "labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv')\n", 74 | "iemocap_dir = 'data/IEMOCAP_full_release/'" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "The following cells take some time until completely executed" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "100%|██████████| 31/31 [05:11<00:00, 8.83s/it]\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "sr = 44100\n", 99 | "audio_vectors = {}\n", 100 | "for sess in [5]: # using one session due to memory constraint, can replace [5] with range(1, 6)\n", 101 | " wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)\n", 102 | " orig_wav_files = os.listdir(wav_file_path)\n", 103 | " for orig_wav_file in tqdm(orig_wav_files):\n", 104 | " try:\n", 105 | " orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)\n", 106 | " orig_wav_file, file_format = orig_wav_file.split('.')\n", 107 | " for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():\n", 108 | " start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row['end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']\n", 109 | " start_frame = math.floor(start_time * sr)\n", 110 | " end_frame = math.floor(end_time * sr)\n", 111 | " truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]\n", 112 | " audio_vectors[truncated_wav_file_name] = truncated_wav_vector\n", 113 | " except:\n", 114 | " print('An exception occured for {}'.format(orig_wav_file))\n", 115 | " with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f:\n", 116 | " pickle.dump(audio_vectors, f)" 117 | ] 118 | }, 119 | { 120 | 
"cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.9" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal Speech Emotion Recognition and Ambiguity Resolution 2 | 3 | ## Overview 4 | Identifying emotion from speech is a non-trivial task pertaining to the ambiguous definition of emotion itself. In this work, we build light-weight multimodal machine learning models and compare it against the heavier and less interpretable deep learning counterparts. For both types of models, we use hand-crafted features from a given audio signal. Our experiments show that the light-weight models are comparable to the deep learning baselines and even outperform them in some cases, achieving state-of-the-art performance on the IEMOCAP dataset. 5 | 6 | The hand-crafted feature vectors obtained are used to train two types of models: 7 | 8 | 1. ML-based: Logistic Regression, SVMs, Random Forest, eXtreme Gradient Boosting and Multinomial Naive-Bayes. 9 | 2. DL-based: Multi-Layer Perceptron, LSTM Classifier 10 | 11 | This project was carried as a course project for the course CS 698 - Computational Audio taught by [Prof. Richard Mann](https://cs.uwaterloo.ca/~mannr/) at the University of Waterloo. For a more detailed explanation, please check the [report](https://arxiv.org/abs/1904.06022). 12 | 13 | ## Datasets 14 | The [IEMOCAP](https://link.springer.com/content/pdf/10.1007%2Fs10579-008-9076-6.pdf) dataset was used for all the experiments in this work. Please refer to the [report](https://arxiv.org/abs/1904.06022) for a detailed explanation of pre-processing steps applied to the dataset. 15 | 16 | ## Requirements 17 | All the experiments have been tested using the following libraries: 18 | - xgboost==0.82 19 | - torch==1.0.1.post2 20 | - scikit-learn==0.20.3 21 | - numpy==1.16.2 22 | - jupyter==1.0.0 23 | - pandas==0.24.1 24 | - librosa==0.7.0 25 | 26 | To avoid conflicts, it is recommended to setup a new python virtual environment to install these libraries. Once the env is setup, run `pip install -r requirements.txt` to install the dependencies. 27 | 28 | ## Instructions to run the code 29 | 1. Clone this repository by running `git clone git@github.com:Demfier/multimodal-speech-emotion-recognition`. 30 | 2. Go to the root directory of this project by running `cd multimodal-speech-emotion-recognition/` in your terminal. 31 | 3. Start a jupyter notebook by running `jupyter notebook` from the root of this project. 32 | 4. Run `1_extract_emotion_labels.ipynb` to extract labels from transriptions and compile other required data into a csv. 33 | 5. Run `2_build_audio_vectors.ipynb` to build vectors from the original wav files and save into a pickle file 34 | 6. Run `3_extract_audio_features.ipynb` to extract 8-dimensional audio feature vectors for the audio vectors 35 | 7. 
Run `4_prepare_data.ipynb` to preprocess and prepare audio + video data for experiments 36 | 8. It is recommended to train `LSTMClassifier` before running any other experiments for easy comparsion with other models later on: 37 | - Change `config.py` for any of the experiment settings. For instance, if you want to train a speech2emotion classifier, make necessary changes to `lstm_classifier/s2e/config.py`. Similar procedure follows for training text2emotion (`t2e`) and text+speech2emotion (`combined`) classifiers. 38 | - Run `python lstm_classifier.py` from `lstm_classifier/{exp_mode}` to train an LSTM classifier for the respective experiment mode (possible values of `exp_mode: s2e/t2e/combined`) 39 | 9. Run `5_audio_classification.ipynb` to train ML classifiers for audio 40 | 10. Run `5.1_sentence_classification.ipynb` to train ML classifiers for text 41 | 11. Run `5.2_combined_classification.ipynb` to train ML classifiers for audio+text 42 | 43 | **Note:** Make sure to include correct model paths in the notebooks as not everything is relative right now and it needs some refactoring 44 | 45 | **UPDATE**: You can access the preprocessed data files here to skip the steps 4-7: [https://www.dropbox.com/scl/fo/jdzz2y9nngw9rxsbz9vyj/h?rlkey=bji7zcqclusagzfwa7alm59hx&dl=0](https://www.dropbox.com/scl/fo/jdzz2y9nngw9rxsbz9vyj/h?rlkey=bji7zcqclusagzfwa7alm59hx&dl=0) 46 | 47 | ## Results 48 | Accuracy, F-score, Precision and Recall has been reported for the different experiments. 49 | 50 | **Audio** 51 | 52 | Models | Accuracy | F1 | Precision | Recall 53 | ---|---|---|---|--- 54 | RF | 56.0 | **56.0** | 57.2 | **57.3** 55 | XGB | 55.6 | **56.0** | 56.9 | 56.8 56 | SVM | 33.7 | 15.2 | 17.4 | 21.5 57 | MNB | 31.3 | 9.1 | 19.6 | 17.2 58 | LR | 33.4 | 14.9 | 17.8 | 20.9 59 | MLP | 41.0 | 36.5 | 42.2 | 35.9 60 | LSTM | 43.6 | 43.4 | 53.2 | 40.6 61 | ARE (4-class) | 56.3 | - | 54.6 | - 62 | E1 (4-class) | 56.2 | 45.9 | **67.6** | 48.9 63 | **E1** | **56.6** | 55.7 | 57.3 | **57.3** 64 | 65 | E1: Ensemble (RF + XGB + MLP) 66 | 67 | **Text** 68 | 69 | Models | Accuracy | F1 | Precision | Recall 70 | ---|---|---|---|--- 71 | RF | 62.2 | 60.8 | 65.0 | 62.0 72 | XGB | 56.9 | 55.0 | 70.3 | 51.8 73 | SVM | 62.1 | 61.7 | 62.5 | **63.5** 74 | MNB | 61.9 | 62.1 | **71.8** | 58.6 75 | LR | 64.2 | 64.3 | 69.5 | 62.3 76 | MLP | 60.6 | 61.5 | 62.4 | 63.0 77 | LSTM | 63.1 | 62.5 | 65.3 | 62.8 78 | TRE (4-class) | **65.5** | - | 63.5 | - 79 | E1 (4-class) | 63.1 | 61.4 | **67.7** | 59.0 80 | **E2** | 64.9 | **66.0** | 71.4 | 63.2 81 | 82 | E2: Ensemble (RF + XGB + MLP + MNB + LR) 83 | E1: Ensemble (RF + XGB + MLP) 84 | 85 | **Audio + Text** 86 | 87 | Models | Accuracy | F1 | Precision | Recall 88 | ---|---|---|---|--- 89 | RF | 65.3 | 65.8 | 69.3 | 65.5 90 | XGB | 62.2 | 63.1 | 67.9 | 61.7 91 | SVM | 63.4 | 63.8 | 63.1 | 65.6 92 | MNB | 60.5 | 60.3 | 70.3 | 57.1 93 | MLP | 66.1 | 68.1 | 68.0 | 69.6 94 | LR | 63.2 | 63.7 | 66.9 | 62.3 95 | LSTM | 64.2 | 64.7 | 66.1 | 65.0 96 | MDRE (4-class) | **75.3** | - | 71.8 | - 97 | E1 (4-class) | 70.3 | 67.5 | **73.2** | 65.5 98 | **E2** | 70.1 | **71.8** | 72.9 | **71.5** 99 | 100 | For more details, please refer to the [report](https://arxiv.org/abs/1904.06022) 101 | 102 | ## Citation 103 | If you find this work useful, please cite: 104 | 105 | ``` 106 | @article{sahu2019multimodal, 107 | title={Multimodal Speech Emotion Recognition and Ambiguity Resolution}, 108 | author={Sahu, Gaurav}, 109 | journal={arXiv preprint arXiv:1904.06022}, 110 | year={2019} 111 | } 112 | ``` 113 | 
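
## Example: fusing saved prediction probabilities

The `predict_probas.py` scripts under `lstm_classifier/t2e` and `lstm_classifier/combined` dump class-probability arrays (one row per test utterance, six columns) into `pred_probas/`. The sketch below shows one way such files could be combined; the equal-weight averaging and the `y_test.pkl` labels file are illustrative assumptions rather than the exact ensembling performed in the notebooks, and it assumes both arrays follow the same test-set ordering.

```python
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Arrays of shape [n_test_samples, n_classes] written by the predict_probas.py scripts
with open('pred_probas/text_lstm_classifier.pkl', 'rb') as f:
    text_probas = pickle.load(f)
with open('pred_probas/combined_lstm_classifier.pkl', 'rb') as f:
    combined_probas = pickle.load(f)

# Hypothetical pickle holding the matching test labels; adapt the path to
# wherever your test targets actually live.
with open('pred_probas/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

# Simple late fusion: average the class probabilities and take the argmax
fused_preds = np.argmax((text_probas + combined_probas) / 2.0, axis=1)
print('acc:', accuracy_score(y_test, fused_preds))
print('f1 :', f1_score(y_test, fused_preds, average='macro'))
```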
-------------------------------------------------------------------------------- /lstm_classifier/t2e/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pickle 4 | import gensim 5 | import numpy as np 6 | import pandas as pd 7 | from config import model_config as config 8 | from gensim.scripts.glove2word2vec import glove2word2vec 9 | 10 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 11 | 12 | import itertools 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | def generate_word_embeddings(vocab): 17 | if not os.path.exists('{}gensim.glove.6B.{}d.txt'.format( 18 | config['embeddings_dir'], config['embedding_dim'])): 19 | glove2word2vec(glove_input_file='{}glove.6B.{}d.txt'.format( 20 | config['embeddings_dir'], config['embedding_dim']), 21 | word2vec_output_file='{}gensim.glove.6B.{}d.txt'.format( 22 | config['embeddings_dir'], config['embedding_dim'])) 23 | 24 | embeddings_all = gensim.models.KeyedVectors.load_word2vec_format( 25 | '{}gensim.glove.6B.{}d.txt'.format(config['embeddings_dir'], 26 | config['embedding_dim'])) 27 | print('Loaded original embeddings') 28 | 29 | # initialize word embeddings matrix 30 | combined_word_embeddings = np.zeros((vocab.size, 31 | config['embedding_dim'])) 32 | for index, word in vocab.index2word.items(): 33 | try: 34 | if index < 4: # deal with special tokens 35 | combined_word_embeddings[index] = np.random.normal( 36 | size=(config['embedding_dim'], )) 37 | continue 38 | combined_word_embeddings[index] = embeddings_all[word] 39 | except KeyError as e: 40 | print('KeyError triggered for {}'.format(word)) 41 | combined_word_embeddings[index] = np.random.normal( 42 | size=(config['embedding_dim'], )) 43 | print('Created combined + filtered embeddings.') 44 | with open('{}saved_{}d_word_embeddings.pkl'.format( 45 | config['embeddings_dir'], config['embedding_dim']), 'wb') as f: 46 | pickle.dump(combined_word_embeddings, f) 47 | combined_word_embeddings = torch.from_numpy(combined_word_embeddings).float() 48 | return combined_word_embeddings 49 | 50 | 51 | def load_word_embeddings(): 52 | with open('{}saved_{}d_word_embeddings.pkl'.format( 53 | config['embeddings_dir'], config['embedding_dim']), 'rb') as f: 54 | combined_word_embeddings = pickle.load(f) 55 | return torch.from_numpy(combined_word_embeddings).float() 56 | 57 | 58 | def zero_padding(l, fillvalue=config['<PAD>']): 59 | return list(itertools.zip_longest(*l, fillvalue=fillvalue)) 60 | 61 | 62 | def binary_matrix(l, value=config['<PAD>']): 63 | m = [] 64 | for i, seq in enumerate(l): 65 | m.append([]) 66 | for token in seq: 67 | if token == 0: 68 | m[i].append(0) 69 | else: 70 | m[i].append(1) 71 | return m 72 | 73 | 74 | # Returns padded input sequence tensor and lengths 75 | def input_var(l, vocab): 76 | indexes_batch = [indexes_from_sentence(vocab, sentence) for sentence in l] 77 | for idx, indexes in enumerate(indexes_batch): 78 | indexes_batch[idx] = indexes_batch[idx] + [config['<EOS>']] 79 | lengths = torch.tensor([len(indexes) for indexes in indexes_batch]) 80 | pad_list = zero_padding(indexes_batch) 81 | pad_var = torch.LongTensor(pad_list) 82 | return pad_var, lengths 83 | 84 | 85 | def indexes_from_sentence(vocab, sentence): 86 | indexes = [] 87 | for word in sentence.strip().split(): 88 | try: 89 | indexes.append(vocab.word2index[word]) 90 | except KeyError as e: 91 | indexes.append(config['<UNK>']) 92 | return indexes[:config['max_sequence_length']] 93 | 94 | 95 | def
load_data(batched=True, test=False, file_dir='../../data/t2e/'): 96 | # Load vocab 97 | with open(config['vocab_path'], 'rb') as f: 98 | vocab = pickle.load(f) 99 | 100 | bs = config['batch_size'] 101 | ftype = 'test' if test else 'train' 102 | 103 | df = pd.read_csv('{}text_{}.csv'.format(file_dir, ftype)) 104 | data = (np.array(list(df['transcription'])), np.array(df['label'])) 105 | 106 | data = list(zip(data[0], data[1])) 107 | data.sort(key=lambda x: len(x[0].split()), reverse=True) 108 | 109 | n_iters = len(data) // bs 110 | 111 | if test: 112 | input_batch = [] 113 | output_batch = [] 114 | for e in data: 115 | input_batch.append(e[0]) 116 | output_batch.append(e[1]) 117 | inp, lengths = input_var(input_batch, vocab) 118 | return [inp, lengths, torch.LongTensor(output_batch)] 119 | 120 | batches = [] 121 | for i in range(1, n_iters + 1): 122 | input_batch = [] 123 | output_batch = [] 124 | for e in data[bs * (i-1):bs * i]: 125 | input_batch.append(e[0]) 126 | output_batch.append(e[1]) 127 | inp, lengths = input_var(input_batch, vocab) 128 | batches.append([inp, lengths, 129 | torch.LongTensor(output_batch)]) 130 | return batches 131 | 132 | 133 | def evaluate(targets, predictions): 134 | performance = { 135 | 'acc': accuracy_score(targets, predictions), 136 | 'f1': f1_score(targets, predictions, average='macro'), 137 | 'precision': precision_score(targets, predictions, average='macro'), 138 | 'recall': recall_score(targets, predictions, average='macro')} 139 | return performance 140 | 141 | 142 | def plot_confusion_matrix(targets, predictions, classes, 143 | normalize=False, 144 | title='Confusion matrix', 145 | cmap=plt.cm.Blues): 146 | """ 147 | This function prints and plots the confusion matrix. 148 | Normalization can be applied by setting `normalize=True`. 149 | """ 150 | # plt.figure(figsize=(8,8)) 151 | cm = confusion_matrix(targets, predictions) 152 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 153 | plt.title(title) 154 | plt.colorbar() 155 | tick_marks = np.arange(len(classes)) 156 | plt.xticks(tick_marks, classes, rotation=45) 157 | plt.yticks(tick_marks, classes) 158 | 159 | if normalize: 160 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 161 | print("Normalized confusion matrix") 162 | else: 163 | print('Confusion matrix, without normalization') 164 | 165 | print(cm) 166 | 167 | thresh = cm.max() / 2. 
168 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 169 | plt.text(j, i, cm[i, j], 170 | horizontalalignment="center", 171 | color="white" if cm[i, j] > thresh else "black") 172 | 173 | plt.tight_layout() 174 | plt.ylabel('True label') 175 | plt.xlabel('Predicted label') 176 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import math 4 | import random 5 | import pickle 6 | import librosa 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import soundfile as sf 11 | 12 | 13 | # Part 1: Extract Audio Labels 14 | def extract_audio_labels(): 15 | info_line = re.compile(r'\[.+\]\n', re.IGNORECASE) 16 | 17 | start_times, end_times, wav_file_names, emotions, vals, acts, doms = \ 18 | [], [], [], [], [], [], [] 19 | 20 | for sess in range(1, 6): 21 | emo_evaluation_dir = \ 22 | 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess) 23 | evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l] 24 | for file in evaluation_files: 25 | with open(emo_evaluation_dir + file) as f: 26 | content = f.read() 27 | info_lines = re.findall(info_line, content) 28 | for line in info_lines[1:]: # the first line is a header 29 | start_end_time, wav_file_name, emotion, val_act_dom = \ 30 | line.strip().split('\t') 31 | start_time, end_time = start_end_time[1:-1].split('-') 32 | val, act, dom = val_act_dom[1:-1].split(',') 33 | val, act, dom = float(val), float(act), float(dom) 34 | start_time, end_time = float(start_time), float(end_time) 35 | start_times.append(start_time) 36 | end_times.append(end_time) 37 | wav_file_names.append(wav_file_name) 38 | emotions.append(emotion) 39 | vals.append(val) 40 | acts.append(act) 41 | doms.append(dom) 42 | 43 | df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 44 | 'emotion', 'val', 'act', 'dom']) 45 | 46 | df_iemocap['start_time'] = start_times 47 | df_iemocap['end_time'] = end_times 48 | df_iemocap['wav_file'] = wav_file_names 49 | df_iemocap['emotion'] = emotions 50 | df_iemocap['val'] = vals 51 | df_iemocap['act'] = acts 52 | df_iemocap['dom'] = doms 53 | 54 | df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False) 55 | 56 | 57 | # Part 2: Build Audio Vectors 58 | def build_audio_vectors(): 59 | labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv') 60 | iemocap_dir = 'data/IEMOCAP_full_release/' 61 | 62 | sr = 44100 63 | audio_vectors = {} 64 | for sess in range(1, 6): # using one session due to memory constraint, can replace [5] with range(1, 6) 65 | wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess) 66 | orig_wav_files = os.listdir(wav_file_path) 67 | for orig_wav_file in tqdm(orig_wav_files): 68 | try: 69 | orig_wav_vector, _sr = librosa.load( 70 | wav_file_path + orig_wav_file, sr=sr) 71 | orig_wav_file, file_format = orig_wav_file.split('.') 72 | for index, row in labels_df[labels_df['wav_file'].str.contains( 73 | orig_wav_file)].iterrows(): 74 | start_time, end_time, truncated_wav_file_name, emotion,\ 75 | val, act, dom = row['start_time'], row['end_time'],\ 76 | row['wav_file'], row['emotion'], row['val'],\ 77 | row['act'], row['dom'] 78 | start_frame = math.floor(start_time * sr) 79 | end_frame = math.floor(end_time * sr) 80 | truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1] 81 | audio_vectors[truncated_wav_file_name] = truncated_wav_vector 82 | 
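                    # keyed by the truncated utterance name (e.g. Ses01F_impro01_F000) so the clips can be joined with the labels in df_iemocap.csv later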
except Exception as e: 83 | print('An exception occurred for {}: {}'.format(orig_wav_file, e)) 84 | with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f: 85 | pickle.dump(audio_vectors, f) 86 | 87 | 88 | # Part 3: Extract Audio Features 89 | def extract_audio_features(): 90 | data_dir = 'data/pre-processed/' 91 | labels_df_path = '{}df_iemocap.csv'.format(data_dir) 92 | audio_vectors_path = '{}audio_vectors_1.pkl'.format(data_dir) 93 | labels_df = pd.read_csv(labels_df_path) 94 | audio_vectors = pickle.load(open(audio_vectors_path, 'rb')) 95 | 96 | columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 97 | 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std'] 98 | df_features = pd.DataFrame(columns=columns) 99 | 100 | emotion_dict = {'ang': 0, 101 | 'hap': 1, 102 | 'exc': 2, 103 | 'sad': 3, 104 | 'fru': 4, 105 | 'fea': 5, 106 | 'sur': 6, 107 | 'neu': 7, 108 | 'xxx': 8, 109 | 'oth': 8} 110 | 111 | data_dir = 'data/pre-processed/' 112 | labels_path = '{}df_iemocap.csv'.format(data_dir) 113 | audio_vectors_path = '{}audio_vectors_'.format(data_dir) 114 | labels_df = pd.read_csv(labels_path) 115 | 116 | for sess in (range(1, 6)): 117 | audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb')) 118 | for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))].iterrows()): 119 | try: 120 | wav_file_name = row['wav_file'] 121 | label = emotion_dict[row['emotion']] 122 | y = audio_vectors[wav_file_name] 123 | 124 | feature_list = [wav_file_name, label] # wav_file, label 125 | sig_mean = np.mean(abs(y)) 126 | feature_list.append(sig_mean) # sig_mean 127 | feature_list.append(np.std(y)) # sig_std 128 | 129 | rmse = librosa.feature.rmse(y + 0.0001)[0] 130 | feature_list.append(np.mean(rmse)) # rmse_mean 131 | feature_list.append(np.std(rmse)) # rmse_std 132 | 133 | silence = 0 134 | for e in rmse: 135 | if e <= 0.4 * np.mean(rmse): 136 | silence += 1 137 | silence /= float(len(rmse)) 138 | feature_list.append(silence) # silence 139 | 140 | y_harmonic = librosa.effects.hpss(y)[0] 141 | feature_list.append(np.mean(y_harmonic) * 1000) # harmonic (scaled by 1000) 142 | 143 | # based on the pitch detection algorithm mentioned here: 144 | # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001 145 | cl = 0.45 * sig_mean 146 | center_clipped = [] 147 | for s in y: 148 | if s >= cl: 149 | center_clipped.append(s - cl) 150 | elif s <= -cl: 151 | center_clipped.append(s + cl) 152 | elif np.abs(s) < cl: 153 | center_clipped.append(0) 154 | auto_corrs = librosa.core.autocorrelate(np.array(center_clipped)) 155 | feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs)) # auto_corr_max (scaled by 1000) 156 | feature_list.append(np.std(auto_corrs)) # auto_corr_std 157 | 158 | df_features = df_features.append(pd.DataFrame( 159 | feature_list, index=columns).transpose(), 160 | ignore_index=True) 161 | except Exception as e: 162 | print('Some exception occurred: {}'.format(e)) 163 | 164 | df_features.to_csv('data/pre-processed/audio_features.csv', index=False) 165 | 166 | 167 | def main(): 168 | print('Part 1: Extract Audio Labels') 169 | extract_audio_labels() 170 | print('Part 2: Build Audio Vectors') 171 | build_audio_vectors() 172 | print('Part 3: Extract Audio Features') 173 | extract_audio_features() 174 | 175 | 176 | if __name__ == '__main__': 177 | main() 178 | -------------------------------------------------------------------------------- /1_extract_emotion_labels.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extract labels from the evaluation files\n", 8 | "\n", 9 | "Test for one file first" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import re\n", 19 | "\n", 20 | "# first test with one file\n", 21 | "file_path = 'data/IEMOCAP_full_release/Session1/dialog/EmoEvaluation/Ses01F_impro01.txt'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 8, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "useful_regex = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "with open(file_path) as f:\n", 40 | " file_content = f.read()\n", 41 | " \n", 42 | "info_lines = re.findall(useful_regex, file_content)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 20, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "['[6.2901 - 8.2357]', 'Ses01F_impro01_F000', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 55 | "['[10.0100 - 11.3925]', 'Ses01F_impro01_F001', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 56 | "['[14.8872 - 18.0175]', 'Ses01F_impro01_F002', 'neu', '[2.5000, 2.5000, 2.5000]']\n", 57 | "['[19.2900 - 20.7875]', 'Ses01F_impro01_F003', 'xxx', '[2.5000, 3.0000, 3.0000]']\n", 58 | "['[21.3257 - 24.7400]', 'Ses01F_impro01_F004', 'xxx', '[2.5000, 3.0000, 2.5000]']\n", 59 | "['[27.4600 - 31.4900]', 'Ses01F_impro01_F005', 'neu', '[2.5000, 3.5000, 2.0000]']\n", 60 | "['[38.9650 - 43.5900]', 'Ses01F_impro01_F006', 'fru', '[2.0000, 3.5000, 3.5000]']\n", 61 | "['[46.5800 - 52.1900]', 'Ses01F_impro01_F007', 'fru', '[2.5000, 3.5000, 3.5000]']\n", 62 | "['[56.1600 - 58.8225]', 'Ses01F_impro01_F008', 'fru', '[2.0000, 3.5000, 3.5000]']\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "for l in info_lines[1:10]:\n", 68 | " print(l.strip().split('\\t'))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Compile all the information in a single file" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 64, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import re\n", 85 | "import os\n", 86 | "\n", 87 | "\n", 88 | "info_line = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)\n", 89 | "\n", 90 | "start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []\n", 91 | "\n", 92 | "for sess in range(1, 6):\n", 93 | " emo_evaluation_dir = 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)\n", 94 | " evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]\n", 95 | " for file in evaluation_files:\n", 96 | " with open(emo_evaluation_dir + file) as f:\n", 97 | " content = f.read()\n", 98 | " info_lines = re.findall(info_line, content)\n", 99 | " for line in info_lines[1:]: # the first line is a header\n", 100 | " start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\\t')\n", 101 | " start_time, end_time = start_end_time[1:-1].split('-')\n", 102 | " val, act, dom = val_act_dom[1:-1].split(',')\n", 103 | " val, act, dom = float(val), float(act), float(dom)\n", 104 | " start_time, end_time = float(start_time), float(end_time)\n", 105 | " 
start_times.append(start_time)\n", 106 | " end_times.append(end_time)\n", 107 | " wav_file_names.append(wav_file_name)\n", 108 | " emotions.append(emotion)\n", 109 | " vals.append(val)\n", 110 | " acts.append(act)\n", 111 | " doms.append(dom)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 68, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
start_timeend_timewav_fileemotionvalactdom
10034358.10365.26Ses05F_impro05_M049fru2.53.54.5
10035365.30370.53Ses05F_impro05_M050neu2.53.54.0
10036371.63374.16Ses05F_impro05_M051neu3.02.52.5
10037375.10385.14Ses05F_impro05_M052neu3.53.03.5
10038386.39388.27Ses05F_impro05_M053neu4.02.53.0
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " start_time end_time wav_file emotion val act dom\n", 206 | "10034 358.10 365.26 Ses05F_impro05_M049 fru 2.5 3.5 4.5\n", 207 | "10035 365.30 370.53 Ses05F_impro05_M050 neu 2.5 3.5 4.0\n", 208 | "10036 371.63 374.16 Ses05F_impro05_M051 neu 3.0 2.5 2.5\n", 209 | "10037 375.10 385.14 Ses05F_impro05_M052 neu 3.5 3.0 3.5\n", 210 | "10038 386.39 388.27 Ses05F_impro05_M053 neu 4.0 2.5 3.0" 211 | ] 212 | }, 213 | "execution_count": 68, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "import pandas as pd\n", 220 | "\n", 221 | "df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])\n", 222 | "\n", 223 | "df_iemocap['start_time'] = start_times\n", 224 | "df_iemocap['end_time'] = end_times\n", 225 | "df_iemocap['wav_file'] = wav_file_names\n", 226 | "df_iemocap['emotion'] = emotions\n", 227 | "df_iemocap['val'] = vals\n", 228 | "df_iemocap['act'] = acts\n", 229 | "df_iemocap['dom'] = doms\n", 230 | "\n", 231 | "df_iemocap.tail()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 72, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False)" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.6.9" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /src/train_sentence_classifiers_sklearn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trains different classifiers available in sklearn for sentence classification 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import pickle 8 | 9 | import itertools 10 | import xgboost as xgb 11 | from sklearn.svm import LinearSVC 12 | from sklearn.naive_bayes import MultinomialNB 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.ensemble import RandomForestClassifier 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.feature_selection import SelectFromModel 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | 20 | from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer 21 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score 22 | 23 | import seaborn as sns 24 | import matplotlib.pyplot as plt 25 | 26 | EMOTION_DICT = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5} 27 | EMO_KEYS = list(['ang', 'hap', 'sad', 'fea', 'sur', 'neu']) 28 | 29 | 30 | def train_tfidf_vectors(df): 31 | tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', 32 | encoding='latin-1', ngram_range=(1, 2), 33 | stop_words='english') 34 | return tfidf.fit_transform(df.transcription).toarray() 35 | 36 | 37 | def create_train_test_split(features, labels, test_size=0.2): 38 | return train_test_split(features, labels, test_size=0.20) 39 | 40 | 41 | def plot_confusion_matrix(cm, classes, 42 | 
normalize=False, 43 | title='Confusion matrix', 44 | cmap=plt.cm.Blues): 45 | """ 46 | This function prints and plots the confusion matrix. 47 | Normalization can be applied by setting `normalize=True`. 48 | """ 49 | # plt.figure(figsize=(8,8)) 50 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 51 | plt.title(title) 52 | plt.colorbar() 53 | tick_marks = np.arange(len(classes)) 54 | plt.xticks(tick_marks, classes, rotation=45) 55 | plt.yticks(tick_marks, classes) 56 | 57 | if normalize: 58 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 59 | print("Normalized confusion matrix") 60 | else: 61 | print('Confusion matrix, without normalization') 62 | 63 | print(cm) 64 | 65 | thresh = cm.max() / 2. 66 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 67 | plt.text(j, i, cm[i, j], 68 | horizontalalignment="center", 69 | color="white" if cm[i, j] > thresh else "black") 70 | 71 | plt.tight_layout() 72 | plt.ylabel('True label') 73 | plt.xlabel('Predicted label') 74 | 75 | 76 | def one_hot_encoder(true_labels, num_records, num_classes): 77 | temp = np.array(true_labels[:num_records]) 78 | true_labels = np.zeros((num_records, num_classes)) 79 | true_labels[np.arange(num_records), temp] = 1 80 | return true_labels 81 | 82 | 83 | def display_results(y_test, pred_probs, cm=True): 84 | pred = np.argmax(pred_probs, axis=-1) 85 | one_hot_true = one_hot_encoder(y_test, len(pred), len(EMOTION_DICT)) 86 | print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred))) 87 | print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro'))) 88 | print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro'))) 89 | print('Test Set Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro'))) 90 | if cm: 91 | plot_confusion_matrix(confusion_matrix(y_test, pred), classes=EMO_KEYS) 92 | 93 | 94 | def model_random_forest_classifier(x_train, y_train, x_test, y_test): 95 | rf_classifier = RandomForestClassifier(n_estimators=6000, 96 | min_samples_split=25) 97 | rf_classifier.fit(x_train, y_train) 98 | 99 | # Predict 100 | pred_probs = rf_classifier.predict_proba(x_test) 101 | 102 | # Results 103 | display_results(y_test, pred_probs) 104 | 105 | with open('pred_probas/text_rf_classifier.pkl', 'wb') as f: 106 | pickle.dump(pred_probs, f) 107 | 108 | 109 | def model_xgb_classifier(x_train, y_train, x_test, y_test): 110 | xgb_classifier = xgb.XGBClassifier(max_depth=7, learning_rate=0.008, 111 | objective='multi:softprob', 112 | n_estimators=600, sub_sample=0.8, 113 | num_class=len(EMOTION_DICT), 114 | booster='gbtree', n_jobs=4) 115 | xgb_classifier.fit(x_train, y_train) 116 | 117 | # Predict 118 | pred_probs = xgb_classifier.predict_proba(x_test) 119 | 120 | # Results 121 | display_results(y_test, pred_probs) 122 | 123 | with open('pred_probas/text_xgb_classifier.pkl', 'wb') as f: 124 | pickle.dump(pred_probs, f) 125 | 126 | 127 | def model_svc_classifier(x_train, y_train, x_test, y_test): 128 | svc_classifier = LinearSVC() 129 | 130 | svc_classifier.fit(x_train, y_train) 131 | 132 | # Predict 133 | pred = svc_classifier.predict(x_test) 134 | 135 | # Results 136 | one_hot_true = one_hot_encoder(y_test, len(pred), len(EMOTION_DICT)) 137 | print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred))) 138 | print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro'))) 139 | print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro'))) 140 | print('Test Set 
Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro'))) 141 | plot_confusion_matrix(confusion_matrix(y_test, pred), classes=EMOTION_DICT.keys()) 142 | 143 | with open('pred_probas/text_svc_classifier_model.pkl', 'wb') as f: 144 | pickle.dump(svc_classifier, f) 145 | 146 | 147 | def model_multinomial_naive_bayes_classifier(x_train, y_train, x_test, y_test): 148 | mnb_classifier = MultinomialNB() 149 | 150 | mnb_classifier.fit(x_train, y_train) 151 | 152 | # Predict 153 | pred_probs = mnb_classifier.predict_proba(x_test) 154 | 155 | # Results 156 | display_results(y_test, pred_probs) 157 | 158 | with open('pred_probas/text_mnb_classifier.pkl', 'wb') as f: 159 | pickle.dump(pred_probs, f) 160 | 161 | 162 | def model_mlp_classifier(x_train, y_train, x_test, y_test): 163 | mlp_classifier = MLPClassifier(hidden_layer_sizes=(500, ), 164 | activation='relu', solver='adam', 165 | alpha=0.0001, batch_size='auto', 166 | learning_rate='adaptive', 167 | learning_rate_init=0.01, power_t=0.5, 168 | max_iter=1000, shuffle=True, 169 | random_state=None, tol=0.0001, 170 | verbose=False, warm_start=True, 171 | momentum=0.8, nesterovs_momentum=True, 172 | early_stopping=False, 173 | validation_fraction=0.1, 174 | beta_1=0.9, beta_2=0.999, epsilon=1e-08) 175 | 176 | mlp_classifier.fit(x_train, y_train) 177 | 178 | # Predict 179 | pred_probs = mlp_classifier.predict_proba(x_test) 180 | 181 | # Results 182 | display_results(y_test, pred_probs) 183 | 184 | with open('pred_probas/text_mlp_classifier.pkl', 'wb') as f: 185 | pickle.dump(pred_probs, f) 186 | 187 | 188 | def model_lr_classifier(x_train, y_train, x_test, y_test): 189 | lr_classifier = LogisticRegression(solver='lbfgs', 190 | multi_class='multinomial', 191 | max_iter=1000) 192 | 193 | lr_classifier.fit(x_train, y_train) 194 | 195 | # Predict 196 | pred_probs = lr_classifier.predict_proba(x_test) 197 | 198 | # Results 199 | display_results(y_test, pred_probs) 200 | 201 | with open('pred_probas/text_lr_classifier.pkl', 'wb') as f: 202 | pickle.dump(pred_probs, f) 203 | 204 | 205 | def model_ensemble_of_classifiers(y_test): 206 | # Load predicted probabilities 207 | with open('pred_probas/text_rf_classifier.pkl', 'rb') as f: 208 | rf_pred_probs = pickle.load(f) 209 | 210 | with open('pred_probas/text_xgb_classifier.pkl', 'rb') as f: 211 | xgb_pred_probs = pickle.load(f) 212 | 213 | with open('pred_probas/text_svc_classifier_model.pkl', 'rb') as f: 214 | svc_preds = pickle.load(f) 215 | 216 | with open('pred_probas/text_mnb_classifier.pkl', 'rb') as f: 217 | mnb_pred_probs = pickle.load(f) 218 | 219 | with open('pred_probas/text_mlp_classifier.pkl', 'rb') as f: 220 | mlp_pred_probs = pickle.load(f) 221 | 222 | with open('pred_probas/text_lr_classifier.pkl', 'rb') as f: 223 | lr_pred_probs = pickle.load(f) 224 | 225 | # Average of the predicted probabilities 226 | ensemble_pred_probs = (xgb_pred_probs + 227 | mlp_pred_probs + 228 | rf_pred_probs + 229 | mnb_pred_probs + 230 | lr_pred_probs)/5.0 231 | 232 | # Show metrics 233 | display_results(y_test, ensemble_pred_probs) 234 | 235 | 236 | def load_data(): 237 | df = pd.read_csv('data/t2e/text_train.csv') 238 | df = df.append(pd.read_csv('data/t2e/text_test.csv')) 239 | features = train_tfidf_vectors(df) 240 | labels = df.label 241 | return features, labels 242 | 243 | 244 | def main(): 245 | x_train, x_test, y_train, y_test = create_train_test_split(*load_data()) 246 | model_random_forest_classifier(x_train, y_train, x_test, y_test) 247 | model_xgb_classifier(x_train, y_train, x_test, y_test)
248 | model_svc_classifier(x_train, y_train, x_test, y_test) 249 | model_multinomial_naive_bayes_classifier(x_train, y_train, x_test, y_test) 250 | model_mlp_classifier(x_train, y_train, x_test, y_test) 251 | model_lr_classifier(x_train, y_train, x_test, y_test) 252 | model_ensemble_of_classifiers(y_test) 253 | 254 | 255 | if __name__ == '__main__': 256 | main() 257 | -------------------------------------------------------------------------------- /4_prepare_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Build Speech data files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.preprocessing import MinMaxScaler\n", 19 | "from IPython.display import display\n", 20 | "\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "(7527, 10)\n" 34 | ] 35 | }, 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00070.0036710.0057390.0044340.0036400.018692-0.0081430.0231790.133057
1Ses01F_script02_2_F00170.0063650.0111550.0079130.0078500.444444-0.0171200.0945780.213759
6Ses01F_script02_2_F00600.0396590.0679390.0499300.0460500.345018-0.0046053.4417049.317455
7Ses01F_script02_2_F00740.0144780.0269410.0183840.0196870.422764-0.0118500.5682611.928247
8Ses01F_script02_2_F00800.0252710.0549580.0315710.0449580.470019-0.0051202.5293999.210082
\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 141 | "0 Ses01F_script02_2_F000 7 0.003671 0.005739 0.004434 0.003640 \n", 142 | "1 Ses01F_script02_2_F001 7 0.006365 0.011155 0.007913 0.007850 \n", 143 | "6 Ses01F_script02_2_F006 0 0.039659 0.067939 0.049930 0.046050 \n", 144 | "7 Ses01F_script02_2_F007 4 0.014478 0.026941 0.018384 0.019687 \n", 145 | "8 Ses01F_script02_2_F008 0 0.025271 0.054958 0.031571 0.044958 \n", 146 | "\n", 147 | " silence harmonic auto_corr_max auto_corr_std \n", 148 | "0 0.018692 -0.008143 0.023179 0.133057 \n", 149 | "1 0.444444 -0.017120 0.094578 0.213759 \n", 150 | "6 0.345018 -0.004605 3.441704 9.317455 \n", 151 | "7 0.422764 -0.011850 0.568261 1.928247 \n", 152 | "8 0.470019 -0.005120 2.529399 9.210082 " 153 | ] 154 | }, 155 | "metadata": {}, 156 | "output_type": "display_data" 157 | }, 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00050.0036710.0057390.0044340.0036400.018692-0.0081430.0231790.133057
1Ses01F_script02_2_F00150.0063650.0111550.0079130.0078500.444444-0.0171200.0945780.213759
6Ses01F_script02_2_F00600.0396590.0679390.0499300.0460500.345018-0.0046053.4417049.317455
7Ses01F_script02_2_F00720.0144780.0269410.0183840.0196870.422764-0.0118500.5682611.928247
8Ses01F_script02_2_F00800.0252710.0549580.0315710.0449580.470019-0.0051202.5293999.210082
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 263 | "0 Ses01F_script02_2_F000 5 0.003671 0.005739 0.004434 0.003640 \n", 264 | "1 Ses01F_script02_2_F001 5 0.006365 0.011155 0.007913 0.007850 \n", 265 | "6 Ses01F_script02_2_F006 0 0.039659 0.067939 0.049930 0.046050 \n", 266 | "7 Ses01F_script02_2_F007 2 0.014478 0.026941 0.018384 0.019687 \n", 267 | "8 Ses01F_script02_2_F008 0 0.025271 0.054958 0.031571 0.044958 \n", 268 | "\n", 269 | " silence harmonic auto_corr_max auto_corr_std \n", 270 | "0 0.018692 -0.008143 0.023179 0.133057 \n", 271 | "1 0.444444 -0.017120 0.094578 0.213759 \n", 272 | "6 0.345018 -0.004605 3.441704 9.317455 \n", 273 | "7 0.422764 -0.011850 0.568261 1.928247 \n", 274 | "8 0.470019 -0.005120 2.529399 9.210082 " 275 | ] 276 | }, 277 | "execution_count": 2, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "df = pd.read_csv('data/audio_features.csv')\n", 284 | "df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]\n", 285 | "print(df.shape)\n", 286 | "display(df.head())\n", 287 | "\n", 288 | "# change 7 to 2\n", 289 | "df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})\n", 290 | "df.head()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 3, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "df.to_csv('data/no_sample_df.csv')\n", 300 | "\n", 301 | "# oversample fear\n", 302 | "fear_df = df[df['label']==3]\n", 303 | "for i in range(30):\n", 304 | " df = df.append(fear_df)\n", 305 | "\n", 306 | "sur_df = df[df['label']==4]\n", 307 | "for i in range(10):\n", 308 | " df = df.append(sur_df)\n", 309 | " \n", 310 | "df.to_csv('data/modified_df.csv')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 4, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/html": [ 321 | "
\n", 322 | "\n", 335 | "\n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
wav_filelabelsig_meansig_stdrmse_meanrmse_stdsilenceharmonicauto_corr_maxauto_corr_std
0Ses01F_script02_2_F00050.0108470.0132900.0107150.0193860.0243130.1686250.0002770.000468
1Ses01F_script02_2_F00150.0203060.0277020.0207740.0424890.5781120.1668680.0011410.000753
6Ses01F_script02_2_F00600.1372060.1788220.1422710.2520960.4487830.1693170.0416440.032933
7Ses01F_script02_2_F00720.0487930.0697130.0510510.1074390.5499110.1678990.0068730.006814
8Ses01F_script02_2_F00800.0866860.1442760.0891840.2461000.6113790.1692160.0306040.032553
\n", 419 | "
" 420 | ], 421 | "text/plain": [ 422 | " wav_file label sig_mean sig_std rmse_mean rmse_std \\\n", 423 | "0 Ses01F_script02_2_F000 5 0.010847 0.013290 0.010715 0.019386 \n", 424 | "1 Ses01F_script02_2_F001 5 0.020306 0.027702 0.020774 0.042489 \n", 425 | "6 Ses01F_script02_2_F006 0 0.137206 0.178822 0.142271 0.252096 \n", 426 | "7 Ses01F_script02_2_F007 2 0.048793 0.069713 0.051051 0.107439 \n", 427 | "8 Ses01F_script02_2_F008 0 0.086686 0.144276 0.089184 0.246100 \n", 428 | "\n", 429 | " silence harmonic auto_corr_max auto_corr_std \n", 430 | "0 0.024313 0.168625 0.000277 0.000468 \n", 431 | "1 0.578112 0.166868 0.001141 0.000753 \n", 432 | "6 0.448783 0.169317 0.041644 0.032933 \n", 433 | "7 0.549911 0.167899 0.006873 0.006814 \n", 434 | "8 0.611379 0.169216 0.030604 0.032553 " 435 | ] 436 | }, 437 | "execution_count": 4, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "emotion_dict = {'ang': 0,\n", 444 | " 'hap': 1,\n", 445 | " 'sad': 2,\n", 446 | " 'neu': 3,}\n", 447 | "\n", 448 | "# emotion_dict = {'ang': 0,\n", 449 | "# 'hap': 1,\n", 450 | "# 'exc': 2,\n", 451 | "# 'sad': 3,\n", 452 | "# 'fru': 4,\n", 453 | "# 'fea': 5,\n", 454 | "# 'sur': 6,\n", 455 | "# 'neu': 7,\n", 456 | "# 'xxx': 8,\n", 457 | "# 'oth': 8}\n", 458 | "\n", 459 | "scalar = MinMaxScaler()\n", 460 | "df[df.columns[2:]] = scalar.fit_transform(df[df.columns[2:]])\n", 461 | "df.head()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 5, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "name": "stdout", 471 | "output_type": "stream", 472 | "text": [ 473 | "(7837, 10) (1960, 10)\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "x_train, x_test = train_test_split(df, test_size=0.20)\n", 479 | "\n", 480 | "x_train.to_csv('data/s2e/audio_train.csv', index=False)\n", 481 | "x_test.to_csv('data/s2e/audio_test.csv', index=False)\n", 482 | "\n", 483 | "print(x_train.shape, x_test.shape)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "## Define preprocessing functions for text" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 6, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "import unicodedata\n", 500 | "\n", 501 | "def unicodeToAscii(s):\n", 502 | " return ''.join(\n", 503 | " c for c in unicodedata.normalize('NFD', s)\n", 504 | " if unicodedata.category(c) != 'Mn'\n", 505 | " )\n", 506 | "\n", 507 | "# Lowercase, trim, and remove non-letter characters\n", 508 | "def normalizeString(s):\n", 509 | " s = unicodeToAscii(s.lower().strip())\n", 510 | " s = re.sub(r\"([.!?])\", r\" \\1\", s)\n", 511 | " s = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", s)\n", 512 | " return s" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "## Build Text data files" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 7, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "10087" 531 | ] 532 | }, 533 | "execution_count": 7, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "import re\n", 540 | "import os\n", 541 | "import pickle\n", 542 | "\n", 543 | "useful_regex = re.compile(r'^(\\w+)', re.IGNORECASE)\n", 544 | "\n", 545 | "file2transcriptions = {}\n", 546 | "\n", 547 | "for sess in range(1, 6):\n", 548 | " transcripts_path = 
'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)\n", 549 | " transcript_files = os.listdir(transcripts_path)\n", 550 | " for f in transcript_files:\n", 551 | " with open('{}{}'.format(transcripts_path, f), 'r') as f:\n", 552 | " all_lines = f.readlines()\n", 553 | "\n", 554 | " for l in all_lines:\n", 555 | " audio_code = useful_regex.match(l).group()\n", 556 | " transcription = l.split(':')[-1].strip()\n", 557 | " # assuming that all the keys would be unique and hence no `try`\n", 558 | " file2transcriptions[audio_code] = transcription\n", 559 | "# save dict\n", 560 | "with open('data/t2e/audiocode2text.pkl', 'wb') as file:\n", 561 | " pickle.dump(file2transcriptions, file)\n", 562 | "len(file2transcriptions)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 8, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "audiocode2text = pickle.load(open('data/t2e/audiocode2text.pkl', 'rb'))" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 9, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "(7837, 3) (1960, 3)\n" 584 | ] 585 | } 586 | ], 587 | "source": [ 588 | "# Prepare text data\n", 589 | "text_train = pd.DataFrame()\n", 590 | "text_train['wav_file'] = x_train['wav_file']\n", 591 | "text_train['label'] = x_train['label']\n", 592 | "text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]\n", 593 | "\n", 594 | "text_test = pd.DataFrame()\n", 595 | "text_test['wav_file'] = x_test['wav_file']\n", 596 | "text_test['label'] = x_test['label']\n", 597 | "text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]\n", 598 | "\n", 599 | "text_train.to_csv('data/t2e/text_train.csv', index=False)\n", 600 | "text_test.to_csv('data/t2e/text_test.csv', index=False)\n", 601 | "\n", 602 | "print(text_train.shape, text_test.shape)" 603 | ] 604 | } 605 | ], 606 | "metadata": { 607 | "kernelspec": { 608 | "display_name": "Python 3", 609 | "language": "python", 610 | "name": "python3" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 3 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython3", 622 | "version": "3.6.9" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 2 627 | } 628 | --------------------------------------------------------------------------------
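The least obvious part of extract_audio_features in main.py is the pair of hand-crafted features computed at the end of the loop: the silence ratio (fraction of low-energy RMS frames) and the maximum and spread of the autocorrelation of the center-clipped signal, following the linked pitch-detection write-up. Below is a minimal, self-contained sketch of just those two computations on a synthetic tone. It is an illustration only: the input signal is made up, and it calls librosa.feature.rms, the current name of the RMS helper that main.py invokes as librosa.feature.rmse.

import numpy as np
import librosa

# A 1-second synthetic "utterance": a 220 Hz tone with a near-silent lead-in,
# standing in for one of the truncated IEMOCAP wav vectors.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = 0.5 * np.sin(2 * np.pi * 220 * t)
y[: sr // 4] *= 0.01

# Frame-wise RMS energy (main.py calls the older librosa.feature.rmse).
rmse = librosa.feature.rms(y=y + 0.0001)[0]

# Silence ratio: fraction of frames whose energy is below 40% of the mean energy.
silence = float(np.sum(rmse <= 0.4 * np.mean(rmse))) / len(rmse)

# Center clipping before autocorrelation, as in the referenced pitch-detection
# method: samples inside the +/- cl band are zeroed, the rest are shifted toward zero.
sig_mean = np.mean(np.abs(y))
cl = 0.45 * sig_mean
center_clipped = np.where(y >= cl, y - cl,
                          np.where(y <= -cl, y + cl, 0.0))

auto_corrs = librosa.autocorrelate(center_clipped)
auto_corr_max = 1000 * np.max(auto_corrs) / len(auto_corrs)  # scaled as in main.py
auto_corr_std = np.std(auto_corrs)

print('silence ratio: {0:.3f}'.format(silence))
print('auto_corr_max: {0:.3f}'.format(auto_corr_max))
print('auto_corr_std: {0:.3f}'.format(auto_corr_std))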
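The ensemble step in src/train_sentence_classifiers_sklearn.py is likewise simpler than it may look: the probability-based classifiers pickle their (n_samples x n_classes) prediction matrices into pred_probas/, and model_ensemble_of_classifiers averages five of those matrices element-wise before taking an argmax per row. The following is a small sketch of that soft-voting average with randomly generated stand-ins for the pickled arrays; the shapes and values are illustrative, not the repository's actual outputs.

import numpy as np
from sklearn.metrics import accuracy_score

np.random.seed(0)

# Stand-ins for the pickled (n_samples x n_classes) probability matrices
# produced by the individual text classifiers (rf, xgb, mnb, mlp, lr).
n_samples, n_classes = 10, 6
y_test = np.random.randint(0, n_classes, size=n_samples)
model_probas = [np.random.dirichlet(np.ones(n_classes), size=n_samples)
                for _ in range(5)]

# The ensemble is a plain average of the probability matrices,
# followed by an argmax over the class axis.
ensemble_probas = np.mean(model_probas, axis=0)
pred = np.argmax(ensemble_probas, axis=-1)

print('ensemble accuracy on dummy labels: {0:.3f}'.format(
    accuracy_score(y_test, pred)))

Averaging class probabilities (soft voting) rather than majority-voting on hard labels lets a confident minority of models outweigh an uncertain majority, which is why the script stores full probability matrices instead of predicted labels.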