├── GitAudioEmotion.ipynb ├── GitTextOnlyClassification.ipynb ├── LICENSE ├── README.md ├── Text_and_Audio_Emotion.ipynb └── img ├── 1.txt └── Multimodal_1.png /GitAudioEmotion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "GitAudioEmotion.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "qerI8Kh8hpEY", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "!cp '/content/drive/My Drive/IEMOCAP_full_release.7z' '/content/'" 26 | ], 27 | "execution_count": 0, 28 | "outputs": [] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "8S1PhQVziOtB", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | "***UNZIP THE IEMOCAP DATASET***" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "NOm3xEnhiKgi", 44 | "colab_type": "code", 45 | "colab": {} 46 | }, 47 | "source": [ 48 | "!p7zip -d '/content/IEMOCAP_full_release.7z'" 49 | ], 50 | "execution_count": 0, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "ojofRy8zS0Ta", 57 | "colab_type": "code", 58 | "colab": {} 59 | }, 60 | "source": [ 61 | "!pip install transformers\n", 62 | "!pip install tensorboardx" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "Y5rruwKFiX0x", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "*PREPROCESSING THE IEMOCAP AUDIO FILES*: thanks to https://github.com/MITESHPUTHRANNEU/Speech-Emotion-Analyzer/blob/master/final_results_gender_test.ipynb and https://github.com/david-yoon/multimodal-speech-emotion/blob/master/preprocessing/IEMOCAP_01_wav_to_feature.ipynb " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "TmyifjHXii-K", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "import librosa\n", 86 | "import librosa.display\n", 87 | "import numpy as np\n", 88 | "import matplotlib.pyplot as plt\n", 89 | "import pandas as pd\n", 90 | "from matplotlib.backend_bases import RendererBase\n", 91 | "from scipy import signal\n", 92 | "from scipy.io import wavfile\n", 93 | "import os\n", 94 | "from PIL import Image\n", 95 | "from scipy.fftpack import fft\n", 96 | "import torch\n", 97 | "import torch.nn as nn\n", 98 | "import torch.optim as optim\n", 99 | "import torch.nn.functional as F\n", 100 | "from torch.utils import data\n", 101 | "import torchvision.datasets as datasets\n", 102 | "import torchvision.transforms as transforms\n", 103 | "from tensorboardX import SummaryWriter" 104 | ], 105 | "execution_count": 0, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "6wan_pCAlZj7", 112 | "colab_type": "text" 113 | }, 114 | "source": [ 115 | "**read all the files into a list at the sentence level**" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "Kkcz86elmCv6", 122 | "colab_type": "code", 123 | "colab": {} 124 | }, 125 | "source": [ 126 | "import os\n", 127 | "import chardet\n", 128 | "\n", 129 | "def file_search(dirname, ret, list_avoid_dir=[]):\n", 130 | " filenames = os.listdir(dirname)\n", 131 | " \n", 132 | " for filename in filenames:\n", 133 | " 
full_filename = os.path.join(dirname, filename)\n", 134 | "\n", 135 | " if os.path.isdir(full_filename) :\n", 136 | " if full_filename.split('/')[-1] in list_avoid_dir:\n", 137 | " continue\n", 138 | " else:\n", 139 | " file_search(full_filename, ret, list_avoid_dir)\n", 140 | " \n", 141 | " else:\n", 142 | " ret.append( full_filename ) " 143 | ], 144 | "execution_count": 0, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "metadata": { 150 | "id": "RToQ2If_lIbO", 151 | "colab_type": "code", 152 | "outputId": "131b4e31-bd23-4e17-cd2f-b7d73118622b", 153 | "colab": { 154 | "base_uri": "https://localhost:8080/", 155 | "height": 101 156 | } 157 | }, 158 | "source": [ 159 | "list_files = []\n", 160 | "for x in range(5):\n", 161 | " sess_name = 'Session' + str(x+1)\n", 162 | " path = '/content/IEMOCAP_full_release/'+ sess_name + '/sentences/wav/'\n", 163 | " file_search(path, list_files)\n", 164 | " list_files = sorted(list_files)\n", 165 | " print (sess_name + \", #sum files: \" + str(len(list_files)))\n", 166 | "#extract_feature( list_files, out_file )" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [ 170 | { 171 | "output_type": "stream", 172 | "text": [ 173 | "Session1, #sum files: 1820\n", 174 | "Session2, #sum files: 3633\n", 175 | "Session3, #sum files: 5769\n", 176 | "Session4, #sum files: 7873\n", 177 | "Session5, #sum files: 10043\n" 178 | ], 179 | "name": "stdout" 180 | } 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "id": "uKP6PwqvILJT", 187 | "colab_type": "text" 188 | }, 189 | "source": [ 190 | "***The code below is from:***\n", 191 | "https://github.com/Escanor1996/Speech-Emotion-Recognition-SER-/blob/master/SER.ipynb\n", 192 | "\n", 193 | "***Paper:*** Attention Based Fully Convolutional Network for Speech Emotion Recognition\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "metadata": { 199 | "id": "788jVVFCH7vb", 200 | "colab_type": "code", 201 | "colab": {} 202 | }, 203 | "source": [ 204 | "def audio2spectrogram(filepath):\n", 205 | " #fig = plt.figure(figsize=(5,5))\n", 206 | " samplerate, test_sound = wavfile.read(filepath,mmap=True)\n", 207 | " #print('samplerate',samplerate)\n", 208 | " _, spectrogram = log_specgram(test_sound, samplerate)\n", 209 | " #print(spectrogram.shape)\n", 210 | " #print(type(spectrogram))\n", 211 | " #plt.imshow(spectrogram.T, aspect='auto', origin='lower')\n", 212 | " return spectrogram\n", 213 | " \n", 214 | "def audio2wave(filepath):\n", 215 | " fig = plt.figure(figsize=(5,5))\n", 216 | " samplerate, test_sound = wavfile.read(filepath,mmap=True)\n", 217 | " plt.plot(test_sound)" 218 | ], 219 | "execution_count": 0, 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "metadata": { 225 | "id": "jkiYHO6MH3Kr", 226 | "colab_type": "code", 227 | "colab": {} 228 | }, 229 | "source": [ 230 | "def log_specgram(audio, sample_rate, window_size=40,\n", 231 | " step_size=20, eps=1e-10):\n", 232 | " nperseg = int(round(window_size * sample_rate / 1e3))\n", 233 | " noverlap = int(round(step_size * sample_rate / 1e3))\n", 234 | " #print('noverlap',noverlap)\n", 235 | " #print('nperseg',nperseg)\n", 236 | " freqs, _, spec = signal.spectrogram(audio,\n", 237 | " fs=sample_rate,\n", 238 | " window='hann',\n", 239 | " nperseg=nperseg,\n", 240 | " noverlap=noverlap,\n", 241 | " detrend=False)\n", 242 | " return freqs, np.log(spec.T.astype(np.float32) + eps)" 243 | ], 244 | "execution_count": 0, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "metadata": { 250
| "id": "sv66rHV3pLkE", 251 | "colab_type": "code", 252 | "colab": {} 253 | }, 254 | "source": [ 255 | "N_CHANNELS = 3\n", 256 | "def get_3d_spec(Sxx_in, moments=None):\n", 257 | " if moments is not None:\n", 258 | " (base_mean, base_std, delta_mean, delta_std,\n", 259 | " delta2_mean, delta2_std) = moments\n", 260 | " else:\n", 261 | " base_mean, delta_mean, delta2_mean = (0, 0, 0)\n", 262 | " base_std, delta_std, delta2_std = (1, 1, 1)\n", 263 | " h, w = Sxx_in.shape\n", 264 | " right1 = np.concatenate([Sxx_in[:, 0].reshape((h, -1)), Sxx_in], axis=1)[:, :-1]\n", 265 | " delta = (Sxx_in - right1)[:, 1:]\n", 266 | " delta_pad = delta[:, 0].reshape((h, -1))\n", 267 | " delta = np.concatenate([delta_pad, delta], axis=1)\n", 268 | " right2 = np.concatenate([delta[:, 0].reshape((h, -1)), delta], axis=1)[:, :-1]\n", 269 | " delta2 = (delta - right2)[:, 1:]\n", 270 | " delta2_pad = delta2[:, 0].reshape((h, -1))\n", 271 | " delta2 = np.concatenate([delta2_pad, delta2], axis=1)\n", 272 | " base = (Sxx_in - base_mean) / base_std\n", 273 | " delta = (delta - delta_mean) / delta_std\n", 274 | " delta2 = (delta2 - delta2_mean) / delta2_std\n", 275 | " stacked = [arr.reshape((h, w, 1)) for arr in (base, delta, delta2)]\n", 276 | " return np.concatenate(stacked, axis=2)" 277 | ], 278 | "execution_count": 0, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "id": "DMNBRvIxRfUc", 285 | "colab_type": "text" 286 | }, 287 | "source": [ 288 | "Spectrogram: one axis represents the time(X-axis), the second axis represents frequencies(Y-axis) and the colors represent magnitude (amplitude) of the observed frequency at a particular time" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "id": "HCQegmJSYX-T", 295 | "colab_type": "text" 296 | }, 297 | "source": [ 298 | "***read the processed transcription file to collect the labels***" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "metadata": { 304 | "id": "bmXOncwLSThg", 305 | "colab_type": "code", 306 | "colab": {} 307 | }, 308 | "source": [ 309 | "import pandas as pd\n", 310 | "df=pd.read_excel('/content/drive/My Drive/Imocap_text/processed_tran.xlsx')\n", 311 | "#filename=list_files[60].split('/')[-1].strip('.wav')\n", 312 | "#lable=df.loc[df['sessionID']==filename]['label'].values[0]\n", 313 | "#if(lable!=-1):" 314 | ], 315 | "execution_count": 0, 316 | "outputs": [] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": { 321 | "id": "atR-YPWjneVs", 322 | "colab_type": "text" 323 | }, 324 | "source": [ 325 | "***EXTACT THE MFCC FEATURE USING LIBROSA***" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "vsY4d9gPm3lC", 332 | "colab_type": "code", 333 | "colab": {} 334 | }, 335 | "source": [ 336 | "no_rows=len(list_files)\n", 337 | "index=0\n", 338 | "sprectrogram_shape=[]\n", 339 | "docs = []\n", 340 | "bookmark=0\n", 341 | "extraLabel=0\n", 342 | "for everyFile in list_files:\n", 343 | " if(everyFile.split('/')[-1].endswith('.wav')):\n", 344 | " filename=everyFile.split('/')[-1].strip('.wav')\n", 345 | " lable=df.loc[df['sessionID']==filename]['label'].values[0]\n", 346 | " print('label',lable)\n", 347 | " if(lable!=-1):\n", 348 | " #sprectrogram_shape.append(audio2spectrogram(everyFile))\n", 349 | " spector=audio2spectrogram(everyFile)\n", 350 | " spector=get_3d_spec(spector)\n", 351 | " npimg = np.transpose(spector,(2,0,1))\n", 352 | " input_tensor=torch.tensor(npimg)\n", 353 | " input_batch = input_tensor.unsqueeze(0) # create 
a mini-batch as expected by the model\n", 354 | " #X, sample_rate = librosa.load(everyFile, res_type='kaiser_fast',sr=22050*2)\n", 355 | " #sample_rate = np.array(sample_rate)\n", 356 | " #mfccs = np.mean(librosa.feature.mfcc(y=X,sr=sample_rate,n_mfcc=13),axis=0)\n", 357 | " #feature = mfccs\n", 358 | " docs.append({\n", 359 | " 'fileName':everyFile.split('/')[-1][:-len('.wav')],\n", 360 | " #'feature_mfcc':feature,\n", 361 | " 'sprectrome':input_batch,\n", 362 | " 'label':label\n", 363 | " })\n", 364 | " index+=1\n", 365 | " print('index',index)\n", 366 | " else:\n", 367 | " extraLabel=extraLabel+1\n", 368 | " print('extraLabel',extraLabel)" 369 | ], 370 | "execution_count": 0, 371 | "outputs": [] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": { 376 | "id": "jPnKlxI1PSmO", 377 | "colab_type": "text" 378 | }, 379 | "source": [ 380 | "***Test AlexNet input***" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "metadata": { 386 | "id": "Ij-TYGuMPcjZ", 387 | "colab_type": "code", 388 | "colab": {} 389 | }, 390 | "source": [ 391 | "import torch\n", 392 | "import torch.nn as nn\n", 393 | "#from .utils import load_state_dict_from_url\n", 394 | "from torch.hub import load_state_dict_from_url\n", 395 | "\n", 396 | "__all__ = ['AlexNet', 'alexnet']\n", 397 | "\n", 398 | "\n", 399 | "model_urls = {\n", 400 | " 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',\n", 401 | "}\n", 402 | "\n", 403 | "\n", 404 | "class AlexNet(nn.Module):\n", 405 | " def __init__(self, num_classes=1000):\n", 406 | " super(AlexNet, self).__init__()\n", 407 | " self.num_classes=num_classes\n", 408 | " self.features = nn.Sequential(\n", 409 | " nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),\n", 410 | " nn.ReLU(inplace=True),\n", 411 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 412 | " nn.Conv2d(64, 192, kernel_size=5, padding=2),\n", 413 | " nn.ReLU(inplace=True),\n", 414 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 415 | " nn.Conv2d(192, 384, kernel_size=3, padding=1),\n", 416 | " nn.ReLU(inplace=True),\n", 417 | " nn.Conv2d(384, 256, kernel_size=3, padding=1),\n", 418 | " nn.ReLU(inplace=True),\n", 419 | " nn.Conv2d(256, 256, kernel_size=3, padding=1),\n", 420 | " nn.ReLU(inplace=True),\n", 421 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 422 | " )\n", 423 | " self.avgpool = nn.AdaptiveAvgPool2d((12, 12))\n", 424 | " self.classifier = nn.Sequential(\n", 425 | " nn.Dropout(),\n", 426 | " nn.Linear(256 * 6 * 6, 4096),\n", 427 | " nn.ReLU(inplace=True),\n", 428 | " nn.Dropout(),\n", 429 | " nn.Linear(4096, 4096),\n", 430 | " nn.ReLU(inplace=True),\n", 431 | " nn.Linear(4096, num_classes),\n", 432 | " )\n", 433 | "\n", 434 | " def forward(self, x):\n", 435 | " x = self.features(x)\n", 436 | " print('features',x.shape)\n", 437 | " \n", 438 | " #x = self.avgpool(x)\n", 439 | " #print('avgpool',x.shape)\n", 440 | " #x = torch.flatten(x, 1)\n", 441 | " #print('flatten',x.shape)\n", 442 | " #x = self.classifier(x)\n", 443 | " return x\n", 444 | "def alexnet(pretrained=False, progress=True, **kwargs):\n", 445 | " model = AlexNet(**kwargs)\n", 446 | " if pretrained:\n", 447 | " state_dict = load_state_dict_from_url(model_urls['alexnet'],\n", 448 | " progress=progress)\n", 449 | " model.load_state_dict(state_dict)\n", 450 | " return model" 451 | ], 452 | "execution_count": 0, 453 | "outputs": [] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "id": "-qClksYw-dtb", 459 | "colab_type": "text" 460 | }, 461 | "source": [ 462 | "***Modified AlexNet***" 
463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "metadata": { 468 | "id": "0zQEjewO-hLS", 469 | "colab_type": "code", 470 | "colab": {} 471 | }, 472 | "source": [ 473 | "class ModifiedAlexNet(nn.Module):\n", 474 | " def __init__(self, num_classes=4):\n", 475 | " super(ModifiedAlexNet, self).__init__()\n", 476 | " self.num_classes=num_classes\n", 477 | " self.features = nn.Sequential(\n", 478 | " nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),\n", 479 | " nn.ReLU(inplace=True),\n", 480 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 481 | " nn.Conv2d(64, 192, kernel_size=5, padding=2),\n", 482 | " nn.ReLU(inplace=True),\n", 483 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 484 | " nn.Conv2d(192, 384, kernel_size=3, padding=1),\n", 485 | " nn.ReLU(inplace=True),\n", 486 | " nn.Conv2d(384, 256, kernel_size=3, padding=1),\n", 487 | " nn.ReLU(inplace=True),\n", 488 | " nn.Conv2d(256, 256, kernel_size=3, padding=1),\n", 489 | " nn.ReLU(inplace=True),\n", 490 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 491 | " )\n", 492 | " self.classifier = nn.Sequential(\n", 493 | " nn.Dropout(0.5),\n", 494 | " nn.Linear(256, num_classes),\n", 495 | " )\n", 496 | " self.softmax = nn.Softmax(dim=1)\n", 497 | "\n", 498 | " def forward(self, x):\n", 499 | " x = self.features(x)\n", 500 | " #print('features',x.shape)\n", 501 | " x=torch.flatten(x, start_dim=2) # flatten to a sequence a1..aL of 256-dim feature vectors\n", 502 | " x=torch.sum(x, dim=2) # sum-pool a1*alpha1+...+aL*alphaL over all positions (uniform attention weights)\n", 503 | " #print(x.shape)\n", 504 | " x=self.classifier(x)\n", 505 | " #print('classifier',x)\n", 506 | " #x=self.softmax(x)\n", 507 | " #print('softmax',x)\n", 508 | " #x = self.avgpool(x)\n", 509 | " #print('avgpool',x.shape)\n", 510 | " #x = torch.flatten(x, 1)\n", 511 | " #print('flatten',x.shape)\n", 512 | " #x = self.classifier(x)\n", 513 | " return x\n", 514 | " \n", 515 | "def modifiedAlexNet(pretrained=False, progress=True, **kwargs):\n", 516 | " model_modified = ModifiedAlexNet(**kwargs)\n", 517 | " if pretrained:\n", 518 | " state_dict = load_state_dict_from_url(model_urls['alexnet'],\n", 519 | " progress=progress)\n", 520 | " model_modified.load_state_dict(state_dict)\n", 521 | " return model_modified" 522 | ], 523 | "execution_count": 0, 524 | "outputs": [] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": { 529 | "id": "cBABecl_Gl9N", 530 | "colab_type": "text" 531 | }, 532 | "source": [ 533 | "***Create the modified model instance and initialize it with the pretrained model***" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "BsIPd9oBGvag", 540 | "colab_type": "code", 541 | "outputId": "5cd3a187-7234-4f23-e930-34e1f30543bc", 542 | "colab": { 543 | "base_uri": "https://localhost:8080/", 544 | "height": 386 545 | } 546 | }, 547 | "source": [ 548 | "original_model=alexnet(pretrained=True)\n", 549 | "original_dict = original_model.state_dict()\n", 550 | "modifiedAlexNet=modifiedAlexNet(pretrained=False)\n", 551 | "modified_model_dict = modifiedAlexNet.state_dict()\n", 552 | "pretrained_modified_model_dict = {k: v for k, v in original_dict.items() if k in modified_model_dict and v.shape == modified_model_dict[k].shape}\n", 553 | "modified_model_dict.update(pretrained_modified_model_dict)\nmodifiedAlexNet.load_state_dict(modified_model_dict) # actually copy the matching pretrained weights in\nmodifiedAlexNet.to('cuda')" 554 | ], 555 | "execution_count": 0, 556 | "outputs": [ 557 | { 558 | "output_type": "execute_result", 559 | "data": { 560 | "text/plain": [ 561 | "ModifiedAlexNet(\n", 562 | " (features): Sequential(\n", 563 | " (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))\n", 564 | " (1): ReLU(inplace=True)\n", 565 | " (2): MaxPool2d(kernel_size=3, 
stride=2, padding=0, dilation=1, ceil_mode=False)\n", 566 | " (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", 567 | " (4): ReLU(inplace=True)\n", 568 | " (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 569 | " (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 570 | " (7): ReLU(inplace=True)\n", 571 | " (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 572 | " (9): ReLU(inplace=True)\n", 573 | " (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 574 | " (11): ReLU(inplace=True)\n", 575 | " (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 576 | " )\n", 577 | " (classifier): Sequential(\n", 578 | " (0): Dropout(p=0.5, inplace=False)\n", 579 | " (1): Linear(in_features=256, out_features=4, bias=True)\n", 580 | " )\n", 581 | " (softmax): Softmax(dim=1)\n", 582 | ")" 583 | ] 584 | }, 585 | "metadata": { 586 | "tags": [] 587 | }, 588 | "execution_count": 15 589 | } 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": { 595 | "id": "3ObB4bOBPmA2", 596 | "colab_type": "text" 597 | }, 598 | "source": [ 599 | "***Pass an audio file through the modified AlexNet***" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "metadata": { 605 | "id": "BQxmqNSyTWnr", 606 | "colab_type": "code", 607 | "outputId": "0aa1f7c2-e9a3-4633-f7af-e215a86c3a43", 608 | "colab": { 609 | "base_uri": "https://localhost:8080/", 610 | "height": 34 611 | } 612 | }, 613 | "source": [ 614 | "x=audio2spectrogram(list_files[40])\n", 615 | "x=get_3d_spec(x)\n", 616 | "npimg = np.transpose(x,(2,0,1))\n", 617 | "input_tensor=torch.tensor(npimg)\n", 618 | "\n", 619 | "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", 620 | "if torch.cuda.is_available():\n", 621 | " input_batch = input_batch.to('cuda')\n", 622 | " modifiedAlexNet.to('cuda')\n", 623 | "with torch.no_grad():\n", 624 | " output = modifiedAlexNet(input_batch)\n", 625 | " #output.squeeze().shape\n", 626 | " #output=torch.flatten(output, start_dim=2)\n", 627 | " #print(output.shape)\n", 628 | " #output=torch.sum(output, dim=2)\n", 629 | " print(output)" 630 | ], 631 | "execution_count": 0, 632 | "outputs": [ 633 | { 634 | "output_type": "stream", 635 | "text": [ 636 | "tensor([[-3.6122, -2.7249, 2.3582, -0.5873]], device='cuda:0')\n" 637 | ], 638 | "name": "stdout" 639 | } 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "id": "SfT1DDnLQg1b", 646 | "colab_type": "text" 647 | }, 648 | "source": [ 649 | "***Shuffling and dividing the input data***" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "metadata": { 655 | "id": "q7JZNzX9QvM3", 656 | "colab_type": "code", 657 | "outputId": "9ad46ee4-1d1d-4b2a-9bf4-e180dd50465a", 658 | "colab": { 659 | "base_uri": "https://localhost:8080/", 660 | "height": 50 661 | } 662 | }, 663 | "source": [ 664 | "import random\n", 665 | "random.shuffle(docs)\n", 666 | "random.shuffle(docs)\n", 667 | "random.shuffle(docs)\n", 668 | "total_length=len(docs)\n", 669 | "train_length=int(.9*total_length)\n", 670 | "train_list=docs[0:train_length]\n", 671 | "test_list=docs[train_length:]\n", 672 | "print('no of items for train ',len(train_list))\n", 673 | "print('no of items for test ',len(test_list))" 674 | ], 675 | "execution_count": 0, 676 | "outputs": [ 677 | { 678 | "output_type": "stream", 679 | "text": [ 680 | "no of items for train 4977\n", 681 | "no of items for 
test 554\n" 682 | ], 683 | "name": "stdout" 684 | } 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": { 690 | "id": "3k1pQ5INTvrm", 691 | "colab_type": "text" 692 | }, 693 | "source": [ 694 | "***Plot training loss and accuracy***" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "metadata": { 700 | "id": "b1mv-uS-T2Mo", 701 | "colab_type": "code", 702 | "outputId": "b9713c74-8b8b-418c-f9f9-f48e471e0980", 703 | "colab": { 704 | "base_uri": "https://localhost:8080/", 705 | "height": 871 706 | } 707 | }, 708 | "source": [ 709 | "%load_ext tensorboard\n", 710 | "%tensorboard --logdir ./" 711 | ], 712 | "execution_count": 0, 713 | "outputs": [ 714 | { 715 | "output_type": "stream", 716 | "text": [ 717 | "The tensorboard extension is already loaded. To reload it, use:\n", 718 | " %reload_ext tensorboard\n" 719 | ], 720 | "name": "stdout" 721 | }, 722 | { 723 | "output_type": "display_data", 724 | "data": { 725 | "text/plain": [ 726 | "Reusing TensorBoard on port 6006 (pid 341), started 0:35:43 ago. (Use '!kill 341' to kill it.)" 727 | ] 728 | }, 729 | "metadata": { 730 | "tags": [] 731 | } 732 | }, 733 | { 734 | "output_type": "display_data", 735 | "data": { 736 | "application/javascript": [ 737 | "\n", 738 | " (async () => {\n", 739 | " const url = await google.colab.kernel.proxyPort(6006, {\"cache\": true});\n", 740 | " const iframe = document.createElement('iframe');\n", 741 | " iframe.src = url;\n", 742 | " iframe.setAttribute('width', '100%');\n", 743 | " iframe.setAttribute('height', '800');\n", 744 | " iframe.setAttribute('frameborder', 0);\n", 745 | " document.body.appendChild(iframe);\n", 746 | " })();\n", 747 | " " 748 | ], 749 | "text/plain": [ 750 | "" 751 | ] 752 | }, 753 | "metadata": { 754 | "tags": [] 755 | } 756 | } 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": { 762 | "id": "EwgNOF5oK-JV", 763 | "colab_type": "text" 764 | }, 765 | "source": [ 766 | "***Model Parameters***" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "metadata": { 772 | "id": "gzWnvXf5LCIU", 773 | "colab_type": "code", 774 | "outputId": "12e2f481-95ed-473e-ebce-25b6474665af", 775 | "colab": { 776 | "base_uri": "https://localhost:8080/", 777 | "height": 218 778 | } 779 | }, 780 | "source": [ 781 | "for name, param in modifiedAlexNet.named_parameters():\n", 782 | " if(param.requires_grad):\n", 783 | " print(name)\n", 784 | " else:\n", 785 | " print('no grad',name)" 786 | ], 787 | "execution_count": 0, 788 | "outputs": [ 789 | { 790 | "output_type": "stream", 791 | "text": [ 792 | "features.0.weight\n", 793 | "features.0.bias\n", 794 | "features.3.weight\n", 795 | "features.3.bias\n", 796 | "features.6.weight\n", 797 | "features.6.bias\n", 798 | "features.8.weight\n", 799 | "features.8.bias\n", 800 | "features.10.weight\n", 801 | "features.10.bias\n", 802 | "classifier.1.weight\n", 803 | "classifier.1.bias\n" 804 | ], 805 | "name": "stdout" 806 | } 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": { 812 | "id": "JoSuRA4cKeSy", 813 | "colab_type": "text" 814 | }, 815 | "source": [ 816 | "***optimizer***" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "metadata": { 822 | "id": "-_BAUU5kKhAs", 823 | "colab_type": "code", 824 | "colab": {} 825 | }, 826 | "source": [ 827 | "import torch.optim as optim\n", 828 | "from transformers import AdamW\n", 829 | "criterion = nn.CrossEntropyLoss()\n", 830 | "optimizer = AdamW(modifiedAlexNet.parameters(),\n", 831 | " lr = 2e-4, \n", 832 | " eps = 1e-8\n", 833 | " )\n", 834 
| "from transformers import get_linear_schedule_with_warmup\n", 835 | "\n", 836 | "NUM_EPOCHS=16\n", 837 | "\n", 838 | "writer = SummaryWriter(log_dir='/content/')\n", 839 | "total_steps = len(train_list) * NUM_EPOCHS\n", 840 | "\n", 841 | "# Create the learning rate scheduler.\n", 842 | "scheduler = get_linear_schedule_with_warmup(optimizer, \n", 843 | " num_warmup_steps = 0, # Default value in run_glue.py\n", 844 | " num_training_steps = total_steps)\n" 845 | ], 846 | "execution_count": 0, 847 | "outputs": [] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": { 852 | "id": "yYFow8ZlQB67", 853 | "colab_type": "text" 854 | }, 855 | "source": [ 856 | "***Training Loop***" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "metadata": { 862 | "id": "1Hj8xc3LQFDi", 863 | "colab_type": "code", 864 | "colab": {} 865 | }, 866 | "source": [ 867 | "total_steps = 1\n", 868 | "\n", 869 | "\n", 870 | "seed_val = 42\n", 871 | "random.seed(seed_val)\n", 872 | "np.random.seed(seed_val)\n", 873 | "torch.manual_seed(seed_val)\n", 874 | "torch.cuda.manual_seed_all(seed_val)\n", 875 | "\n", 876 | "for epoch in range(NUM_EPOCHS):\n", 877 | " modifiedAlexNet.train()\n", 878 | " for every_trainlist in train_list:\n", 879 | " label1=every_trainlist['label']\n", 880 | " label1=torch.tensor([label1])\n", 881 | " sprectrome=every_trainlist['sprectrome']\n", 882 | " if(sprectrome.shape[2]>65):\n", 883 | " optimizer.zero_grad()\n", 884 | " sprectrome = sprectrome.to('cuda')\n", 885 | " label1=label1.to('cuda')\n", 886 | " modifiedAlexNet.zero_grad()\n", 887 | " output = modifiedAlexNet(sprectrome)\n", 888 | " #print('softmax output ',output)\n", 889 | " loss = criterion(output, label1)\n", 890 | " #print('label1',label1)\n", 891 | " #print('loss',loss.item())\n", 892 | " loss.backward()\n", 893 | " torch.nn.utils.clip_grad_norm_(modifiedAlexNet.parameters(), 1.0)\n", 894 | " optimizer.step()\n", 895 | " scheduler.step()\n", 896 | " _, preds = torch.max(output, 1)\n", 897 | " accuracy = torch.sum(preds == label1)\n", 898 | " #print('accuracy.item()',accuracy.item())\n", 899 | " #print('preds',preds)\n", 900 | " if total_steps % 10 == 0:\n", 901 | " with torch.no_grad():\n", 902 | " _, preds = torch.max(output, 1)\n", 903 | " accuracy = torch.sum(preds == label1)\n", 904 | " #print('Epoch: {} \\tStep: {} \\tLoss: {:.4f} \\tAcc: {}'.format(epoch + 1, total_steps, loss.item(), accuracy.item()))\n", 905 | " tbwriter.add_scalar('loss', loss.item(), total_steps)\n", 906 | " tbwriter.add_scalar('accuracy', accuracy.item(), total_steps) \n", 907 | " total_steps+=1" 908 | ], 909 | "execution_count": 0, 910 | "outputs": [] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": { 915 | "id": "ytSK9YJdzDh2", 916 | "colab_type": "text" 917 | }, 918 | "source": [ 919 | "***save and load the model***" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "metadata": { 925 | "id": "GyEVoGs-zHZm", 926 | "colab_type": "code", 927 | "colab": {} 928 | }, 929 | "source": [ 930 | "torch.save(modifiedAlexNet, '/content/drive/My Drive/savedModel/model_audio_new_opt.pt')\n", 931 | "model=torch.load('/content/drive/My Drive/savedModel/model_audio_new_opt.pt')\n", 932 | "model.eval()\n", 933 | "model.to('cpu')" 934 | ], 935 | "execution_count": 0, 936 | "outputs": [] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": { 941 | "id": "GfuJ5F0cysjU", 942 | "colab_type": "text" 943 | }, 944 | "source": [ 945 | "***testing lopp***" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | 
"metadata": { 951 | "id": "QYbcOfizywFG", 952 | "colab_type": "code", 953 | "colab": {} 954 | }, 955 | "source": [ 956 | "y_actu=[]\n", 957 | "y_pred=[]\n", 958 | "for every_test_list in test_list:\n", 959 | " label1=every_test_list['label']\n", 960 | " label1=torch.tensor([label1])\n", 961 | " sprectrome=every_test_list['sprectrome']\n", 962 | " with torch.no_grad():\n", 963 | " if(sprectrome.shape[2]>65):\n", 964 | " #sprectrome = sprectrome.to('cuda')\n", 965 | " #label1=label1.to('cuda')\n", 966 | " output = model(sprectrome)\n", 967 | " _, preds = torch.max(output, 1)\n", 968 | " y_actu.append(label1.numpy()[0])\n", 969 | " y_pred.append(preds.numpy()[0])" 970 | ], 971 | "execution_count": 0, 972 | "outputs": [] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": { 977 | "id": "kNAdE3fQ3z8v", 978 | "colab_type": "text" 979 | }, 980 | "source": [ 981 | "***confusionMatrix***" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "metadata": { 987 | "id": "ueItzxt432pq", 988 | "colab_type": "code", 989 | "colab": {} 990 | }, 991 | "source": [ 992 | "from sklearn.metrics import confusion_matrix\n", 993 | "confusion_matrix(y_actu, y_pred)" 994 | ], 995 | "execution_count": 0, 996 | "outputs": [] 997 | } 998 | ] 999 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Audio-and-text-based-emotion-recognition 2 | A multimodal approach to emotion recognition using audio and text. 
3 | 4 | A PyTorch implementation of the papers 5 | - Attention Based Fully Convolutional Network for Speech Emotion Recognition (https://arxiv.org/pdf/1806.01506v2.pdf) 6 | - Multimodal Speech Emotion Recognition using Audio and Text (https://arxiv.org/pdf/1810.04635.pdf) 7 | - Emotion Recognition from Speech (https://arxiv.org/pdf/1912.10458.pdf) 8 | 9 | ![](./img/Multimodal_1.png) 10 | # Objective 11 | 12 | This model is used to recognize emotion from variable-length audio inputs and their transcripts. 13 | 14 | # Datasets 15 | 16 | We used the IEMOCAP dataset for the project. It can be downloaded from https://sail.usc.edu/iemocap/ 17 | We also omitted one-second audio clips from the dataset. 18 | 19 | # Methodology 20 | 21 | * Audio model 22 | * Text model 23 | * Multimodal approach 24 | 25 | ### Audio model 26 | 27 | * The audio data from the IEMOCAP dataset is used to extract a log spectrogram and build a 3-channel spectrogram. Since AlexNet expects a 3-channel input (a spectrogram itself is a single-channel 2D image), we stacked the delta and delta-delta spectrograms as the additional channels. 28 | * The spectrogram is extracted using the signal module of the scipy library. 29 | * The input data is passed to the AlexNet model. The AlexNet used in this project is a modified AlexNet that accepts variable-length audio signals as input. 30 | 31 | ### Text model 32 | 33 | * The text data (sentences) from the IEMOCAP dataset is used. 34 | * Every extracted sentence is passed to a BERT model and represented as a 768-dimensional vector. 35 | 36 | ### Multimodal approach 37 | 38 | * The separately trained audio model and text model are used here to collect the embeddings. 39 | * The embeddings are concatenated and fed to the classification layer. 40 | * Only the classification layer is trained (minimal code sketches of all three components are appended at the end of this document). 41 | -------------------------------------------------------------------------------- /img/1.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /img/Multimodal_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aris-ai/Audio-and-text-based-emotion-recognition/3135c9bbab93887556295a50bffaf83ec70aecd3/img/Multimodal_1.png --------------------------------------------------------------------------------
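
The minimal code sketches referenced in the README's Methodology section follow. First, the audio pipeline: a compact inference sketch of the "Audio model" steps, assuming the `log_specgram`, `get_3d_spec`, and `ModifiedAlexNet` definitions from GitAudioEmotion.ipynb above are in scope. `'sample.wav'` is a placeholder path, and the trained weights are assumed to come from the notebook's training loop.

```python
import numpy as np
import torch
from scipy.io import wavfile

# wav file -> log spectrogram -> 3-channel (base/delta/delta-delta) tensor -> class logits
samplerate, sound = wavfile.read('sample.wav', mmap=True)   # placeholder path
_, spec = log_specgram(sound, samplerate)                   # (time, freq) log spectrogram
spec3d = get_3d_spec(spec)                                  # (time, freq, 3)
batch = torch.tensor(np.transpose(spec3d, (2, 0, 1))).unsqueeze(0)  # (1, 3, time, freq)

model = ModifiedAlexNet(num_classes=4).eval()               # weights assumed, e.g. via torch.load
with torch.no_grad():
    logits = model(batch)                                   # (1, 4) class scores
    pred = logits.argmax(dim=1).item()
```

Note that the notebook only trains and evaluates on clips whose spectrogram has more than 65 time frames, so the same filter would apply to inputs here.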
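
GitTextOnlyClassification.ipynb is not reproduced in this dump, so the following is only a rough illustration of the "Text model" bullet: one common way to turn a sentence into a 768-dimensional vector with the transformers library. The `bert-base-uncased` checkpoint and the use of the [CLS] hidden state are assumptions, not necessarily what that notebook does.

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased').eval()

# Encode one IEMOCAP transcript (example sentence) and take the [CLS] vector.
inputs = tokenizer("I really didn't expect that at all.", return_tensors='pt')
with torch.no_grad():
    hidden = bert(**inputs)[0]     # (1, seq_len, 768) last hidden states
sentence_vec = hidden[:, 0, :]     # (1, 768) sentence representation
```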
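
Text_and_Audio_Emotion.ipynb is likewise not included above. This last sketch only illustrates the fusion idea from "Multimodal approach": concatenating the frozen audio and text embeddings and training a single classification layer. The dimensions (256 for the sum-pooled ModifiedAlexNet features, 768 for BERT) follow the models above; the class name and everything else here are hypothetical.

```python
import torch
import torch.nn as nn

class FusionClassifier(nn.Module):
    """Concatenates a 256-d audio embedding with a 768-d text embedding;
    only this linear layer is trained, while the two encoders stay frozen."""
    def __init__(self, audio_dim=256, text_dim=768, num_classes=4):
        super().__init__()
        self.classifier = nn.Linear(audio_dim + text_dim, num_classes)

    def forward(self, audio_emb, text_emb):
        fused = torch.cat([audio_emb, text_emb], dim=1)  # (batch, 1024)
        return self.classifier(fused)

fusion = FusionClassifier()
logits = fusion(torch.randn(2, 256), torch.randn(2, 768))  # (2, 4) logits
# An optimizer would receive only fusion.parameters(), matching the
# README's note that only the classification layer is trained.
```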