├── MPP_Code
│   ├── README.md
│   ├── models
│   │   ├── emotion_classification_model.py
│   │   └── emotion_regression_model.py
│   └── training
│       ├── execute_classification_explicit.py
│       ├── execute_classification_implicit.py
│       ├── execute_regression.py
│       ├── execute_sarcasm_mustard++.py
│       └── execute_sarcasm_mustard.py
├── README.md
└── mustard++_text.csv

/MPP_Code/README.md:
--------------------------------------------------------------------------------
 1 | # MUStARD++ Code
 2 | 
 3 | 
 4 | This folder contains the code accompanying the submission 'A Multimodal Corpus for Emotion Recognition in Sarcasm'.
 5 | 
 6 | The folder _models_ contains the classification models for sarcasm detection and emotion recognition.
 7 | 
 8 | The folder _training_ contains the scripts for training the multimodal sarcasm detection and emotion recognition models on the dataset.
 9 | 
10 | Features can be extracted from the videos (a Google Drive link to the videos is provided on the main page).
11 | 
--------------------------------------------------------------------------------
/MPP_Code/models/emotion_classification_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import numpy as np
 4 | import random
 5 | 
 6 | '''Comment these out to randomize the model training'''
 7 | np.random.seed(42)
 8 | random.seed(42)
 9 | torch.manual_seed(42)
10 | torch.backends.cudnn.deterministic = True
11 | 
12 | """This file contains all the models we have used for:
13 |     1. Multiclass Implicit Emotion Classification
14 |     2. Multiclass Explicit Emotion Classification
15 |     3. Multimodal Sarcasm Detection
16 |        3.1 for MUStARD
17 |        3.2 for MUStARD++
18 | 
19 | 
20 | Class and variable names are self-explanatory.
21 | Before assigning the input embeddings, we first sort the modality names in descending order
22 | (VTA) in order to remove randomness from the model.
23 | 
24 | Parameters:
25 |     input_embedding_A:
26 |         Input dimension of the first modality
27 |     input_embedding_B:
28 |         Input dimension of the second modality
29 |     input_embedding_C:
30 |         Input dimension of the third modality
31 |     shared_embedding:
32 |         Dimension to which every modality is projected, so that each modality contributes an input of equal size
33 |     projection_embedding:
34 |         Intermediate dimension to which the shared embedding is projected when computing attention
35 |     dropout:
36 |         Dropout probability (to be hyper-tuned)
37 | 
38 | 
39 | We assign the "num_classes" variable depending on the task,
40 | for example:
41 |     a. num_classes=5 (Multiclass Implicit Emotion Classification)
42 |     b. num_classes=9 (Multiclass Explicit Emotion Classification)
43 |     c. 
num_classes=2 (Multimodal Sarcasm Detection) 44 | 45 | Output Layer = Softmax Layer 46 | """ 47 | 48 | audio_embedding_size = 291 #Modify the embedding sizes based on audio feature extraction chosen:Ex: 314, 319 49 | class Speaker_Independent_Triple_Mode_with_Context(nn.Module): 50 | def __init__(self, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=audio_embedding_size, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 51 | super(Speaker_Independent_Triple_Mode_with_Context, self).__init__() 52 | 53 | self.input_embedding_A = input_embedding_A 54 | self.input_embedding_B = input_embedding_B 55 | self.input_embedding_C = input_embedding_C 56 | self.shared_embedding = shared_embedding 57 | self.projection_embedding = projection_embedding 58 | self.num_classes = num_classes 59 | self.dropout = dropout 60 | 61 | self.A_context_share = nn.Linear( 62 | self.input_embedding_A, self.shared_embedding) 63 | self.A_utterance_share = nn.Linear( 64 | self.input_embedding_A, self.shared_embedding) 65 | 66 | self.C_context_share = nn.Linear( 67 | self.input_embedding_C, self.shared_embedding) 68 | self.C_utterance_share = nn.Linear( 69 | self.input_embedding_C, self.shared_embedding) 70 | 71 | self.B_context_share = nn.Linear( 72 | self.input_embedding_B, self.shared_embedding) 73 | self.B_utterance_share = nn.Linear( 74 | self.input_embedding_B, self.shared_embedding) 75 | 76 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 77 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 78 | 79 | self.norm_C_context = nn.BatchNorm1d(self.shared_embedding) 80 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 81 | 82 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 83 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 84 | 85 | self.collabrative_gate_1 = nn.Linear( 86 | 2*self.shared_embedding, self.projection_embedding) 87 | self.collabrative_gate_2 = nn.Linear( 88 | self.projection_embedding, self.shared_embedding) 89 | 90 | self.pred_module = nn.Sequential( 91 | nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 92 | nn.BatchNorm1d(2*self.shared_embedding), 93 | nn.ReLU(), 94 | nn.Dropout(dropout), 95 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 96 | nn.BatchNorm1d(self.shared_embedding), 97 | nn.ReLU(), 98 | nn.Dropout(dropout), 99 | nn.Linear(self.shared_embedding, 512), 100 | nn.BatchNorm1d(512), 101 | nn.ReLU(), 102 | nn.Dropout(dropout), 103 | nn.Linear(512, 128), 104 | nn.BatchNorm1d(128), 105 | nn.ReLU(), 106 | nn.Dropout(dropout), 107 | nn.Linear(128, self.num_classes) 108 | ) 109 | 110 | def attention(self, featureA, featureB): 111 | """ This method takes two features and calculates the attention """ 112 | input = torch.cat((featureA, featureB), dim=1) 113 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 114 | 115 | def attention_aggregator(self, feA, feB, feC, feD, feE, feF): 116 | """ This method calculates the attention for feA with respect to others""" 117 | input = self.attention(feA, feB) + \ 118 | self.attention(feA, feC) + \ 119 | self.attention(feA, feD) + \ 120 | self.attention(feA, feE) + \ 121 | self.attention(feA, feF) 122 | # here we call for pairwise attention 123 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 124 | 125 | def forward(self, uA, cA, uB, cB, uC, cC): 126 | """Args: 127 | uA: 128 | Utterance Video 129 | uB: 130 | Utterance Text 131 | uC: 132 | Utterance Audio 133 | cA: 134 | Context Video 135 | 
cB: 136 | Context Text 137 | cC: 138 | Context Audio 139 | 140 | Returns: 141 | probability of emotion classes 142 | ( 143 | Since we have used Cross-entropy as loss function, 144 | Therefore we have not used softmax here because Cross-entropy perform Softmax while calculating loss 145 | While evaluation we have to perform softmax explicitly 146 | ) 147 | """ 148 | """ Feature Projection, in order to make all feature of same dimension""" 149 | 150 | shared_A_context = self.norm_A_context( 151 | nn.functional.relu(self.A_context_share(cA))) 152 | shared_A_utterance = self.norm_A_utterance( 153 | nn.functional.relu(self.A_utterance_share(uA))) 154 | 155 | shared_C_context = self.norm_C_context( 156 | nn.functional.relu(self.C_context_share(cC))) 157 | shared_C_utterance = self.norm_C_utterance( 158 | nn.functional.relu(self.C_utterance_share(uC))) 159 | 160 | shared_B_context = self.norm_B_context( 161 | nn.functional.relu(self.B_context_share(cB))) 162 | shared_B_utterance = self.norm_B_utterance( 163 | nn.functional.relu(self.B_utterance_share(uB))) 164 | 165 | # Feature Modulation 166 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 167 | shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance) 168 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 169 | shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance) 170 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 171 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance) 172 | 173 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 174 | input = torch.cat((temp, updated_shared_B), dim=1) 175 | 176 | return self.pred_module(input) 177 | 178 | ################################################################################################################################################################################################################ 179 | 180 | 181 | class Speaker_Independent_Dual_Mode_with_Context(nn.Module): 182 | def __init__(self, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 183 | super(Speaker_Independent_Dual_Mode_with_Context, self).__init__() 184 | 185 | self.input_embedding_A = input_embedding_A 186 | self.input_embedding_B = input_embedding_B 187 | self.shared_embedding = shared_embedding 188 | self.projection_embedding = projection_embedding 189 | self.num_classes = num_classes 190 | self.dropout = dropout 191 | 192 | self.A_context_share = nn.Linear( 193 | self.input_embedding_A, self.shared_embedding) 194 | self.A_utterance_share = nn.Linear( 195 | self.input_embedding_A, self.shared_embedding) 196 | 197 | self.B_context_share = nn.Linear( 198 | self.input_embedding_B, self.shared_embedding) 199 | self.B_utterance_share = nn.Linear( 200 | self.input_embedding_B, self.shared_embedding) 201 | 202 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 203 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 204 | 205 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 206 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 207 | 208 | self.collabrative_gate_1 = nn.Linear( 209 | 2*self.shared_embedding, self.projection_embedding) 210 | self.collabrative_gate_2 = nn.Linear( 211 | self.projection_embedding, self.shared_embedding) 212 | 213 | self.pred_module = 
nn.Sequential( 214 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 215 | nn.BatchNorm1d(self.shared_embedding), 216 | nn.ReLU(), 217 | nn.Dropout(dropout), 218 | nn.Linear(self.shared_embedding, 512), 219 | nn.BatchNorm1d(512), 220 | nn.ReLU(), 221 | nn.Dropout(dropout), 222 | nn.Linear(512, 128), 223 | nn.BatchNorm1d(128), 224 | nn.ReLU(), 225 | nn.Dropout(dropout), 226 | nn.Linear(128, self.num_classes) 227 | ) 228 | 229 | def attention(self, featureA, featureB): 230 | """ This method takes two features and calculates the attention """ 231 | input = torch.cat((featureA, featureB), dim=1) 232 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 233 | 234 | def attention_aggregator(self, feA, feB, feC, feD): 235 | """ This method calculates the attention for feA with respect to others""" 236 | input = self.attention(feA, feB) + \ 237 | self.attention(feA, feC) + \ 238 | self.attention(feA, feD) 239 | # here we call for pairwise attention 240 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 241 | 242 | def forward(self, uA, cA, uB, cB): 243 | """making Feature Projection in order to make all feature of same dimension""" 244 | 245 | shared_A_context = self.norm_A_context( 246 | nn.functional.relu(self.A_context_share(cA))) 247 | shared_A_utterance = self.norm_A_utterance( 248 | nn.functional.relu(self.A_utterance_share(uA))) 249 | 250 | shared_B_context = self.norm_B_context( 251 | nn.functional.relu(self.B_context_share(cB))) 252 | shared_B_utterance = self.norm_B_utterance( 253 | nn.functional.relu(self.B_utterance_share(uB))) 254 | 255 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 256 | shared_A_utterance, shared_A_context, shared_B_context, shared_B_utterance) 257 | 258 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 259 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance) 260 | 261 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 262 | 263 | return self.pred_module(input) 264 | 265 | ################################################################################################################################################################################################################ 266 | 267 | 268 | class Speaker_Independent_Single_Mode_with_Context(nn.Module): 269 | def __init__(self, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 270 | super(Speaker_Independent_Single_Mode_with_Context, self).__init__() 271 | 272 | self.input_embedding = input_embedding_A 273 | 274 | self.shared_embedding = shared_embedding 275 | self.projection_embedding = projection_embedding 276 | self.num_classes = num_classes 277 | self.dropout = dropout 278 | 279 | self.context_share = nn.Linear( 280 | self.input_embedding, self.shared_embedding) 281 | self.utterance_share = nn.Linear( 282 | self.input_embedding, self.shared_embedding) 283 | 284 | self.norm_context = nn.BatchNorm1d(self.shared_embedding) 285 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 286 | 287 | self.collabrative_gate_1 = nn.Linear( 288 | 2*self.shared_embedding, self.projection_embedding) 289 | self.collabrative_gate_2 = nn.Linear( 290 | self.projection_embedding, self.shared_embedding) 291 | 292 | self.pred_module = nn.Sequential( 293 | nn.Linear(self.shared_embedding, 512), 294 | nn.BatchNorm1d(512), 295 | nn.ReLU(), 296 | nn.Dropout(dropout), 297 | nn.Linear(512, 128), 298 | nn.BatchNorm1d(128), 299 | nn.ReLU(), 300 | 
nn.Dropout(dropout), 301 | nn.Linear(128, self.num_classes) 302 | ) 303 | 304 | def attention(self, featureA, featureB): 305 | """ This method takes two features and calculates the attention """ 306 | input = torch.cat((featureA, featureB), dim=1) 307 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 308 | 309 | def attention_aggregator(self, feA, feB): 310 | """ This method calculates the attention for feA with respect to others""" 311 | input = self.attention(feA, feB) 312 | # here we call for pairwise attention 313 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 314 | 315 | def forward(self, uA, cA): 316 | """ Feature Projection, in order to make all feature of same dimension""" 317 | 318 | shared_context = self.norm_context( 319 | nn.functional.relu(self.context_share(cA))) 320 | shared_utterance = self.norm_utterance( 321 | nn.functional.relu(self.utterance_share(uA))) 322 | 323 | updated_shared = shared_utterance * self.attention_aggregator( 324 | shared_utterance, shared_context) 325 | 326 | input = updated_shared 327 | 328 | return self.pred_module(updated_shared) 329 | 330 | ################################################################################################################################################################################################################ 331 | 332 | 333 | class Speaker_Independent_Triple_Mode_without_Context(nn.Module): 334 | def __init__(self, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=audio_embedding_size, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 335 | super(Speaker_Independent_Triple_Mode_without_Context, self).__init__() 336 | 337 | self.input_embedding_A = input_embedding_A 338 | self.input_embedding_B = input_embedding_B 339 | self.input_embedding_C = input_embedding_C 340 | self.shared_embedding = shared_embedding 341 | self.projection_embedding = projection_embedding 342 | self.num_classes = num_classes 343 | self.dropout = dropout 344 | 345 | self.A_utterance_share = nn.Linear( 346 | self.input_embedding_A, self.shared_embedding) 347 | 348 | self.C_utterance_share = nn.Linear( 349 | self.input_embedding_C, self.shared_embedding) 350 | 351 | self.B_utterance_share = nn.Linear( 352 | self.input_embedding_B, self.shared_embedding) 353 | 354 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 355 | 356 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 357 | 358 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 359 | 360 | self.collabrative_gate_1 = nn.Linear( 361 | 2*self.shared_embedding, self.projection_embedding) 362 | self.collabrative_gate_2 = nn.Linear( 363 | self.projection_embedding, self.shared_embedding) 364 | 365 | self.pred_module = nn.Sequential( 366 | nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 367 | nn.BatchNorm1d(2*self.shared_embedding), 368 | nn.ReLU(), 369 | nn.Dropout(dropout), 370 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 371 | nn.BatchNorm1d(self.shared_embedding), 372 | nn.ReLU(), 373 | nn.Dropout(dropout), 374 | nn.Linear(self.shared_embedding, 512), 375 | nn.BatchNorm1d(512), 376 | nn.ReLU(), 377 | nn.Dropout(dropout), 378 | nn.Linear(512, 128), 379 | nn.BatchNorm1d(128), 380 | nn.ReLU(), 381 | nn.Dropout(dropout), 382 | nn.Linear(128, self.num_classes) 383 | ) 384 | 385 | def attention(self, featureA, featureB): 386 | """ This method takes two features and calcuates the attention """ 387 | input = torch.cat((featureA, featureB), 
dim=1) 388 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 389 | 390 | def attention_aggregator(self, feA, feB, feC): 391 | """ This method calculates the attention for feA with respect to others""" 392 | input = self.attention(feA, feB) + self.attention(feA, feC) 393 | # here we call for pairwise attention 394 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 395 | 396 | def forward(self, uA, uB, uC): 397 | """making Feature Projection in order to make all feature of same dimension""" 398 | 399 | shared_A_utterance = self.norm_A_utterance( 400 | nn.functional.relu(self.A_utterance_share(uA))) 401 | 402 | shared_C_utterance = self.norm_C_utterance( 403 | nn.functional.relu(self.C_utterance_share(uC))) 404 | 405 | shared_B_utterance = self.norm_B_utterance( 406 | nn.functional.relu(self.B_utterance_share(uB))) 407 | 408 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 409 | shared_A_utterance, shared_C_utterance, shared_B_utterance) 410 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 411 | shared_C_utterance, shared_A_utterance, shared_B_utterance) 412 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 413 | shared_B_utterance, shared_A_utterance, shared_C_utterance) 414 | 415 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 416 | input = torch.cat((temp, updated_shared_B), dim=1) 417 | 418 | return self.pred_module(input) 419 | 420 | ################################################################################################################################################################################################################ 421 | 422 | 423 | class Speaker_Independent_Dual_Mode_without_Context(nn.Module): 424 | def __init__(self, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 425 | super(Speaker_Independent_Dual_Mode_without_Context, self).__init__() 426 | 427 | self.input_embedding_A = input_embedding_A 428 | self.input_embedding_B = input_embedding_B 429 | self.shared_embedding = shared_embedding 430 | self.projection_embedding = projection_embedding 431 | self.num_classes = num_classes 432 | self.dropout = dropout 433 | 434 | self.A_utterance_share = nn.Linear( 435 | self.input_embedding_A, self.shared_embedding) 436 | 437 | self.B_utterance_share = nn.Linear( 438 | self.input_embedding_B, self.shared_embedding) 439 | 440 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 441 | 442 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 443 | 444 | self.collabrative_gate_1 = nn.Linear( 445 | 2*self.shared_embedding, self.projection_embedding) 446 | self.collabrative_gate_2 = nn.Linear( 447 | self.projection_embedding, self.shared_embedding) 448 | 449 | self.pred_module = nn.Sequential( 450 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 451 | nn.BatchNorm1d(self.shared_embedding), 452 | nn.ReLU(), 453 | nn.Dropout(dropout), 454 | nn.Linear(self.shared_embedding, 512), 455 | nn.BatchNorm1d(512), 456 | nn.ReLU(), 457 | nn.Dropout(dropout), 458 | nn.Linear(512, 128), 459 | nn.BatchNorm1d(128), 460 | nn.ReLU(), 461 | nn.Dropout(dropout), 462 | nn.Linear(128, self.num_classes) 463 | ) 464 | 465 | def attention(self, featureA, featureB): 466 | """ This method takes two features and caluate the attention """ 467 | input = torch.cat((featureA, featureB), dim=1) 468 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 469 | 470 | def 
attention_aggregator(self, feA, feB): 471 | """ This method caluates the attention for feA with respect to others""" 472 | input = self.attention(feA, feB) 473 | # here we call for pairwise attention 474 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 475 | 476 | def forward(self, uA, uB): 477 | """making Feature Projection in order to make all feature of same dimension""" 478 | 479 | shared_A_utterance = self.norm_A_utterance( 480 | nn.functional.relu(self.A_utterance_share(uA))) 481 | 482 | shared_B_utterance = self.norm_B_utterance( 483 | nn.functional.relu(self.B_utterance_share(uB))) 484 | 485 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 486 | shared_A_utterance, shared_B_utterance) 487 | 488 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 489 | shared_B_utterance, shared_A_utterance) 490 | 491 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 492 | 493 | return self.pred_module(input) 494 | 495 | ################################################################################################################################################################################################################ 496 | 497 | 498 | class Speaker_Independent_Single_Mode_without_Context(nn.Module): 499 | def __init__(self, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.2, num_classes=5): 500 | super(Speaker_Independent_Single_Mode_without_Context, self).__init__() 501 | print("No. of classes:",num_classes) 502 | self.input_embedding = input_embedding_A 503 | 504 | self.shared_embedding = shared_embedding 505 | self.projection_embedding = projection_embedding 506 | self.num_classes = num_classes 507 | self.dropout = dropout 508 | 509 | self.utterance_share = nn.Linear( 510 | self.input_embedding, self.shared_embedding) 511 | 512 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 513 | 514 | self.collabrative_gate_1 = nn.Linear( 515 | 2*self.shared_embedding, self.projection_embedding) 516 | self.collabrative_gate_2 = nn.Linear( 517 | self.projection_embedding, self.shared_embedding) 518 | 519 | self.pred_module = nn.Sequential( 520 | # nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 521 | # nn.BatchNorm1d(2*self.shared_embedding), 522 | # nn.ReLU(), 523 | # nn.Linear(2*self.shared_embedding, self.shared_embedding), 524 | # nn.BatchNorm1d(self.shared_embedding), 525 | # nn.ReLU(), 526 | nn.Linear(self.shared_embedding, 512), 527 | nn.BatchNorm1d(512), 528 | nn.ReLU(), 529 | nn.Dropout(dropout), 530 | nn.Linear(512, 128), 531 | nn.BatchNorm1d(128), 532 | nn.ReLU(), 533 | nn.Dropout(dropout), 534 | nn.Linear(128, self.num_classes) 535 | ) 536 | 537 | def attention(self, featureA, featureB): 538 | """ This method takes two features and calcuates the attention """ 539 | input = torch.cat((featureA, featureB), dim=1) 540 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 541 | 542 | def attention_aggregator(self, feA, feB): 543 | """ This method calculates the attention for feA with respect to others""" 544 | input = self.attention(feA, feB) 545 | # here we call for pairwise attention 546 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 547 | 548 | def forward(self, uA): 549 | """making Feature Projection in order to make all feature of same dimension""" 550 | 551 | shared_utterance = self.norm_utterance( 552 | nn.functional.relu(self.utterance_share(uA))) 553 | 554 | updated_shared = shared_utterance * 
self.attention_aggregator( 555 | shared_utterance, shared_utterance) 556 | 557 | input = updated_shared 558 | 559 | return self.pred_module(updated_shared) 560 | ################################################################################################################################################################################################################ 561 | 562 | 563 | class Speaker_Dependent_Triple_Mode_with_Context(nn.Module): 564 | def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=audio_embedding_size, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 565 | super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__() 566 | 567 | self.n_speaker = n_speaker 568 | 569 | self.input_embedding_A = input_embedding_A 570 | self.input_embedding_B = input_embedding_B 571 | self.input_embedding_C = input_embedding_C 572 | 573 | self.shared_embedding = shared_embedding 574 | self.projection_embedding = projection_embedding 575 | self.num_classes = num_classes 576 | self.dropout = dropout 577 | 578 | self.A_context_share = nn.Linear( 579 | self.input_embedding_A, self.shared_embedding) 580 | self.A_utterance_share = nn.Linear( 581 | self.input_embedding_A, self.shared_embedding) 582 | 583 | self.C_context_share = nn.Linear( 584 | self.input_embedding_C, self.shared_embedding) 585 | self.C_utterance_share = nn.Linear( 586 | self.input_embedding_C, self.shared_embedding) 587 | 588 | self.B_context_share = nn.Linear( 589 | self.input_embedding_B, self.shared_embedding) 590 | self.B_utterance_share = nn.Linear( 591 | self.input_embedding_B, self.shared_embedding) 592 | 593 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 594 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 595 | 596 | self.norm_C_context = nn.BatchNorm1d(self.shared_embedding) 597 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 598 | 599 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 600 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 601 | 602 | self.collabrative_gate_1 = nn.Linear( 603 | 2*self.shared_embedding, self.projection_embedding) 604 | self.collabrative_gate_2 = nn.Linear( 605 | self.projection_embedding, self.shared_embedding) 606 | 607 | self.pred_module = nn.Sequential( 608 | nn.Linear(self.n_speaker+3*self.shared_embedding, 609 | 2*self.shared_embedding), 610 | nn.BatchNorm1d(2*self.shared_embedding), 611 | nn.ReLU(), 612 | nn.Dropout(dropout), 613 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 614 | nn.BatchNorm1d(self.shared_embedding), 615 | nn.ReLU(), 616 | nn.Dropout(dropout), 617 | nn.Linear(self.shared_embedding, 512), 618 | nn.BatchNorm1d(512), 619 | nn.ReLU(), 620 | nn.Dropout(dropout), 621 | nn.Linear(512, 128), 622 | nn.BatchNorm1d(128), 623 | nn.ReLU(), 624 | nn.Dropout(dropout), 625 | nn.Linear(128, self.num_classes) 626 | 627 | ) 628 | 629 | def attention(self, featureA, featureB): 630 | """ This method takes two features and calculates the attention """ 631 | input = torch.cat((featureA, featureB), dim=1) 632 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 633 | 634 | def attention_aggregator(self, feA, feB, feC, feD, feE, feF): 635 | """ This method calculates the attention for feA with respect to others""" 636 | input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention( 637 | feA, feD) + self.attention(feA, feE) + self.attention(feA, feF) 638 | # here we call for pairwise 
attention 639 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 640 | 641 | def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding): 642 | """Args: 643 | uA: 644 | Utterance Video 645 | uB: 646 | Utterance Text 647 | uC: 648 | Utterance Audio 649 | cA: 650 | Context Video 651 | cB: 652 | Context Text 653 | cC: 654 | Context Audio 655 | 656 | Returns: 657 | probability of emotion classes 658 | ( 659 | Since we have used Crossentropy as loss function, 660 | Therefore we have not used softmax here because Crossentropy perform Softmax while calculating loss 661 | While evaluation we have to perform softmax explicitly 662 | ) 663 | """ 664 | """making Feature Projection in order to make all feature of same dimension""" 665 | 666 | shared_A_context = self.norm_A_context( 667 | nn.functional.relu(self.A_context_share(cA))) 668 | shared_A_utterance = self.norm_A_utterance( 669 | nn.functional.relu(self.A_utterance_share(uA))) 670 | 671 | shared_C_context = self.norm_C_context( 672 | nn.functional.relu(self.C_context_share(cC))) 673 | shared_C_utterance = self.norm_C_utterance( 674 | nn.functional.relu(self.C_utterance_share(uC))) 675 | 676 | shared_B_context = self.norm_B_context( 677 | nn.functional.relu(self.B_context_share(cB))) 678 | shared_B_utterance = self.norm_B_utterance( 679 | nn.functional.relu(self.B_utterance_share(uB))) 680 | 681 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 682 | shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance) 683 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 684 | shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance) 685 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 686 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance) 687 | 688 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 689 | input = torch.cat((temp, updated_shared_B), dim=1) 690 | 691 | input = torch.cat((input, speaker_embedding), dim=1) 692 | 693 | return self.pred_module(input) 694 | 695 | ################################################################################################################################################################################################################ 696 | 697 | 698 | class Speaker_Dependent_Dual_Mode_with_Context(nn.Module): 699 | def __init__(self, n_speaker=24, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 700 | super(Speaker_Dependent_Dual_Mode_with_Context, self).__init__() 701 | 702 | self.n_speaker = n_speaker 703 | 704 | self.input_embedding_A = input_embedding_A 705 | self.input_embedding_B = input_embedding_B 706 | self.shared_embedding = shared_embedding 707 | self.projection_embedding = projection_embedding 708 | self.num_classes = num_classes 709 | self.dropout = dropout 710 | 711 | self.A_context_share = nn.Linear( 712 | self.input_embedding_A, self.shared_embedding) 713 | self.A_utterance_share = nn.Linear( 714 | self.input_embedding_A, self.shared_embedding) 715 | 716 | self.B_context_share = nn.Linear( 717 | self.input_embedding_B, self.shared_embedding) 718 | self.B_utterance_share = nn.Linear( 719 | self.input_embedding_B, self.shared_embedding) 720 | 721 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 722 | self.norm_A_utterance = 
nn.BatchNorm1d(self.shared_embedding) 723 | 724 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 725 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 726 | 727 | self.collabrative_gate_1 = nn.Linear( 728 | 2*self.shared_embedding, self.projection_embedding) 729 | self.collabrative_gate_2 = nn.Linear( 730 | self.projection_embedding, self.shared_embedding) 731 | 732 | self.pred_module = nn.Sequential( 733 | nn.Linear(self.n_speaker+2*self.shared_embedding, 734 | self.shared_embedding), 735 | nn.BatchNorm1d(self.shared_embedding), 736 | nn.ReLU(), 737 | nn.Dropout(dropout), 738 | nn.Linear(self.shared_embedding, 512), 739 | nn.BatchNorm1d(512), 740 | nn.ReLU(), 741 | nn.Dropout(dropout), 742 | nn.Linear(512, 128), 743 | nn.BatchNorm1d(128), 744 | nn.ReLU(), 745 | nn.Dropout(dropout), 746 | nn.Linear(128, self.num_classes) 747 | ) 748 | 749 | def attention(self, featureA, featureB): 750 | """ This method takes two features and caluate the attention """ 751 | input = torch.cat((featureA, featureB), dim=1) 752 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 753 | 754 | def attention_aggregator(self, feA, feB, feC, feD): 755 | """ This method caluates the attention for feA with respect to others""" 756 | input = self.attention(feA, feB) + self.attention(feA, 757 | feC) + self.attention(feA, feD) 758 | # here we call for pairwise attention 759 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 760 | 761 | def forward(self, uA, cA, uB, cB, speaker_embedding): 762 | """making Feature Projection in order to make all feature of same dimension""" 763 | 764 | shared_A_context = self.norm_A_context( 765 | nn.functional.relu(self.A_context_share(cA))) 766 | shared_A_utterance = self.norm_A_utterance( 767 | nn.functional.relu(self.A_utterance_share(uA))) 768 | 769 | shared_B_context = self.norm_B_context( 770 | nn.functional.relu(self.B_context_share(cB))) 771 | shared_B_utterance = self.norm_B_utterance( 772 | nn.functional.relu(self.B_utterance_share(uB))) 773 | 774 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 775 | shared_A_utterance, shared_A_context, shared_B_context, shared_B_utterance) 776 | 777 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 778 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance) 779 | 780 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 781 | 782 | input = torch.cat((input, speaker_embedding), dim=1) 783 | 784 | return self.pred_module(input) 785 | 786 | ################################################################################################################################################################################################################ 787 | 788 | 789 | class Speaker_Dependent_Single_Mode_with_Context(nn.Module): 790 | def __init__(self, n_speaker=24, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 791 | super(Speaker_Dependent_Single_Mode_with_Context, self).__init__() 792 | 793 | self.n_speaker = n_speaker 794 | 795 | self.input_embedding = input_embedding_A 796 | 797 | self.shared_embedding = shared_embedding 798 | self.projection_embedding = projection_embedding 799 | self.num_classes = num_classes 800 | self.dropout = dropout 801 | 802 | self.context_share = nn.Linear( 803 | self.input_embedding, self.shared_embedding) 804 | self.utterance_share = nn.Linear( 805 | self.input_embedding, self.shared_embedding) 806 | 807 | self.norm_context = 
nn.BatchNorm1d(self.shared_embedding) 808 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 809 | 810 | self.collabrative_gate_1 = nn.Linear( 811 | 2*self.shared_embedding, self.projection_embedding) 812 | self.collabrative_gate_2 = nn.Linear( 813 | self.projection_embedding, self.shared_embedding) 814 | 815 | self.pred_module = nn.Sequential( 816 | nn.Linear(self.n_speaker+self.shared_embedding, 512), 817 | nn.BatchNorm1d(512), 818 | nn.ReLU(), 819 | nn.Dropout(dropout), 820 | nn.Linear(512, 128), 821 | nn.BatchNorm1d(128), 822 | nn.ReLU(), 823 | nn.Dropout(dropout), 824 | nn.Linear(128, self.num_classes) 825 | ) 826 | 827 | def attention(self, featureA, featureB): 828 | """ This method takes two features and caluate the attention """ 829 | input = torch.cat((featureA, featureB), dim=1) 830 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 831 | 832 | def attention_aggregator(self, feA, feB): 833 | """ This method caluates the attention for feA with respect to others""" 834 | input = self.attention(feA, feB) 835 | # here we call for pairwise attention 836 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 837 | 838 | def forward(self, uA, cA, speaker_embedding): 839 | """making Feature Projection in order to make all feature of same dimension""" 840 | 841 | shared_context = self.norm_context( 842 | nn.functional.relu(self.context_share(cA))) 843 | shared_utterance = self.norm_utterance( 844 | nn.functional.relu(self.utterance_share(uA))) 845 | 846 | updated_shared = shared_utterance * self.attention_aggregator( 847 | shared_utterance, shared_context) 848 | 849 | input = torch.cat((updated_shared, speaker_embedding), dim=1) 850 | 851 | return self.pred_module(input) 852 | 853 | ################################################################################################################################################################################################################ 854 | 855 | 856 | class Speaker_Dependent_Triple_Mode_without_Context(nn.Module): 857 | def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=audio_embedding_size, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 858 | super(Speaker_Dependent_Triple_Mode_without_Context, self).__init__() 859 | 860 | self.n_speaker = n_speaker 861 | 862 | self.input_embedding_A = input_embedding_A 863 | self.input_embedding_B = input_embedding_B 864 | self.input_embedding_C = input_embedding_C 865 | self.shared_embedding = shared_embedding 866 | self.projection_embedding = projection_embedding 867 | self.num_classes = num_classes 868 | self.dropout = dropout 869 | 870 | self.A_utterance_share = nn.Linear( 871 | self.input_embedding_A, self.shared_embedding) 872 | 873 | self.C_utterance_share = nn.Linear( 874 | self.input_embedding_C, self.shared_embedding) 875 | 876 | self.B_utterance_share = nn.Linear( 877 | self.input_embedding_B, self.shared_embedding) 878 | 879 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 880 | 881 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 882 | 883 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 884 | 885 | self.collabrative_gate_1 = nn.Linear( 886 | 2*self.shared_embedding, self.projection_embedding) 887 | self.collabrative_gate_2 = nn.Linear( 888 | self.projection_embedding, self.shared_embedding) 889 | 890 | self.pred_module = nn.Sequential( 891 | nn.Linear(self.n_speaker+3*self.shared_embedding, 892 | 
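                      # note: the classifier input is the speaker vector (size n_speaker,
                      # presumably a one-hot encoding) concatenated with the three fused
                      # modality representations, hence n_speaker + 3*shared_embedding input features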
2*self.shared_embedding), 893 | nn.BatchNorm1d(2*self.shared_embedding), 894 | nn.ReLU(), 895 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 896 | nn.BatchNorm1d(self.shared_embedding), 897 | nn.ReLU(), 898 | nn.Dropout(dropout), 899 | nn.Linear(self.shared_embedding, 512), 900 | nn.BatchNorm1d(512), 901 | nn.ReLU(), 902 | nn.Dropout(dropout), 903 | nn.Linear(512, 128), 904 | nn.BatchNorm1d(128), 905 | nn.ReLU(), 906 | nn.Dropout(dropout), 907 | nn.Linear(128, self.num_classes) 908 | ) 909 | 910 | def attention(self, featureA, featureB): 911 | """ This method takes two features and caluate the attention """ 912 | input = torch.cat((featureA, featureB), dim=1) 913 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 914 | 915 | def attention_aggregator(self, feA, feB, feC): 916 | """ This method caluates the attention for feA with respect to others""" 917 | input = self.attention(feA, feB) + self.attention(feA, feC) 918 | # here we call for pairwise attention 919 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 920 | 921 | def forward(self, uA, uB, uC, speaker_embedding): 922 | """making Feature Projection in order to make all feature of same dimension""" 923 | 924 | shared_A_utterance = self.norm_A_utterance( 925 | nn.functional.relu(self.A_utterance_share(uA))) 926 | 927 | shared_C_utterance = self.norm_C_utterance( 928 | nn.functional.relu(self.C_utterance_share(uC))) 929 | 930 | shared_B_utterance = self.norm_B_utterance( 931 | nn.functional.relu(self.B_utterance_share(uB))) 932 | 933 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 934 | shared_A_utterance, shared_C_utterance, shared_B_utterance) 935 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 936 | shared_C_utterance, shared_A_utterance, shared_B_utterance) 937 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 938 | shared_B_utterance, shared_A_utterance, shared_C_utterance) 939 | 940 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 941 | input = torch.cat((temp, updated_shared_B), dim=1) 942 | 943 | input = torch.cat((input, speaker_embedding), dim=1) 944 | 945 | return self.pred_module(input) 946 | 947 | ################################################################################################################################################################################################################ 948 | 949 | 950 | class Speaker_Dependent_Dual_Mode_without_Context(nn.Module): 951 | def __init__(self, n_speaker=24, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 952 | super(Speaker_Dependent_Dual_Mode_without_Context, self).__init__() 953 | 954 | self.n_speaker = n_speaker 955 | 956 | self.input_embedding_A = input_embedding_A 957 | self.input_embedding_B = input_embedding_B 958 | self.shared_embedding = shared_embedding 959 | self.projection_embedding = projection_embedding 960 | self.num_classes = num_classes 961 | self.dropout = dropout 962 | 963 | self.A_utterance_share = nn.Linear( 964 | self.input_embedding_A, self.shared_embedding) 965 | 966 | self.B_utterance_share = nn.Linear( 967 | self.input_embedding_B, self.shared_embedding) 968 | 969 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 970 | 971 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 972 | 973 | self.collabrative_gate_1 = nn.Linear( 974 | 2*self.shared_embedding, self.projection_embedding) 975 | 
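        # gate_1 scores a concatenated feature pair (2*shared_embedding -> projection_embedding);
        # gate_2 projects the aggregated pairwise attention back to shared_embedding, and its
        # softmax output is used as a multiplicative mask over the utterance features in forward()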
self.collabrative_gate_2 = nn.Linear( 976 | self.projection_embedding, self.shared_embedding) 977 | 978 | self.pred_module = nn.Sequential( 979 | nn.Linear(self.n_speaker+2*self.shared_embedding, 980 | self.shared_embedding), 981 | nn.BatchNorm1d(self.shared_embedding), 982 | nn.ReLU(), 983 | nn.Dropout(dropout), 984 | nn.Linear(self.shared_embedding, 512), 985 | nn.BatchNorm1d(512), 986 | nn.ReLU(), 987 | nn.Dropout(dropout), 988 | nn.Linear(512, 128), 989 | nn.BatchNorm1d(128), 990 | nn.ReLU(), 991 | nn.Dropout(dropout), 992 | nn.Linear(128, self.num_classes) 993 | ) 994 | 995 | def attention(self, featureA, featureB): 996 | """ This method takes two features and caluate the attention """ 997 | input = torch.cat((featureA, featureB), dim=1) 998 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 999 | 1000 | def attention_aggregator(self, feA, feB): 1001 | """ This method caluates the attention for feA with respect to others""" 1002 | input = self.attention(feA, feB) 1003 | # here we call for pairwise attention 1004 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 1005 | 1006 | def forward(self, uA, uB, speaker_embedding): 1007 | """making Feature Projection in order to make all feature of same dimension""" 1008 | 1009 | shared_A_utterance = self.norm_A_utterance( 1010 | nn.functional.relu(self.A_utterance_share(uA))) 1011 | 1012 | shared_B_utterance = self.norm_B_utterance( 1013 | nn.functional.relu(self.B_utterance_share(uB))) 1014 | 1015 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 1016 | shared_A_utterance, shared_B_utterance) 1017 | 1018 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 1019 | shared_B_utterance, shared_A_utterance) 1020 | 1021 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 1022 | 1023 | input = torch.cat((input, speaker_embedding), dim=1) 1024 | 1025 | return self.pred_module(input) 1026 | 1027 | ################################################################################################################################################################################################################ 1028 | 1029 | 1030 | class Speaker_Dependent_Single_Mode_without_Context(nn.Module): 1031 | def __init__(self, n_speaker=24, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5): 1032 | super(Speaker_Dependent_Single_Mode_without_Context, self).__init__() 1033 | 1034 | self.n_speaker = n_speaker 1035 | 1036 | self.input_embedding = input_embedding_A 1037 | 1038 | self.shared_embedding = shared_embedding 1039 | self.projection_embedding = projection_embedding 1040 | self.num_classes = num_classes 1041 | self.dropout = dropout 1042 | 1043 | self.utterance_share = nn.Linear( 1044 | self.input_embedding, self.shared_embedding) 1045 | 1046 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 1047 | 1048 | self.collabrative_gate_1 = nn.Linear( 1049 | 2*self.shared_embedding, self.projection_embedding) 1050 | self.collabrative_gate_2 = nn.Linear( 1051 | self.projection_embedding, self.shared_embedding) 1052 | 1053 | self.pred_module = nn.Sequential( 1054 | nn.Linear(self.n_speaker+self.shared_embedding, 512), 1055 | nn.BatchNorm1d(512), 1056 | nn.ReLU(), 1057 | nn.Dropout(dropout), 1058 | nn.Linear(512, 128), 1059 | nn.BatchNorm1d(128), 1060 | nn.ReLU(), 1061 | nn.Dropout(dropout), 1062 | nn.Linear(128, self.num_classes) 1063 | ) 1064 | 1065 | def attention(self, featureA, featureB): 1066 | """ This method takes two 
features and caluate the attention """ 1067 | input = torch.cat((featureA, featureB), dim=1) 1068 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 1069 | 1070 | def attention_aggregator(self, feA, feB): 1071 | """ This method caluates the attention for feA with respect to others""" 1072 | input = self.attention(feA, feB) 1073 | # here we call for pairwise attention 1074 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 1075 | 1076 | def forward(self, uA, speaker_embedding): 1077 | """making Feature Projection in order to make all feature of same dimension""" 1078 | 1079 | shared_utterance = self.norm_utterance( 1080 | nn.functional.relu(self.utterance_share(uA))) 1081 | 1082 | updated_shared = shared_utterance * self.attention_aggregator( 1083 | shared_utterance, shared_utterance) 1084 | 1085 | input = torch.cat((updated_shared, speaker_embedding), dim=1) 1086 | return self.pred_module(input) 1087 | 1088 | ################################################################################################################################################################################################################ 1089 | -------------------------------------------------------------------------------- /MPP_Code/models/emotion_regression_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import random 5 | 6 | np.random.seed(42) 7 | random.seed(42) 8 | torch.manual_seed(42) 9 | torch.backends.cudnn.deterministic = True 10 | 11 | """This file contains all the models we have used for - 12 | 1. Valence-Arousal Prediction 13 | 14 | Class names and Variable names are self explanatory 15 | Before setting the values to input embedding we first sort the modality name in descending order. 
(VTA) in order to remove randomness in the model 16 | 17 | Parameters: 18 | input_embedding_A: 19 | Takes the input dimension of first modality 20 | input_embedding_B: 21 | Takes the input dimension of second modality 22 | input_embedding_C: 23 | Takes the input dimension of third modality 24 | shared_embedding: 25 | This is the dimension size to which we have to project all modality, to have equal dimension input from each input modality 26 | projection_embedding : 27 | This is the intermediate dimension size to which project our shared embedding to calculate attention 28 | dropout : 29 | Parameter to pass dropout (to be hyper-tuned) 30 | 31 | We have used Tanh As Output layer 32 | 33 | """ 34 | 35 | 36 | class Speaker_Independent_Triple_Mode_with_Context(nn.Module): 37 | def __init__(self, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=291, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 38 | super(Speaker_Independent_Triple_Mode_with_Context, self).__init__() 39 | 40 | self.input_embedding_A = input_embedding_A 41 | self.input_embedding_B = input_embedding_B 42 | self.input_embedding_C = input_embedding_C 43 | self.shared_embedding = shared_embedding 44 | self.projection_embedding = projection_embedding 45 | self.num_classes = num_classes 46 | self.dropout = dropout 47 | 48 | self.A_context_share = nn.Linear( 49 | self.input_embedding_A, self.shared_embedding) 50 | self.A_utterance_share = nn.Linear( 51 | self.input_embedding_A, self.shared_embedding) 52 | 53 | self.C_context_share = nn.Linear( 54 | self.input_embedding_C, self.shared_embedding) 55 | self.C_utterance_share = nn.Linear( 56 | self.input_embedding_C, self.shared_embedding) 57 | 58 | self.B_context_share = nn.Linear( 59 | self.input_embedding_B, self.shared_embedding) 60 | self.B_utterance_share = nn.Linear( 61 | self.input_embedding_B, self.shared_embedding) 62 | 63 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 64 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 65 | 66 | self.norm_C_context = nn.BatchNorm1d(self.shared_embedding) 67 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 68 | 69 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 70 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 71 | 72 | self.collabrative_gate_1 = nn.Linear( 73 | 2*self.shared_embedding, self.projection_embedding) 74 | self.collabrative_gate_2 = nn.Linear( 75 | self.projection_embedding, self.shared_embedding) 76 | 77 | self.pred_module = nn.Sequential( 78 | nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 79 | nn.BatchNorm1d(2*self.shared_embedding), 80 | nn.ReLU(), 81 | nn.Dropout(dropout), 82 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 83 | nn.BatchNorm1d(self.shared_embedding), 84 | nn.ReLU(), 85 | nn.Dropout(dropout), 86 | nn.Linear(self.shared_embedding, 512), 87 | nn.BatchNorm1d(512), 88 | nn.ReLU(), 89 | nn.Dropout(dropout), 90 | nn.Linear(512, 128), 91 | nn.BatchNorm1d(128), 92 | nn.ReLU(), 93 | nn.Dropout(dropout), 94 | nn.Linear(128, self.num_classes), 95 | nn.ReLU() 96 | ) 97 | 98 | def attention(self, featureA, featureB): 99 | """ This method takes two features and calculate the attention """ 100 | 101 | input = torch.cat((featureA, featureB), dim=1) 102 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 103 | 104 | def attention_aggregator(self, feA, feB, feC, feD, feE, feF): 105 | """ This method calculate the attention for feA with respect to others""" 106 | 
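        # sum feA's pairwise attention with each of the other five features, then project the
        # aggregate through collabrative_gate_2 and softmax it to obtain the weights that
        # rescale the corresponding utterance features in forward()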
107 | input = self.attention(feA, feB) + \ 108 | self.attention(feA, feC) + \ 109 | self.attention(feA, feD) + \ 110 | self.attention(feA, feE) + \ 111 | self.attention(feA, feF) 112 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 113 | 114 | def forward(self, uA, cA, uB, cB, uC, cC): 115 | """making Feature Projection in order to make all feature of same dimension""" 116 | 117 | 118 | shared_A_context = self.norm_A_context( 119 | nn.functional.relu(self.A_context_share(cA))) 120 | shared_A_utterance = self.norm_A_utterance( 121 | nn.functional.relu(self.A_utterance_share(uA))) 122 | 123 | shared_C_context = self.norm_C_context( 124 | nn.functional.relu(self.C_context_share(cC))) 125 | shared_C_utterance = self.norm_C_utterance( 126 | nn.functional.relu(self.C_utterance_share(uC))) 127 | 128 | shared_B_context = self.norm_B_context( 129 | nn.functional.relu(self.B_context_share(cB))) 130 | shared_B_utterance = self.norm_B_utterance( 131 | nn.functional.relu(self.B_utterance_share(uB))) 132 | 133 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 134 | shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance) 135 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 136 | shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance) 137 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 138 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance) 139 | 140 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 141 | input = torch.cat((temp, updated_shared_B), dim=1) 142 | 143 | return self.pred_module(input) 144 | 145 | ################################################################################################################################################################################################################ 146 | 147 | 148 | class Speaker_Independent_Dual_Mode_with_Context(nn.Module): 149 | def __init__(self, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 150 | super(Speaker_Independent_Dual_Mode_with_Context, self).__init__() 151 | 152 | self.input_embedding_A = input_embedding_A 153 | self.input_embedding_B = input_embedding_B 154 | self.shared_embedding = shared_embedding 155 | self.projection_embedding = projection_embedding 156 | self.num_classes = num_classes 157 | self.dropout = dropout 158 | 159 | self.A_context_share = nn.Linear( 160 | self.input_embedding_A, self.shared_embedding) 161 | self.A_utterance_share = nn.Linear( 162 | self.input_embedding_A, self.shared_embedding) 163 | 164 | self.B_context_share = nn.Linear( 165 | self.input_embedding_B, self.shared_embedding) 166 | self.B_utterance_share = nn.Linear( 167 | self.input_embedding_B, self.shared_embedding) 168 | 169 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 170 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 171 | 172 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 173 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 174 | 175 | self.collabrative_gate_1 = nn.Linear( 176 | 2*self.shared_embedding, self.projection_embedding) 177 | self.collabrative_gate_2 = nn.Linear( 178 | self.projection_embedding, self.shared_embedding) 179 | 180 | self.pred_module = nn.Sequential( 181 | nn.Linear(2*self.shared_embedding, 
self.shared_embedding), 182 | nn.BatchNorm1d(self.shared_embedding), 183 | nn.ReLU(), 184 | nn.Dropout(dropout), 185 | nn.Linear(self.shared_embedding, 512), 186 | nn.BatchNorm1d(512), 187 | nn.ReLU(), 188 | nn.Dropout(dropout), 189 | nn.Linear(512, 128), 190 | nn.BatchNorm1d(128), 191 | nn.ReLU(), 192 | nn.Dropout(dropout), 193 | nn.Linear(128, self.num_classes), 194 | nn.ReLU() 195 | ) 196 | 197 | def attention(self, featureA, featureB): 198 | """ This method takes two features and caluate the attention """ 199 | 200 | input = torch.cat((featureA, featureB), dim=1) 201 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 202 | 203 | def attention_aggregator(self, feA, feB, feC, feD): 204 | """ This method caluates the attention for feA with respect to others""" 205 | 206 | input = self.attention(feA, feB) + \ 207 | self.attention(feA, feC) + \ 208 | self.attention(feA, feD) 209 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 210 | 211 | def forward(self, uA, cA, uB, cB): 212 | """making Feature Projection in order to make all feature of same dimension""" 213 | 214 | 215 | shared_A_context = self.norm_A_context( 216 | nn.functional.relu(self.A_context_share(cA))) 217 | shared_A_utterance = self.norm_A_utterance( 218 | nn.functional.relu(self.A_utterance_share(uA))) 219 | 220 | shared_B_context = self.norm_B_context( 221 | nn.functional.relu(self.B_context_share(cB))) 222 | shared_B_utterance = self.norm_B_utterance( 223 | nn.functional.relu(self.B_utterance_share(uB))) 224 | 225 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 226 | shared_A_utterance, shared_A_context, shared_B_context, shared_B_utterance) 227 | 228 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 229 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance) 230 | 231 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 232 | 233 | return self.pred_module(input) 234 | 235 | ################################################################################################################################################################################################################ 236 | 237 | 238 | class Speaker_Independent_Single_Mode_with_Context(nn.Module): 239 | def __init__(self, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 240 | super(Speaker_Independent_Single_Mode_with_Context, self).__init__() 241 | 242 | self.input_embedding = input_embedding_A 243 | 244 | self.shared_embedding = shared_embedding 245 | self.projection_embedding = projection_embedding 246 | self.num_classes = num_classes 247 | self.dropout = dropout 248 | 249 | self.context_share = nn.Linear( 250 | self.input_embedding, self.shared_embedding) 251 | self.utterance_share = nn.Linear( 252 | self.input_embedding, self.shared_embedding) 253 | 254 | self.norm_context = nn.BatchNorm1d(self.shared_embedding) 255 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 256 | 257 | self.collabrative_gate_1 = nn.Linear( 258 | 2*self.shared_embedding, self.projection_embedding) 259 | self.collabrative_gate_2 = nn.Linear( 260 | self.projection_embedding, self.shared_embedding) 261 | 262 | self.pred_module = nn.Sequential( 263 | nn.Linear(self.shared_embedding, 512), 264 | nn.BatchNorm1d(512), 265 | nn.ReLU(), 266 | nn.Dropout(dropout), 267 | nn.Linear(512, 128), 268 | nn.BatchNorm1d(128), 269 | nn.ReLU(), 270 | nn.Dropout(dropout), 271 | nn.Linear(128, self.num_classes), 272 | nn.ReLU() 
273 | ) 274 | 275 | def attention(self, featureA, featureB): 276 | """ This method takes two features and calculate the attention """ 277 | 278 | input = torch.cat((featureA, featureB), dim=1) 279 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 280 | 281 | def attention_aggregator(self, feA, feB): 282 | """ This method calculates the attention for feA with respect to others""" 283 | 284 | input = self.attention(feA, feB) 285 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 286 | 287 | def forward(self, uA, cA): 288 | """making Feature Projection in order to make all feature of same dimension""" 289 | 290 | 291 | shared_context = self.norm_context( 292 | nn.functional.relu(self.context_share(cA))) 293 | shared_utterance = self.norm_utterance( 294 | nn.functional.relu(self.utterance_share(uA))) 295 | 296 | updated_shared = shared_utterance * self.attention_aggregator( 297 | shared_utterance, shared_context) 298 | 299 | input = updated_shared 300 | 301 | return self.pred_module(updated_shared) 302 | 303 | ################################################################################################################################################################################################################ 304 | 305 | 306 | class Speaker_Independent_Triple_Mode_without_Context(nn.Module): 307 | def __init__(self, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=291, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 308 | super(Speaker_Independent_Triple_Mode_without_Context, self).__init__() 309 | 310 | self.input_embedding_A = input_embedding_A 311 | self.input_embedding_B = input_embedding_B 312 | self.input_embedding_C = input_embedding_C 313 | self.shared_embedding = shared_embedding 314 | self.projection_embedding = projection_embedding 315 | self.num_classes = num_classes 316 | self.dropout = dropout 317 | 318 | self.A_utterance_share = nn.Linear( 319 | self.input_embedding_A, self.shared_embedding) 320 | 321 | self.C_utterance_share = nn.Linear( 322 | self.input_embedding_C, self.shared_embedding) 323 | 324 | self.B_utterance_share = nn.Linear( 325 | self.input_embedding_B, self.shared_embedding) 326 | 327 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 328 | 329 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 330 | 331 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 332 | 333 | self.collabrative_gate_1 = nn.Linear( 334 | 2*self.shared_embedding, self.projection_embedding) 335 | self.collabrative_gate_2 = nn.Linear( 336 | self.projection_embedding, self.shared_embedding) 337 | 338 | self.pred_module = nn.Sequential( 339 | nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 340 | nn.BatchNorm1d(2*self.shared_embedding), 341 | nn.ReLU(), 342 | nn.Dropout(dropout), 343 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 344 | nn.BatchNorm1d(self.shared_embedding), 345 | nn.ReLU(), 346 | nn.Dropout(dropout), 347 | nn.Linear(self.shared_embedding, 512), 348 | nn.BatchNorm1d(512), 349 | nn.ReLU(), 350 | nn.Dropout(dropout), 351 | nn.Linear(512, 128), 352 | nn.BatchNorm1d(128), 353 | nn.ReLU(), 354 | nn.Dropout(dropout), 355 | nn.Linear(128, self.num_classes), 356 | nn.ReLU() 357 | ) 358 | 359 | def attention(self, featureA, featureB): 360 | """ This method takes two features and calculate the attention """ 361 | 362 | input = torch.cat((featureA, featureB), dim=1) 363 | return nn.functional.softmax(self.collabrative_gate_1(input), 
dim=1) 364 | 365 | def attention_aggregator(self, feA, feB, feC): 366 | """ This method calculates the attention for feA with respect to others""" 367 | 368 | input = self.attention(feA, feB) + self.attention(feA, feC) 369 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 370 | 371 | def forward(self, uA, uB, uC): 372 | """making Feature Projection in order to make all feature of same dimension""" 373 | 374 | 375 | shared_A_utterance = self.norm_A_utterance( 376 | nn.functional.relu(self.A_utterance_share(uA))) 377 | 378 | shared_C_utterance = self.norm_C_utterance( 379 | nn.functional.relu(self.C_utterance_share(uC))) 380 | 381 | shared_B_utterance = self.norm_B_utterance( 382 | nn.functional.relu(self.B_utterance_share(uB))) 383 | 384 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 385 | shared_A_utterance, shared_C_utterance, shared_B_utterance) 386 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 387 | shared_C_utterance, shared_A_utterance, shared_B_utterance) 388 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 389 | shared_B_utterance, shared_A_utterance, shared_C_utterance) 390 | 391 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 392 | input = torch.cat((temp, updated_shared_B), dim=1) 393 | 394 | return self.pred_module(input) 395 | 396 | ################################################################################################################################################################################################################ 397 | 398 | 399 | class Speaker_Independent_Dual_Mode_without_Context(nn.Module): 400 | def __init__(self, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 401 | super(Speaker_Independent_Dual_Mode_without_Context, self).__init__() 402 | 403 | self.input_embedding_A = input_embedding_A 404 | self.input_embedding_B = input_embedding_B 405 | self.shared_embedding = shared_embedding 406 | self.projection_embedding = projection_embedding 407 | self.num_classes = num_classes 408 | self.dropout = dropout 409 | 410 | self.A_utterance_share = nn.Linear( 411 | self.input_embedding_A, self.shared_embedding) 412 | 413 | self.B_utterance_share = nn.Linear( 414 | self.input_embedding_B, self.shared_embedding) 415 | 416 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 417 | 418 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 419 | 420 | self.collabrative_gate_1 = nn.Linear( 421 | 2*self.shared_embedding, self.projection_embedding) 422 | self.collabrative_gate_2 = nn.Linear( 423 | self.projection_embedding, self.shared_embedding) 424 | 425 | self.pred_module = nn.Sequential( 426 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 427 | nn.BatchNorm1d(self.shared_embedding), 428 | nn.ReLU(), 429 | nn.Dropout(dropout), 430 | nn.Linear(self.shared_embedding, 512), 431 | nn.BatchNorm1d(512), 432 | nn.ReLU(), 433 | nn.Dropout(dropout), 434 | nn.Linear(512, 128), 435 | nn.BatchNorm1d(128), 436 | nn.ReLU(), 437 | nn.Dropout(dropout), 438 | nn.Linear(128, self.num_classes), 439 | nn.ReLU() 440 | ) 441 | 442 | def attention(self, featureA, featureB): 443 | """ This method takes two features and caluate the attention """ 444 | 445 | input = torch.cat((featureA, featureB), dim=1) 446 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 447 | 448 | def attention_aggregator(self, feA, feB): 449 | """ This method caluates the attention for 
feA with respect to others""" 450 | 451 | input = self.attention(feA, feB) 452 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 453 | 454 | def forward(self, uA, uB): 455 | """making Feature Projection in order to make all feature of same dimension""" 456 | 457 | 458 | shared_A_utterance = self.norm_A_utterance( 459 | nn.functional.relu(self.A_utterance_share(uA))) 460 | 461 | shared_B_utterance = self.norm_B_utterance( 462 | nn.functional.relu(self.B_utterance_share(uB))) 463 | 464 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 465 | shared_A_utterance, shared_B_utterance) 466 | 467 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 468 | shared_B_utterance, shared_A_utterance) 469 | 470 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 471 | 472 | return self.pred_module(input) 473 | 474 | ################################################################################################################################################################################################################ 475 | 476 | 477 | class Speaker_Independent_Single_Mode_without_Context(nn.Module): 478 | def __init__(self, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 479 | super(Speaker_Independent_Single_Mode_without_Context, self).__init__() 480 | 481 | self.input_embedding = input_embedding_A 482 | 483 | self.shared_embedding = shared_embedding 484 | self.projection_embedding = projection_embedding 485 | self.num_classes = num_classes 486 | self.dropout = dropout 487 | 488 | self.utterance_share = nn.Linear( 489 | self.input_embedding, self.shared_embedding) 490 | 491 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 492 | 493 | self.collabrative_gate_1 = nn.Linear( 494 | 2*self.shared_embedding, self.projection_embedding) 495 | self.collabrative_gate_2 = nn.Linear( 496 | self.projection_embedding, self.shared_embedding) 497 | 498 | self.pred_module = nn.Sequential( 499 | # nn.Linear(3*self.shared_embedding, 2*self.shared_embedding), 500 | # nn.BatchNorm1d(2*self.shared_embedding), 501 | # nn.ReLU(), 502 | # nn.Linear(2*self.shared_embedding, self.shared_embedding), 503 | # nn.BatchNorm1d(self.shared_embedding), 504 | # nn.ReLU(), 505 | nn.Linear(self.shared_embedding, 512), 506 | nn.BatchNorm1d(512), 507 | nn.ReLU(), 508 | nn.Dropout(dropout), 509 | nn.Linear(512, 128), 510 | nn.BatchNorm1d(128), 511 | nn.ReLU(), 512 | nn.Dropout(dropout), 513 | nn.Linear(128, self.num_classes), 514 | nn.ReLU() 515 | ) 516 | 517 | def attention(self, featureA, featureB): 518 | """ This method takes two features and calculates the attention """ 519 | 520 | input = torch.cat((featureA, featureB), dim=1) 521 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 522 | 523 | def attention_aggregator(self, feA, feB): 524 | """ This method calculates the attention for feA with respect to others""" 525 | 526 | input = self.attention(feA, feB) 527 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 528 | 529 | def forward(self, uA): 530 | """making Feature Projection in order to make all feature of same dimension""" 531 | 532 | 533 | shared_utterance = self.norm_utterance( 534 | nn.functional.relu(self.utterance_share(uA))) 535 | 536 | updated_shared = shared_utterance * self.attention_aggregator( 537 | shared_utterance, shared_utterance) 538 | 539 | input = updated_shared 540 | 541 | return self.pred_module(updated_shared) 542 | 
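# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration only; it is not called by the
# training scripts). The batch size and feature dimension below are
# assumptions chosen to match the default constructor arguments above, not
# values fixed by the dataset.
def _example_single_mode_forward():
    """Run one dummy forward pass through the simplest speaker-independent model."""
    model = Speaker_Independent_Single_Mode_without_Context(
        input_embedding_A=1024, num_classes=2)
    model.eval()                # use BatchNorm running statistics for this toy pass
    uA = torch.randn(4, 1024)   # 4 utterances, each a 1024-d feature vector
    logits = model(uA)          # shape: (4, num_classes)
    return logits
# ---------------------------------------------------------------------------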
################################################################################################################################################################################################################ 543 | 544 | 545 | class Speaker_Dependent_Triple_Mode_with_Context(nn.Module): 546 | def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=291, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 547 | super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__() 548 | 549 | self.n_speaker = n_speaker 550 | 551 | self.input_embedding_A = input_embedding_A 552 | self.input_embedding_B = input_embedding_B 553 | self.input_embedding_C = input_embedding_C 554 | 555 | self.shared_embedding = shared_embedding 556 | self.projection_embedding = projection_embedding 557 | self.num_classes = num_classes 558 | self.dropout = dropout 559 | 560 | self.A_context_share = nn.Linear( 561 | self.input_embedding_A, self.shared_embedding) 562 | self.A_utterance_share = nn.Linear( 563 | self.input_embedding_A, self.shared_embedding) 564 | 565 | self.C_context_share = nn.Linear( 566 | self.input_embedding_C, self.shared_embedding) 567 | self.C_utterance_share = nn.Linear( 568 | self.input_embedding_C, self.shared_embedding) 569 | 570 | self.B_context_share = nn.Linear( 571 | self.input_embedding_B, self.shared_embedding) 572 | self.B_utterance_share = nn.Linear( 573 | self.input_embedding_B, self.shared_embedding) 574 | 575 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 576 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 577 | 578 | self.norm_C_context = nn.BatchNorm1d(self.shared_embedding) 579 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 580 | 581 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 582 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 583 | 584 | self.collabrative_gate_1 = nn.Linear( 585 | 2*self.shared_embedding, self.projection_embedding) 586 | self.collabrative_gate_2 = nn.Linear( 587 | self.projection_embedding, self.shared_embedding) 588 | 589 | self.pred_module = nn.Sequential( 590 | nn.Linear(self.n_speaker+3*self.shared_embedding, 591 | 2*self.shared_embedding), 592 | nn.BatchNorm1d(2*self.shared_embedding), 593 | nn.ReLU(), 594 | nn.Dropout(dropout), 595 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 596 | nn.BatchNorm1d(self.shared_embedding), 597 | nn.ReLU(), 598 | nn.Dropout(dropout), 599 | nn.Linear(self.shared_embedding, 512), 600 | nn.BatchNorm1d(512), 601 | nn.ReLU(), 602 | nn.Dropout(dropout), 603 | nn.Linear(512, 128), 604 | nn.BatchNorm1d(128), 605 | nn.ReLU(), 606 | nn.Dropout(dropout), 607 | nn.Linear(128, self.num_classes), 608 | nn.ReLU() 609 | 610 | ) 611 | 612 | def attention(self, featureA, featureB): 613 | """ This method takes two features and calculates the attention """ 614 | 615 | input = torch.cat((featureA, featureB), dim=1) 616 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 617 | 618 | def attention_aggregator(self, feA, feB, feC, feD, feE, feF): 619 | """ This method calculates the attention for feA with respect to others""" 620 | 621 | input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention( 622 | feA, feD) + self.attention(feA, feE) + self.attention(feA, feF) 623 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 624 | 625 | def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding): 626 | """making Feature Projection in 
order to make all feature of same dimension""" 627 | 628 | 629 | shared_A_context = self.norm_A_context( 630 | nn.functional.relu(self.A_context_share(cA))) 631 | shared_A_utterance = self.norm_A_utterance( 632 | nn.functional.relu(self.A_utterance_share(uA))) 633 | 634 | shared_C_context = self.norm_C_context( 635 | nn.functional.relu(self.C_context_share(cC))) 636 | shared_C_utterance = self.norm_C_utterance( 637 | nn.functional.relu(self.C_utterance_share(uC))) 638 | 639 | shared_B_context = self.norm_B_context( 640 | nn.functional.relu(self.B_context_share(cB))) 641 | shared_B_utterance = self.norm_B_utterance( 642 | nn.functional.relu(self.B_utterance_share(uB))) 643 | 644 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 645 | shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance) 646 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 647 | shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance) 648 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 649 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance) 650 | 651 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 652 | input = torch.cat((temp, updated_shared_B), dim=1) 653 | 654 | input = torch.cat((input, speaker_embedding), dim=1) 655 | 656 | return self.pred_module(input) 657 | 658 | ################################################################################################################################################################################################################ 659 | 660 | 661 | class Speaker_Dependent_Dual_Mode_with_Context(nn.Module): 662 | def __init__(self, n_speaker=24, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 663 | super(Speaker_Dependent_Dual_Mode_with_Context, self).__init__() 664 | 665 | self.n_speaker = n_speaker 666 | 667 | self.input_embedding_A = input_embedding_A 668 | self.input_embedding_B = input_embedding_B 669 | self.shared_embedding = shared_embedding 670 | self.projection_embedding = projection_embedding 671 | self.num_classes = num_classes 672 | self.dropout = dropout 673 | 674 | self.A_context_share = nn.Linear( 675 | self.input_embedding_A, self.shared_embedding) 676 | self.A_utterance_share = nn.Linear( 677 | self.input_embedding_A, self.shared_embedding) 678 | 679 | self.B_context_share = nn.Linear( 680 | self.input_embedding_B, self.shared_embedding) 681 | self.B_utterance_share = nn.Linear( 682 | self.input_embedding_B, self.shared_embedding) 683 | 684 | self.norm_A_context = nn.BatchNorm1d(self.shared_embedding) 685 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 686 | 687 | self.norm_B_context = nn.BatchNorm1d(self.shared_embedding) 688 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 689 | 690 | self.collabrative_gate_1 = nn.Linear( 691 | 2*self.shared_embedding, self.projection_embedding) 692 | self.collabrative_gate_2 = nn.Linear( 693 | self.projection_embedding, self.shared_embedding) 694 | 695 | self.pred_module = nn.Sequential( 696 | nn.Linear(self.n_speaker+2*self.shared_embedding, 697 | self.shared_embedding), 698 | nn.BatchNorm1d(self.shared_embedding), 699 | nn.ReLU(), 700 | nn.Dropout(dropout), 701 | nn.Linear(self.shared_embedding, 512), 702 | nn.BatchNorm1d(512), 703 | nn.ReLU(), 704 | 
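# remaining prediction-head layers: dropout, then project 512 -> 128 -> num_classes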
nn.Dropout(dropout), 705 | nn.Linear(512, 128), 706 | nn.BatchNorm1d(128), 707 | nn.ReLU(), 708 | nn.Dropout(dropout), 709 | nn.Linear(128, self.num_classes), 710 | nn.ReLU() 711 | ) 712 | 713 | def attention(self, featureA, featureB): 714 | """ This method takes two features and caluate the attention """ 715 | 716 | input = torch.cat((featureA, featureB), dim=1) 717 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 718 | 719 | def attention_aggregator(self, feA, feB, feC, feD): 720 | """ This method caluates the attention for feA with respect to others""" 721 | 722 | input = self.attention(feA, feB) + self.attention(feA, 723 | feC) + self.attention(feA, feD) 724 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 725 | 726 | def forward(self, uA, cA, uB, cB, speaker_embedding): 727 | """making Feature Projection in order to make all feature of same dimension""" 728 | 729 | 730 | shared_A_context = self.norm_A_context( 731 | nn.functional.relu(self.A_context_share(cA))) 732 | shared_A_utterance = self.norm_A_utterance( 733 | nn.functional.relu(self.A_utterance_share(uA))) 734 | 735 | shared_B_context = self.norm_B_context( 736 | nn.functional.relu(self.B_context_share(cB))) 737 | shared_B_utterance = self.norm_B_utterance( 738 | nn.functional.relu(self.B_utterance_share(uB))) 739 | 740 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 741 | shared_A_utterance, shared_A_context, shared_B_context, shared_B_utterance) 742 | 743 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 744 | shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance) 745 | 746 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 747 | 748 | input = torch.cat((input, speaker_embedding), dim=1) 749 | 750 | return self.pred_module(input) 751 | 752 | ################################################################################################################################################################################################################ 753 | 754 | 755 | class Speaker_Dependent_Single_Mode_with_Context(nn.Module): 756 | def __init__(self, n_speaker=24, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 757 | super(Speaker_Dependent_Single_Mode_with_Context, self).__init__() 758 | 759 | self.n_speaker = n_speaker 760 | 761 | self.input_embedding = input_embedding_A 762 | 763 | self.shared_embedding = shared_embedding 764 | self.projection_embedding = projection_embedding 765 | self.num_classes = num_classes 766 | self.dropout = dropout 767 | 768 | self.context_share = nn.Linear( 769 | self.input_embedding, self.shared_embedding) 770 | self.utterance_share = nn.Linear( 771 | self.input_embedding, self.shared_embedding) 772 | 773 | self.norm_context = nn.BatchNorm1d(self.shared_embedding) 774 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 775 | 776 | self.collabrative_gate_1 = nn.Linear( 777 | 2*self.shared_embedding, self.projection_embedding) 778 | self.collabrative_gate_2 = nn.Linear( 779 | self.projection_embedding, self.shared_embedding) 780 | 781 | self.pred_module = nn.Sequential( 782 | nn.Linear(self.n_speaker+self.shared_embedding, 512), 783 | nn.BatchNorm1d(512), 784 | nn.ReLU(), 785 | nn.Dropout(dropout), 786 | nn.Linear(512, 128), 787 | nn.BatchNorm1d(128), 788 | nn.ReLU(), 789 | nn.Dropout(dropout), 790 | nn.Linear(128, self.num_classes), 791 | nn.ReLU() 792 | ) 793 | 794 | def attention(self, featureA, featureB): 795 | """ 
This method takes two features and calculates the attention """ 796 | 797 | input = torch.cat((featureA, featureB), dim=1) 798 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 799 | 800 | def attention_aggregator(self, feA, feB): 801 | """ This method calculates the attention for feA with respect to others""" 802 | 803 | input = self.attention(feA, feB) 804 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 805 | 806 | def forward(self, uA, cA, speaker_embedding): 807 | """making Feature Projection in order to make all feature of same dimension""" 808 | 809 | 810 | shared_context = self.norm_context( 811 | nn.functional.relu(self.context_share(cA))) 812 | shared_utterance = self.norm_utterance( 813 | nn.functional.relu(self.utterance_share(uA))) 814 | 815 | updated_shared = shared_utterance * self.attention_aggregator( 816 | shared_utterance, shared_context) 817 | 818 | input = torch.cat((updated_shared, speaker_embedding), dim=1) 819 | 820 | return self.pred_module(input) 821 | 822 | ################################################################################################################################################################################################################ 823 | 824 | 825 | class Speaker_Dependent_Triple_Mode_without_Context(nn.Module): 826 | def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024, input_embedding_C=291, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 827 | super(Speaker_Dependent_Triple_Mode_without_Context, self).__init__() 828 | 829 | self.n_speaker = n_speaker 830 | 831 | self.input_embedding_A = input_embedding_A 832 | self.input_embedding_B = input_embedding_B 833 | self.input_embedding_C = input_embedding_C 834 | self.shared_embedding = shared_embedding 835 | self.projection_embedding = projection_embedding 836 | self.num_classes = num_classes 837 | self.dropout = dropout 838 | 839 | self.A_utterance_share = nn.Linear( 840 | self.input_embedding_A, self.shared_embedding) 841 | 842 | self.C_utterance_share = nn.Linear( 843 | self.input_embedding_C, self.shared_embedding) 844 | 845 | self.B_utterance_share = nn.Linear( 846 | self.input_embedding_B, self.shared_embedding) 847 | 848 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 849 | 850 | self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding) 851 | 852 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 853 | 854 | self.collabrative_gate_1 = nn.Linear( 855 | 2*self.shared_embedding, self.projection_embedding) 856 | self.collabrative_gate_2 = nn.Linear( 857 | self.projection_embedding, self.shared_embedding) 858 | 859 | self.pred_module = nn.Sequential( 860 | nn.Linear(self.n_speaker+3*self.shared_embedding, 861 | 2*self.shared_embedding), 862 | nn.BatchNorm1d(2*self.shared_embedding), 863 | nn.ReLU(), 864 | nn.Linear(2*self.shared_embedding, self.shared_embedding), 865 | nn.BatchNorm1d(self.shared_embedding), 866 | nn.ReLU(), 867 | nn.Dropout(dropout), 868 | nn.Linear(self.shared_embedding, 512), 869 | nn.BatchNorm1d(512), 870 | nn.ReLU(), 871 | nn.Dropout(dropout), 872 | nn.Linear(512, 128), 873 | nn.BatchNorm1d(128), 874 | nn.ReLU(), 875 | nn.Dropout(dropout), 876 | nn.Linear(128, self.num_classes), 877 | nn.ReLU() 878 | ) 879 | 880 | def attention(self, featureA, featureB): 881 | """ This method takes two features and calculates the attention """ 882 | 883 | input = torch.cat((featureA, featureB), dim=1) 884 | return 
nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 885 | 886 | def attention_aggregator(self, feA, feB, feC): 887 | """ This method calculates the attention for feA with respect to others""" 888 | 889 | input = self.attention(feA, feB) + self.attention(feA, feC) 890 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 891 | 892 | def forward(self, uA, uB, uC, speaker_embedding): 893 | """making Feature Projection in order to make all feature of same dimension""" 894 | 895 | 896 | shared_A_utterance = self.norm_A_utterance( 897 | nn.functional.relu(self.A_utterance_share(uA))) 898 | 899 | shared_C_utterance = self.norm_C_utterance( 900 | nn.functional.relu(self.C_utterance_share(uC))) 901 | 902 | shared_B_utterance = self.norm_B_utterance( 903 | nn.functional.relu(self.B_utterance_share(uB))) 904 | 905 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 906 | shared_A_utterance, shared_C_utterance, shared_B_utterance) 907 | updated_shared_C = shared_C_utterance * self.attention_aggregator( 908 | shared_C_utterance, shared_A_utterance, shared_B_utterance) 909 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 910 | shared_B_utterance, shared_A_utterance, shared_C_utterance) 911 | 912 | temp = torch.cat((updated_shared_A, updated_shared_C), dim=1) 913 | input = torch.cat((temp, updated_shared_B), dim=1) 914 | 915 | input = torch.cat((input, speaker_embedding), dim=1) 916 | 917 | return self.pred_module(input) 918 | 919 | ################################################################################################################################################################################################################ 920 | 921 | 922 | class Speaker_Dependent_Dual_Mode_without_Context(nn.Module): 923 | def __init__(self, n_speaker=24, input_embedding_A=1024, input_embedding_B=2048, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 924 | super(Speaker_Dependent_Dual_Mode_without_Context, self).__init__() 925 | 926 | self.n_speaker = n_speaker 927 | 928 | self.input_embedding_A = input_embedding_A 929 | self.input_embedding_B = input_embedding_B 930 | self.shared_embedding = shared_embedding 931 | self.projection_embedding = projection_embedding 932 | self.num_classes = num_classes 933 | self.dropout = dropout 934 | 935 | self.A_utterance_share = nn.Linear( 936 | self.input_embedding_A, self.shared_embedding) 937 | 938 | self.B_utterance_share = nn.Linear( 939 | self.input_embedding_B, self.shared_embedding) 940 | 941 | self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding) 942 | 943 | self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding) 944 | 945 | self.collabrative_gate_1 = nn.Linear( 946 | 2*self.shared_embedding, self.projection_embedding) 947 | self.collabrative_gate_2 = nn.Linear( 948 | self.projection_embedding, self.shared_embedding) 949 | 950 | self.pred_module = nn.Sequential( 951 | nn.Linear(self.n_speaker+2*self.shared_embedding, 952 | self.shared_embedding), 953 | nn.BatchNorm1d(self.shared_embedding), 954 | nn.ReLU(), 955 | nn.Dropout(dropout), 956 | nn.Linear(self.shared_embedding, 512), 957 | nn.BatchNorm1d(512), 958 | nn.ReLU(), 959 | nn.Dropout(dropout), 960 | nn.Linear(512, 128), 961 | nn.BatchNorm1d(128), 962 | nn.ReLU(), 963 | nn.Dropout(dropout), 964 | nn.Linear(128, self.num_classes), 965 | nn.ReLU() 966 | ) 967 | 968 | def attention(self, featureA, featureB): 969 | """ This method takes two features and calculates the attention """ 970 | 971 | input = 
torch.cat((featureA, featureB), dim=1) 972 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 973 | 974 | def attention_aggregator(self, feA, feB): 975 | """ This method calculates the attention for feA with respect to others""" 976 | 977 | input = self.attention(feA, feB) 978 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 979 | 980 | def forward(self, uA, uB, speaker_embedding): 981 | """making Feature Projection in order to make all feature of same dimension""" 982 | 983 | 984 | shared_A_utterance = self.norm_A_utterance( 985 | nn.functional.relu(self.A_utterance_share(uA))) 986 | 987 | shared_B_utterance = self.norm_B_utterance( 988 | nn.functional.relu(self.B_utterance_share(uB))) 989 | 990 | updated_shared_A = shared_A_utterance * self.attention_aggregator( 991 | shared_A_utterance, shared_B_utterance) 992 | 993 | updated_shared_B = shared_B_utterance * self.attention_aggregator( 994 | shared_B_utterance, shared_A_utterance) 995 | 996 | input = torch.cat((updated_shared_A, updated_shared_B), dim=1) 997 | 998 | input = torch.cat((input, speaker_embedding), dim=1) 999 | 1000 | return self.pred_module(input) 1001 | 1002 | ################################################################################################################################################################################################################ 1003 | 1004 | 1005 | class Speaker_Dependent_Single_Mode_without_Context(nn.Module): 1006 | def __init__(self, n_speaker=24, input_embedding_A=1024, shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=2): 1007 | super(Speaker_Dependent_Single_Mode_without_Context, self).__init__() 1008 | 1009 | self.n_speaker = n_speaker 1010 | 1011 | self.input_embedding = input_embedding_A 1012 | 1013 | self.shared_embedding = shared_embedding 1014 | self.projection_embedding = projection_embedding 1015 | self.num_classes = num_classes 1016 | self.dropout = dropout 1017 | 1018 | self.utterance_share = nn.Linear( 1019 | self.input_embedding, self.shared_embedding) 1020 | 1021 | self.norm_utterance = nn.BatchNorm1d(self.shared_embedding) 1022 | 1023 | self.collabrative_gate_1 = nn.Linear( 1024 | 2*self.shared_embedding, self.projection_embedding) 1025 | self.collabrative_gate_2 = nn.Linear( 1026 | self.projection_embedding, self.shared_embedding) 1027 | 1028 | self.pred_module = nn.Sequential( 1029 | nn.Linear(self.n_speaker+self.shared_embedding, 512), 1030 | nn.BatchNorm1d(512), 1031 | nn.ReLU(), 1032 | nn.Dropout(dropout), 1033 | nn.Linear(512, 128), 1034 | nn.BatchNorm1d(128), 1035 | nn.ReLU(), 1036 | nn.Dropout(dropout), 1037 | nn.Linear(128, self.num_classes), 1038 | nn.ReLU() 1039 | ) 1040 | 1041 | def attention(self, featureA, featureB): 1042 | """ This method takes two features and caluate the attention """ 1043 | 1044 | input = torch.cat((featureA, featureB), dim=1) 1045 | return nn.functional.softmax(self.collabrative_gate_1(input), dim=1) 1046 | 1047 | def attention_aggregator(self, feA, feB): 1048 | """ This method calculates the attention for feA with respect to others""" 1049 | 1050 | input = self.attention(feA, feB) 1051 | return nn.functional.softmax(self.collabrative_gate_2(input), dim=1) 1052 | 1053 | def forward(self, uA, speaker_embedding): 1054 | """making Feature Projection in order to make all feature of same dimension""" 1055 | 1056 | 1057 | shared_utterance = self.norm_utterance( 1058 | nn.functional.relu(self.utterance_share(uA))) 1059 | 1060 | updated_shared = shared_utterance * 
self.attention_aggregator( 1061 | shared_utterance, shared_utterance) 1062 | 1063 | input = torch.cat((updated_shared, speaker_embedding), dim=1) 1064 | 1065 | return self.pred_module(input) 1066 | 1067 | ################################################################################################################################################################################################################ 1068 | -------------------------------------------------------------------------------- /MPP_Code/training/execute_classification_explicit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | import sys 5 | import random 6 | import pickle 7 | import argparse 8 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | 14 | import torchvision 15 | from torchvision import datasets, models 16 | from torch.utils.data import Dataset, DataLoader 17 | 18 | from models import emotion_classification_model 19 | 20 | """ 21 | This script is used for hyper-parameter tuning and training the explicit classification model. 22 | Also generates classification_reports for analysis 23 | """ 24 | 25 | class Tee(object): 26 | """ 27 | This class is not the part of execution. 28 | Object of this class is used for printing the log in file as well as in Terminal 29 | """ 30 | def __init__(self, *files): 31 | self.files = files 32 | 33 | def write(self, obj): 34 | for f in self.files: 35 | f.write(obj) 36 | f.flush() # If you want the output to be visible immediately 37 | 38 | def flush(self): 39 | for f in self.files: 40 | f.flush() 41 | 42 | 43 | def seed(): 44 | """ This method is used for seeding the code and different points""" 45 | 46 | np.random.seed(42) 47 | random.seed(42) 48 | torch.manual_seed(42) 49 | torch.cuda.manual_seed(42) 50 | torch.backends.cudnn.enabled = False 51 | torch.backends.cudnn.deterministic = True 52 | 53 | 54 | def seed_worker(worker_id): 55 | """ This method is used for seeding the worker in the dataloader""" 56 | 57 | worker_seed = 42 58 | np.random.seed(worker_seed) 59 | random.seed(worker_seed) 60 | 61 | 62 | seed() 63 | """ argument parser is used for running the script from terminal, makes it robust """ 64 | argParser = argparse.ArgumentParser() 65 | argParser.add_argument("-s", "--speaker", required=True, 66 | help="Enter y/Y for Speaker Dependent else n/N") 67 | 68 | argParser.add_argument("-m", "--mode", required=True, 69 | help="VTA for Video, Text, Audio repectively") 70 | argParser.add_argument("-c", "--context", required=True, 71 | help="y/Y for Context Dependent else n/N") 72 | argParser.add_argument("-e", "--epooch", default=500, help="Number of epooch") 73 | argParser.add_argument("-l", "--learning_rate", 74 | default=0.001, help="Learning rate") 75 | argParser.add_argument("-p", "--patience", default=5, help="Patience") 76 | argParser.add_argument("-b", "--batch_size", default=64, help="Batch Size") 77 | argParser.add_argument("-cr", "--classification_report", default='n', 78 | help="Prints Classification report of Validation Set ") 79 | argParser.add_argument("-gpu", "--gpu", default=1, 80 | help="Which GPU to use") 81 | argParser.add_argument("-seed", "--seed", default=42, 82 | help="SEED value") 83 | argParser.add_argument("-d", "--dropout", default=0.3, 84 | help="Dropout value") 85 | 86 | args = argParser.parse_args() 87 | 88 | ''' 89 | 
Loading data. Modify the path to point to data folder 90 | Data folder would hold the text utterances in csv files, features pre-extracted, 91 | 92 | ''' 93 | path = "MPP_Code/data/" 94 | mustard_input = pd.read_csv(path+'mustard_PP_utterance.csv') 95 | print(mustard_input.columns) 96 | 97 | 98 | # Use the right pickle file based on which benchmark you intend to use 99 | temp = open(path+'extracted_features/an_merged/features_Tbart_Vkey_Audio_sarcasm.pickle', 'rb') 100 | data = pickle.load(temp) 101 | 102 | # Normalizing class 103 | for key in list(data.keys()): 104 | for idx in ['cText', 'uText', 'cAudio', 'uAudio', 'cVideo', 'uVideo']: 105 | data[key][idx] /= np.max(abs(data[key][idx])) 106 | 107 | # Dataset class 108 | class ContentDataset(Dataset): 109 | def __init__(self, mapping, dataset, speaker_list): 110 | self.mapping = mapping 111 | self.dataset = dataset 112 | self.speakers_mapping = speaker_list 113 | 114 | def __len__(self): 115 | return len(self.mapping) 116 | 117 | def __getitem__(self, idx): 118 | if torch.is_tensor(idx): 119 | idx = idx.tolist() 120 | 121 | index = self.mapping.loc[idx, 'SCENE'] 122 | data = self.dataset[index] 123 | label = self.mapping.loc[idx, 'Emo_E']-1 124 | # if label == 9: 125 | # label = 3 # for happy emotion 126 | spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index( 127 | self.mapping.loc[idx, 'SPEAKER'])] 128 | 129 | return data['uText'], data['cText'], data['uAudio'], data['cAudio'], data['uVideo'], data['cVideo'], spkr, label 130 | 131 | 132 | device = torch.device("cuda:"+str(args.gpu)) 133 | 134 | 135 | def evaluation(loader, mod, call, report=False, flag=False): 136 | """Args: 137 | loader: 138 | It is the validation dataloader 139 | mod: 140 | It is the best model, which we have to evaluate 141 | call: 142 | call is the COMMAND to be excuted to run the forward method of the model 143 | it changed as per the modality and other possible input 144 | report: 145 | It True then the classification report for the validation set is printed 146 | flag: 147 | if True the instead of evaluation metrics, method returns the calss labels 148 | """ 149 | with torch.no_grad(): 150 | pred = [] 151 | true = [] 152 | 153 | total_loss = [] 154 | criterion = nn.CrossEntropyLoss() 155 | criterion.to(device) 156 | seed() 157 | for batch in loader: 158 | uText = batch[0].float().to(device) 159 | cText = batch[1].float().to(device) 160 | uAudio = batch[2].float().to(device) 161 | cAudio = batch[3].float().to(device) 162 | uVideo = batch[4].float().to(device) 163 | cVideo = batch[5].float().to(device) 164 | speaker = batch[6].float().to(device) 165 | y_true = batch[7].long().to(device) 166 | del batch 167 | output = torch.softmax(eval(call), dim=1) 168 | loss = criterion(output, y_true) 169 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 170 | total_loss.append(loss) 171 | pred.extend(output.detach().cpu().tolist()) 172 | true.extend(y_true.tolist()) 173 | if flag: 174 | return true, np.argmax(pred, axis=1) 175 | if report: 176 | print(classification_report(true, np.argmax(pred, axis=1))) 177 | return f1_score(true, np.argmax(pred, axis=1), average='macro'), sum(total_loss)/len(total_loss) 178 | 179 | 180 | def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False): 181 | """Args: 182 | mod : 183 | It is the mod we have to train 184 | criterion : 185 | Loss function, her we have Cross entropy loss 186 | optimizer : 187 | object of torch.optim class 188 | call: 189 | call is the 
COMMAND to be excuted to run the forward method of the model 190 | it changed as per the modality and other possible input 191 | train_loader: 192 | It is a instance of train dataloader 193 | valid_loader: 194 | It is a instance of validation dataloader, it is given as a input to evaluation class 195 | fold: 196 | 5 FOLD {0,1,2,3,4} 197 | e: 198 | maximum epoch 199 | patience: 200 | how many epoch to wait after the early stopping condition in satisfied 201 | report: 202 | It True then the classification report for the validation set is printed, it is given as a input to evaluation class 203 | save: 204 | If true then best model for each fold is saved 205 | 206 | """ 207 | 208 | print('-'*100) 209 | train_losses = [0] 210 | valid_losses = [0] 211 | max_f1 = 0 212 | patience_flag = 1 213 | best_epooch = 0 214 | print(fold, e, patience) 215 | 216 | while e > 0: 217 | total_loss = [] 218 | seed() 219 | for batch in train_loader: 220 | uText = batch[0].float().to(device) 221 | cText = batch[1].float().to(device) 222 | uAudio = batch[2].float().to(device) 223 | cAudio = batch[3].float().to(device) 224 | uVideo = batch[4].float().to(device) 225 | cVideo = batch[5].float().to(device) 226 | speaker = batch[6].float().to(device) 227 | y_true = batch[7].long().to(device) 228 | del batch 229 | output = eval(call) 230 | loss = criterion(output, y_true) 231 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 232 | optimizer.zero_grad() 233 | total_loss.append(loss.detach().item()) 234 | loss.backward() 235 | optimizer.step() 236 | with torch.no_grad(): 237 | valid_f1, valid_loss = evaluation( 238 | valid_loader, mod, call, report, False) 239 | train_losses.append(sum(total_loss)/len(total_loss)) 240 | valid_losses.append(valid_loss) 241 | 242 | e = e-1 243 | if max_f1 < valid_f1: 244 | max_f1 = valid_f1 245 | best_model = mod 246 | best_epooch = 500-e 247 | print( 248 | f'Epooch:{best_epooch} | Train Loss: {loss.detach().item():.3f} | Valid loss: { valid_loss.detach().item():7.3f} | Valid F1: { valid_f1:7.3f}') 249 | 250 | if abs(train_losses[-2]-train_losses[-1]) < 0.0001: 251 | if patience_flag == 1: 252 | e = patience 253 | patience_flag = 0 254 | else: 255 | patience_flag = 1 256 | return evaluation(valid_loader, best_model, call, report, True), best_epooch 257 | 258 | 259 | def get_command(input_modes, context_flag, speaker_flag): 260 | """ 261 | This method is used to create the COMMAND to execute the forward method of particular model, 262 | Depending upon the input combination 263 | Args: 264 | input_modes: 265 | Input Modality {VTA, VT, VA, TA, V, T, A} 266 | context_flag : 267 | If true then "with context" else "without context" 268 | speaker_flag: 269 | if true then Speaker dependent else Speaker Independent 270 | """ 271 | if input_modes == 'VTA': 272 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio" 273 | if context_flag == 'y': 274 | COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio" 275 | 276 | elif input_modes == 'VT': 277 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText" 278 | if context_flag == 'y': 279 | COMMAND += ",'cA':cVideo, 'cB':cText" 280 | 281 | elif input_modes == 'VA': 282 | COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio" 283 | if context_flag == 'y': 284 | COMMAND += ",'cA':cVideo, 'cB':cAudio" 285 | 286 | elif input_modes == 'TA': 287 | COMMAND = "mod(**{'uA':uText, 'uB':uAudio" 288 | if context_flag == 'y': 289 | COMMAND += ",'cA':cText, 'cB':cAudio" 290 | 291 | elif input_modes == 'T': 292 | COMMAND = "mod(**{'uA':uText" 293 | if context_flag == 'y': 294 | COMMAND 
+= ",'cA':cText" 295 | 296 | elif input_modes == 'V': 297 | COMMAND = "mod(**{'uA':uVideo" 298 | if context_flag == 'y': 299 | COMMAND += ",'cA':cVideo" 300 | 301 | elif input_modes == 'A': 302 | COMMAND = "mod(**{'uA':uAudio" 303 | if context_flag == 'y': 304 | COMMAND += ",'cA':cAudio" 305 | if speaker_flag == 'y': 306 | COMMAND += ",'speaker_embedding':speaker})" 307 | else: 308 | COMMAND += "})" 309 | 310 | return COMMAND 311 | 312 | 313 | def get_model_and_parameters(args): 314 | """ 315 | args is the parsed argument namespace from argparse, 316 | which is used to select the model class, its constructor parameters, and the forward-call command 317 | """ 318 | # Here we are sorting VTA in descending order, in order to have consistency in the model 319 | 320 | input_modes = ''.join(reversed(sorted(list(args.mode.upper())))) 321 | 322 | parameters = {} 323 | MODEL_NAME = 'Speaker_' 324 | 325 | parameters['num_classes'] = 9 326 | 327 | if args.speaker.lower() == 'y': 328 | MODEL_NAME += 'Dependent_' 329 | parameters['n_speaker'] = 24 330 | else: 331 | MODEL_NAME += 'Independent_' 332 | 333 | if len(input_modes) == 3: 334 | MODEL_NAME += 'Triple_' 335 | parameters['input_embedding_A'] = 2048 336 | parameters['input_embedding_B'] = 1024 337 | parameters['input_embedding_C'] = 291 338 | 339 | elif len(input_modes) == 2: 340 | MODEL_NAME += 'Dual_' 341 | parameters['input_embedding_A'] = 2048 if input_modes[0] == 'V' else 1024 342 | parameters['input_embedding_B'] = 291 if input_modes[1] == 'A' else 1024 343 | else: 344 | MODEL_NAME += 'Single_' 345 | parameters['input_embedding_A'] = 2048 if input_modes == 'V' else 1024 if input_modes == 'T' else 291 346 | 347 | MODEL_NAME += 'Mode_with' 348 | MODEL_NAME += 'out' if args.context.lower() == 'n' else '' 349 | MODEL_NAME += '_Context' 350 | 351 | MODEL_NAME = 'emotion_classification_model.' + MODEL_NAME 352 | 353 | COMMAND = get_command( 354 | input_modes, args.context.lower(), args.speaker.lower()) 355 | return MODEL_NAME, parameters, COMMAND 356 | 357 | 358 | # Initializing based on feature extraction 359 | video_embedding_size = 2048 360 | audio_embedding_size = 291 361 | text_embedding_size = 1024 362 | shared_embedding_size = 1024 363 | projection_embedding_size = 512 364 | epooch = int(args.epooch)  # argparse values arrive as strings when passed on the command line, so cast explicitly 365 | lr = float(args.learning_rate) 366 | patience = int(args.patience) 367 | batch_size = int(args.batch_size) 368 | dropout = float(args.dropout) 369 | 370 | # get our model name, parameters, and command as per the arguments provided on the command line 371 | MODEL_NAME, parameters, COMMAND = get_model_and_parameters(args) 372 | 373 | parameters['shared_embedding'] = shared_embedding_size 374 | parameters['projection_embedding'] = projection_embedding_size 375 | parameters['dropout'] = dropout 376 | 377 | """ This filename is used further for storing stats and logs""" 378 | filename = args.mode 379 | filename += '_context_'+args.context.upper() 380 | filename += '_speaker_'+args.speaker.upper() 381 | 382 | """ Provide paths based on if/how you wish to log or generate charts """ 383 | 384 | f = open('MPP_Code/log/lrec_complete_explicit/an_lrec_' + 385 | filename+'.txt', 'w') 386 | """ File to store charts of clf report""" 387 | c = open('MPP_Code/charts/lrec_complete_explicit/an_lrec_' + 388 | filename+'.txt', 'a+') 389 | 390 | """ Dataframe to store stats, will be saved as CSV """ 391 | stats = pd.DataFrame(columns=['dropout', 'lr', 'batch_size', 'shared_embedding_size', 392 | 'projection_embedding_size', 'epoch', 'Precision', 'Recall', 'F1']) 393 | 394 | 395 | """ The 'original' variable is used to switch where output is printed: 396 | if we do not want to
print on log file then we will use 'original' 397 | if we want to log then 'f' ---> it will print on both terminal and log file 398 | """ 399 | original = sys.stdout 400 | 401 | sys.stdout = Tee(sys.stdout, f) 402 | 403 | print(MODEL_NAME.split('.')[1]) 404 | 405 | sys.stdout = original 406 | 407 | """ since we are loading speaker name from dict 408 | we are sorting them in ascending order to remove randomness and make code reproducible 409 | """ 410 | speaker_list = sorted(list(mustard_input.SPEAKER.value_counts().keys())) 411 | 412 | """ 413 | These are various combinations used for parameter tuning (GRID SEARCH) 414 | """ 415 | for dropout in [0.2, 0.3, 0.4]: 416 | for lr in [0.001, 0.0001]: 417 | for batch_size in [128, 64]: 418 | for shared_embedding_size, projection_embedding_size in zip([2048, 1024], [1024, 256]): 419 | stat = [dropout, lr, batch_size, 420 | shared_embedding_size, projection_embedding_size] 421 | parameters['shared_embedding'] = shared_embedding_size 422 | parameters['projection_embedding'] = projection_embedding_size 423 | parameters['dropout'] = dropout 424 | 425 | pred_all = [] 426 | true_all = [] 427 | indexes = [] 428 | types = [] 429 | 430 | for fold in range(5): 431 | """ 432 | for 5 FOLD cross validation 433 | we have made the stratified splits explicitly 434 | this is done in order to keep consistency in different experiments (to deal with randomness) 435 | """ 436 | train = pd.read_csv( 437 | 'MPP_Code/data/splits_final_mustard++/train_' + str(fold)+'.csv') 438 | valid = pd.read_csv( 439 | 'MPP_Code/data/splits_final_mustard++/test_' + str(fold)+'.csv') 440 | seed() 441 | train_dataset = ContentDataset(train, data, speaker_list) 442 | seed() 443 | train_loader = DataLoader( 444 | train_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 445 | seed() 446 | valid_dataset = ContentDataset(valid, data, speaker_list) 447 | seed() 448 | valid_loader = DataLoader( 449 | valid_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 450 | 451 | indexes.extend(valid['SCENE'].tolist()) 452 | types.extend(valid['SAR_T'].tolist()) 453 | 454 | seed() 455 | mod = eval(MODEL_NAME)(**parameters) 456 | mod.to(device) 457 | seed() 458 | criterion = nn.CrossEntropyLoss() 459 | criterion.to(device) 460 | seed() 461 | optimizer = optim.Adam( 462 | params=mod.parameters(), betas=(0.5, 0.99), lr=lr) 463 | 464 | (true, pred), epo = training(mod=mod, criterion=criterion, optimizer=optimizer, call=COMMAND, 465 | train_loader=train_loader, valid_loader=valid_loader, fold=fold, e=epooch, patience=patience) 466 | pred_all.extend(pred) 467 | true_all.extend(true) 468 | 469 | # training ends here 470 | 471 | 472 | # For log file emotion wise results 473 | sys.stdout = Tee(sys.stdout, f) 474 | 475 | print(f'n_epooch:{epo} | dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 476 | report_dict = classification_report(true_all, pred_all, output_dict=True) 477 | 478 | report = classification_report(true_all, pred_all,digits=3) 479 | print(report) 480 | print('-'*100) 481 | sys.stdout = Tee(sys.stdout, c) 482 | print(f'dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 483 | 484 | clf_df = pd.DataFrame(report_dict).transpose() 485 | print(clf_df) 486 | print('-'*100) 487 | sys.stdout = original 488 | stat.append(epo) 489 | 
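# The next call pulls the averaged Precision/Recall/F1 out of the last row of the text report.
# This string parsing is fragile across sklearn output formats; report_dict['weighted avg']
# already holds the same values as floats and is a safer source if this line misbehaves.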
stat.extend( 490 | list(map(float, report.split('\n')[-2].split(' ')[1:-1]))) 491 | stats.loc[len(stats)] = stat 492 | stats = stats.sort_values(by='F1') 493 | stats.to_csv('MPP_Code/stats/lrec_complete_explicit/an_lrec_'+ 494 | filename+'.csv', index=False) 495 | 496 | 497 | """ Following code can be used to save prediction """ 498 | results = [] 499 | 500 | for row in zip(indexes, types, true_all, pred_all): 501 | results.append(row) 502 | 503 | results = pd.DataFrame( 504 | results, columns=['SCENE', 'TYPE', 'TRUE', 'PRED']) 505 | results.to_csv('MPP_Code/predictions/lrec_complete_explicit/' + args.mode+'/an_lrec_' + 506 | filename+'_'+str(int(dropout*10))+'_'+str(batch_size)+'_'+str(shared_embedding_size)+'_'+str(len(str(lr*100)))+'_'+'.csv', index=False) 507 | 508 | 509 | """ Following code can be used to save sarcasm_type-wise analysis (will be saved in log file)""" 510 | sys.stdout = Tee(sys.stdout, f) 511 | 512 | for ty in ['PRO', 'LIK', 'ILL', 'EMB']: 513 | t_t = results[results['TYPE'] == ty]['TRUE'].to_numpy() 514 | p_t = results[results['TYPE'] == ty]['PRED'].to_numpy() 515 | report1 = classification_report(t_t, p_t) 516 | print("FOR :----> ", ty) 517 | print(report1) 518 | print('-'*100) 519 | print('#'*100) 520 | print('#'*100) 521 | sys.stdout = original 522 | 523 | 524 | f.close() 525 | -------------------------------------------------------------------------------- /MPP_Code/training/execute_classification_implicit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The file contains code to perform hyperparameter tuning (grid-search) and training the model. 3 | It includes log statements that get stored in logs folder. 4 | The classification scores and corresponding hyperparamters get stored in stats folder. 5 | The classification reports from sklearn get stored in charts folder. 6 | 7 | It utilizes features saved in the form of pickles in the data/extracted_features folder. 8 | The extracted_features folder contains pickles with BART, T5, RobertA for text. Use as necessary 9 | ''' 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import time 14 | import sys 15 | import random 16 | import pickle 17 | import argparse 18 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report 19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.optim as optim 23 | 24 | import torchvision 25 | from torchvision import datasets, models 26 | from torch.utils.data import Dataset, DataLoader 27 | 28 | from models import emotion_classification_model 29 | 30 | """ 31 | This script is used for hyper-parameter tuning and training the model. 32 | """ 33 | 34 | class Tee(object): 35 | """ 36 | This class is not the part of execution. 
37 | Object of this class is used for printing the log in file as well as oin Terminal 38 | """ 39 | def __init__(self, *files): 40 | self.files = files 41 | 42 | def write(self, obj): 43 | for f in self.files: 44 | f.write(obj) 45 | f.flush() # If we want the output to be visible immediately 46 | 47 | def flush(self): 48 | for f in self.files: 49 | f.flush() 50 | 51 | seed_dyn = 42 #Ignore seed_dyn if you are trying to randomize 52 | abl = "6" #To perform ablation in audio 53 | 54 | def seed(): 55 | """ This method is used for seeding the code and different points""" 56 | np.random.seed(seed_dyn) 57 | random.seed(seed_dyn) 58 | torch.manual_seed(seed_dyn) 59 | torch.cuda.manual_seed(seed_dyn) 60 | torch.backends.cudnn.enabled = False 61 | torch.backends.cudnn.deterministic = True 62 | 63 | 64 | def seed_worker(worker_id): 65 | """ This method is used for seeding the worker in the dataloader""" 66 | 67 | worker_seed = seed_dyn 68 | np.random.seed(worker_seed) 69 | random.seed(worker_seed) 70 | 71 | 72 | seed() 73 | 74 | """ argument parser is used for running the script from terminal, makes it robust """ 75 | argParser = argparse.ArgumentParser() 76 | argParser.add_argument("-s", "--speaker", required=True, 77 | help="Enter y/Y for Speaker Dependent else n/N") 78 | 79 | argParser.add_argument("-m", "--mode", required=True, 80 | help="VTA for Video, Text, Audio repectively") 81 | argParser.add_argument("-c", "--context", required=True, 82 | help="y/Y for Context Dependent else n/N") 83 | argParser.add_argument("-e", "--epooch", default=500, help="Number of epooch") 84 | argParser.add_argument("-l", "--learning_rate", 85 | default=0.001, help="Learning rate") 86 | argParser.add_argument("-p", "--patience", default=5, help="Patience") 87 | argParser.add_argument("-b", "--batch_size", default=64, help="Batch Size") 88 | argParser.add_argument("-cr", "--classification_report", default='n', 89 | help="Prints Classification report of Validation Set ") 90 | argParser.add_argument("-gpu", "--gpu", default=0, 91 | help="Which GPU to use") 92 | argParser.add_argument("-seed", "--seed", default=42, 93 | help="SEED value") 94 | argParser.add_argument("-d", "--dropout", default=0.3, 95 | help="Dropout value") 96 | 97 | args = argParser.parse_args() 98 | 99 | ''' 100 | Loading data. 
Modify the path to point to data folder 101 | Data folder would hold the text utterances in csv files, features pre-extracted 102 | ''' 103 | path = "MPP_Code/data/" 104 | mustard_input = pd.read_csv(path+'final_datasets/augmented_sarcastic_utterances.csv') 105 | 106 | # All features audio, Video and text saved in dictionary format 107 | # temp = open(path+'extracted_features/an_merged/all_features_aud_text.pickle', 'rb') 108 | # temp = open(path+'extracted_features/an_merged/ablation_'+abl+'.pickle', 'rb') 109 | # temp = open(path+'extracted_features/all_features.pickle', 'rb') 110 | # temp = open(path+'extracted_features/all_features_with_t5.pickle', 'rb') 111 | # temp = open(path+'extracted_features/all_features_with_roberta.pickle', 'rb') 112 | # temp = open(path+'extracted_features/all_features_with_meanedbart.pickle', 'rb') 113 | # temp = open(path+'extracted_features/an_merged/augmented_features_with_bart.pickle', 'rb') 114 | temp = open(path+'extracted_features/an_merged/features_Tbart_Vkey_Audio_sarcasm.pickle', 'rb') 115 | data = pickle.load(temp) 116 | 117 | # Normalizing class 118 | for key in list(data.keys()): 119 | for idx in ['cText', 'uText', 'cAudio', 'uAudio', 'cVideo', 'uVideo']: 120 | data[key][idx] /= np.max(abs(data[key][idx])) 121 | 122 | 123 | class ContentDataset(Dataset): 124 | 125 | def __init__(self, mapping, dataset, speaker_list): 126 | self.mapping = mapping 127 | self.dataset = dataset 128 | self.speakers_mapping = speaker_list 129 | 130 | def __len__(self): 131 | return len(self.mapping) 132 | 133 | def __getitem__(self, idx): 134 | if torch.is_tensor(idx): 135 | idx = idx.tolist() 136 | 137 | index = self.mapping.loc[idx, 'SCENE'] 138 | data = self.dataset[index] 139 | label = int(self.mapping.loc[idx, 'Emo_I'])-1 140 | spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index( 141 | self.mapping.loc[idx, 'SPEAKER'])] 142 | 143 | return data['uText'], data['cText'], data['uAudio'], data['cAudio'], data['uVideo'], data['cVideo'], spkr, label 144 | 145 | 146 | device = torch.device("cuda:"+str(args.gpu)) 147 | 148 | 149 | # flag is for returning the predictions 150 | def evaluation(loader, mod, call, report=False, flag=False): 151 | """Args: 152 | loader: 153 | It is the validation dataloader 154 | mod: 155 | It is the best model, which we have to evaluate 156 | call: 157 | call is the COMMAND to be excuted to run the forward method of the model 158 | it changed as per the modality and other possible input 159 | report: 160 | It True then the classification report for the validation set is printed 161 | flag: 162 | if True the instead of evaluation metrics, method returns the calss labels 163 | """ 164 | with torch.no_grad(): 165 | pred = [] 166 | true = [] 167 | total_loss = [] 168 | criterion = nn.CrossEntropyLoss() #Loss function for calculating validation loss 169 | criterion.to(device) 170 | seed() 171 | for batch in loader: 172 | uText = batch[0].float().to(device) 173 | cText = batch[1].float().to(device) 174 | uAudio = batch[2].float().to(device) 175 | cAudio = batch[3].float().to(device) 176 | uVideo = batch[4].float().to(device) 177 | cVideo = batch[5].float().to(device) 178 | speaker = batch[6].float().to(device) 179 | y_true = batch[7].long().to(device) 180 | del batch 181 | output = torch.softmax(eval(call), dim=1) # softmax is claculated explicitly 182 | # call is the command to be executed, since we have different combination of input modality, this decides the input by default 183 | loss = criterion(output, y_true) 184 | del 
uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 185 | 186 | total_loss.append(loss) 187 | pred.extend(output.detach().cpu().tolist()) 188 | true.extend(y_true.tolist()) 189 | if flag: 190 | # for directly predicting the emotion class label 191 | return true, np.argmax(pred, axis=1) 192 | if report: 193 | # for printing the classification report on validation set. 194 | print(classification_report(true, np.argmax(pred, axis=1))) 195 | return f1_score(true, np.argmax(pred, axis=1), average='macro'), sum(total_loss)/len(total_loss) 196 | 197 | 198 | def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False,save=False): 199 | """Args: 200 | mod : 201 | It is the mod we have to train 202 | criterion : 203 | Loss function, her we have Cross entropy loss 204 | optimizer : 205 | object of torch.optim class 206 | call: 207 | call is the COMMAND to be excuted to run the forward method of the model 208 | it changed as per the modality and other possible input 209 | train_loader: 210 | It is a instance of train dataloader 211 | valid_loader: 212 | It is a instance of validation dataloader, it is given as a input to evaluation class 213 | fold: 214 | 5 FOLD {0,1,2,3,4} 215 | e: 216 | maximum epoch 217 | patience: 218 | how many epoch to wait after the early stopping condition in satisfied 219 | report: 220 | It True then the classification report for the validation set is printed, it is given as a input to evaluation class 221 | save: 222 | If true then best model for each fold is saved 223 | 224 | """ 225 | 226 | print('-'*100) 227 | train_losses = [0] 228 | valid_losses = [0] 229 | max_f1 = 0 230 | patience_flag = 1 231 | best_epooch = 0 232 | print(fold, e, patience) 233 | 234 | while e > 0: 235 | total_loss = [] 236 | seed() 237 | for batch in train_loader: 238 | uText = batch[0].float().to(device) 239 | cText = batch[1].float().to(device) 240 | uAudio = batch[2].float().to(device) 241 | cAudio = batch[3].float().to(device) 242 | uVideo = batch[4].float().to(device) 243 | cVideo = batch[5].float().to(device) 244 | speaker = batch[6].float().to(device) 245 | y_true = batch[7].long().to(device) 246 | del batch 247 | # call is the command to be executed, since we have different combination of input modality, this decides the input by default 248 | output = eval(call) 249 | loss = criterion(output, y_true) 250 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 251 | # with torch.cuda.device(device): 252 | # torch.cuda.empty_cache() 253 | optimizer.zero_grad() 254 | total_loss.append(loss.detach().item()) 255 | loss.backward() 256 | optimizer.step() 257 | with torch.no_grad(): 258 | valid_f1, valid_loss = evaluation( 259 | valid_loader, mod, call, report, False) 260 | train_losses.append(sum(total_loss)/len(total_loss)) 261 | valid_losses.append(valid_loss) 262 | 263 | e = e-1 264 | if max_f1 < valid_f1: 265 | max_f1 = valid_f1 266 | best_model = mod 267 | best_epooch = 500-e 268 | print( 269 | f'Epooch:{best_epooch} | Train Loss: {loss.detach().item():.3f} | Valid loss: { valid_loss.detach().item():7.3f} | Valid F1: { valid_f1:7.3f}') 270 | 271 | if abs(train_losses[-2]-train_losses[-1]) < 0.0001: 272 | if patience_flag == 1: 273 | e = patience 274 | patience_flag = 0 275 | else: 276 | patience_flag = 1 277 | if save: 278 | best_model.to('cpu') 279 | torch.save(best_model.state_dict(), 'MPP_Code/saved_models/emotion/implicit' + 280 | filename+'_'+str(fold)+'.pth') 281 | return evaluation(valid_loader, best_model, call, report, True), 
best_epooch 282 | 283 | 284 | def get_command(input_modes, context_flag, speaker_flag): 285 | """ 286 | This method is used to create the COMMAND to execute the forward methof of particular model, 287 | Depending upon the input combination 288 | Args: 289 | input_modes: 290 | Input Modality {VTA, VT, VA, TA, V, T, A} 291 | context_flag : 292 | If true then "with context" else "without context" 293 | speaker_flag: 294 | if true then Speaker dependent else Speaker INdependent 295 | """ 296 | if input_modes == 'VTA': 297 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio" 298 | if context_flag == 'y': 299 | COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio" 300 | 301 | elif input_modes == 'VT': 302 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText" 303 | if context_flag == 'y': 304 | COMMAND += ",'cA':cVideo, 'cB':cText" 305 | 306 | elif input_modes == 'VA': 307 | COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio" 308 | if context_flag == 'y': 309 | COMMAND += ",'cA':cVideo, 'cB':cAudio" 310 | 311 | elif input_modes == 'TA': 312 | COMMAND = "mod(**{'uA':uText, 'uB':uAudio" 313 | if context_flag == 'y': 314 | COMMAND += ",'cA':cText, 'cB':cAudio" 315 | 316 | elif input_modes == 'T': 317 | COMMAND = "mod(**{'uA':uText" 318 | if context_flag == 'y': 319 | COMMAND += ",'cA':cText" 320 | 321 | elif input_modes == 'V': 322 | COMMAND = "mod(**{'uA':uVideo" 323 | if context_flag == 'y': 324 | COMMAND += ",'cA':cVideo" 325 | 326 | elif input_modes == 'A': 327 | COMMAND = "mod(**{'uA':uAudio" 328 | if context_flag == 'y': 329 | COMMAND += ",'cA':cAudio" 330 | if speaker_flag == 'y': 331 | COMMAND += ",'speaker_embedding':speaker})" 332 | else: 333 | COMMAND += "})" 334 | 335 | return COMMAND 336 | 337 | 338 | def get_model_and_parameters(args): 339 | """ 340 | args is an instance of argument parser 341 | which will be used to 342 | """ 343 | # Here we are sorting VTA in descending order, in order to vae consistency in the model 344 | input_modes = ''.join(reversed(sorted(list(args.mode.upper())))) 345 | 346 | parameters = {} 347 | MODEL_NAME = 'Speaker_' 348 | 349 | parameters['num_classes'] = 5 350 | 351 | if args.speaker.lower() == 'y': 352 | MODEL_NAME += 'Dependent_' 353 | parameters['n_speaker'] = len(speaker_list) 354 | else: 355 | MODEL_NAME += 'Independent_' 356 | 357 | if len(input_modes) == 3: 358 | MODEL_NAME += 'Triple_' 359 | parameters['input_embedding_A'] = 2048 360 | parameters['input_embedding_B'] = 1024 361 | parameters['input_embedding_C'] = audio_embedding_size #319 362 | 363 | elif len(input_modes) == 2: 364 | MODEL_NAME += 'Dual_' 365 | parameters['input_embedding_A'] = 2048 if input_modes[0] == 'V' else 1024 366 | parameters['input_embedding_B'] = audio_embedding_size if input_modes[1] == 'A' else 1024 367 | else: 368 | MODEL_NAME += 'Single_' 369 | parameters['input_embedding_A'] = 2048 if input_modes == 'V' else 1024 if input_modes == 'T' else audio_embedding_size 370 | 371 | MODEL_NAME += 'Mode_with' 372 | MODEL_NAME += 'out' if args.context.lower() == 'n' else '' 373 | MODEL_NAME += '_Context' 374 | 375 | MODEL_NAME = 'emotion_classification_model.' 
+ MODEL_NAME 376 | 377 | COMMAND = get_command( 378 | input_modes, args.context.lower(), args.speaker.lower()) 379 | return MODEL_NAME, parameters, COMMAND 380 | 381 | 382 | #just intializing 383 | video_embedding_size = 2048 384 | audio_embedding_size = 291 385 | 386 | text_embedding_size = 1024 387 | shared_embedding_size = 1024 388 | projection_embedding_size = 512 389 | epooch = args.epooch 390 | lr = 0.001 # args.learning_rate 391 | patience = args.patience 392 | batch_size = 128 # args.batch_size 393 | dropout = 0.5 394 | 395 | # get out model name , parameters, and command as per the arguments provided in command line 396 | MODEL_NAME, parameters, COMMAND = get_model_and_parameters(args) 397 | 398 | parameters['shared_embedding'] = shared_embedding_size 399 | parameters['projection_embedding'] = projection_embedding_size 400 | parameters['dropout'] = dropout 401 | 402 | """ This filename is used further for storing storing stats and log""" 403 | filename = args.mode 404 | filename += '_context_'+args.context.upper() 405 | filename += '_speaker_'+args.speaker.upper() 406 | 407 | """ File to store log""" 408 | f = open('MPP_Code/log/aug_complete_implicit/an_aug_'+ 409 | filename+'.txt', 'a+') 410 | """ File to store charts of clf report""" 411 | c = open('MPP_Code/charts/aug_complete_implicit/an_aug_'+ 412 | filename+'.txt', 'a+') 413 | """ Dataframe to store stats, will be saved as CSV """ 414 | stats = pd.DataFrame(columns=['dropout', 'lr', 'batch_size', 'shared_embedding_size', 415 | 'projection_embedding_size', 'epoch', 'Precision', 'Recall', 'F1']) 416 | 417 | """ 'original' variable is mode to switch between printing area 418 | if we do not want to print on log file then we will use 'original' 419 | if we want to log then 'f' ---> it will print on both termminal and log file 420 | """ 421 | original = sys.stdout 422 | 423 | sys.stdout = Tee(sys.stdout, f) 424 | 425 | print(MODEL_NAME.split('.')[1]) # printing MODEL NAME in both places 426 | 427 | sys.stdout = original 428 | 429 | """ since we are loading speaker name from dict 430 | we are sorting them in ascending order to remove randomness and make code reproducable 431 | """ 432 | speaker_list = sorted(list(mustard_input.SPEAKER.value_counts().keys())) 433 | 434 | """ 435 | These are various combination for parameters tuning (GRID SEARCH) 436 | """ 437 | for dropout in [0.2, 0.3, 0.4]: 438 | for lr in [0.001, 0.0001]: 439 | for batch_size in [128, 64]: 440 | for shared_embedding_size, projection_embedding_size in zip([2048, 1024], [1024, 256]): 441 | stat = [dropout, lr, batch_size, 442 | shared_embedding_size, projection_embedding_size] 443 | parameters['shared_embedding'] = shared_embedding_size 444 | parameters['projection_embedding'] = projection_embedding_size 445 | parameters['dropout'] = dropout 446 | 447 | 448 | # Following lists are used for calculating overall results 449 | pred_all = [] 450 | true_all = [] 451 | indexes = [] 452 | types = [] 453 | 454 | for fold in range(5): 455 | 456 | """ 457 | for 5 FOLD cross validation 458 | we have made the stratified splits explicitly 459 | this is done in order to keep consistency in different experiments (to deal with randomness) 460 | """ 461 | train = pd.read_csv( 462 | 'MPP_Code/data/splits_aug_mustard++/train_' + str(fold)+'.csv') 463 | valid = pd.read_csv( 464 | 'MPP_Code/data/splits_aug_mustard++/test_' + str(fold)+'.csv') 465 | seed() 466 | train_dataset = ContentDataset(train, data, speaker_list) 467 | seed() 468 | train_loader = DataLoader( 469 | train_dataset, 
batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 470 | seed() 471 | valid_dataset = ContentDataset(valid, data, speaker_list) 472 | seed() 473 | valid_loader = DataLoader( 474 | valid_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 475 | 476 | indexes.extend(valid['SCENE'].tolist()) 477 | types.extend(valid['SAR_T'].tolist()) 478 | 479 | seed() 480 | mod = eval(MODEL_NAME)(**parameters) # MODEL intitalization 481 | seed() 482 | criterion = nn.CrossEntropyLoss() 483 | criterion.to(device) 484 | seed() 485 | optimizer = optim.Adam( 486 | params=mod.parameters(), betas=(0.5, 0.99), lr=lr) 487 | 488 | (true, pred), epo = training(mod=mod, criterion=criterion, optimizer=optimizer, call=COMMAND, 489 | train_loader=train_loader, valid_loader=valid_loader, fold=fold, e=epooch, patience=patience) 490 | pred_all.extend(pred) 491 | true_all.extend(true) 492 | 493 | # training ends here 494 | 495 | 496 | # FOr log file emotion wisae results 497 | sys.stdout = Tee(sys.stdout, f) 498 | #printing in log file 499 | print(f'n_epooch:{epo} | dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 500 | report_dict = classification_report(true_all, pred_all, output_dict=True) 501 | 502 | report = classification_report(true_all, pred_all,digits=3) 503 | print(report) 504 | print('-'*100) 505 | sys.stdout = Tee(sys.stdout, c) 506 | print(f'dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 507 | 508 | clf_df = pd.DataFrame(report_dict).transpose() 509 | print(clf_df) 510 | print('-'*100) 511 | sys.stdout = original 512 | 513 | # FOr stats file emotion wise results 514 | stat.append(epo) 515 | stat.extend( 516 | list(map(float, report.split('\n')[-2].split(' ')[1:-1]))) 517 | stats.loc[len(stats)] = stat 518 | stats = stats.sort_values(by='F1') 519 | stats.to_csv('MPP_Code/stats/aug_complete_implicit/an_aug_'+ 520 | filename+'.csv', index=False) 521 | 522 | """ Following code can be used to save prediction """ 523 | results = [] 524 | # print(indexes) 525 | for row in zip(indexes, types, true_all, pred_all): 526 | results.append(row) 527 | 528 | results = pd.DataFrame( 529 | results, columns=['SCENE', 'TYPE', 'TRUE', 'PRED']) 530 | results.to_csv('MPP_Code/predictions/aug_complete_implicit/'+ args.mode+'/an_aug_' + 531 | filename+'_'+str(int(dropout*10))+'_'+str(batch_size)+'_'+str(shared_embedding_size)+'_'+str(len(str(lr)))+'_'+'.csv', index=False) 532 | 533 | """ Following code can be used to save Sarcasm type wise analysis (will be saved in log file)""" 534 | 535 | sys.stdout = Tee(sys.stdout, f) 536 | for ty in ['PRO', 'LIK', 'ILL', 'EMB','NONE']: 537 | t_t = results[results['TYPE'] == ty]['TRUE'].to_numpy() 538 | p_t = results[results['TYPE'] == ty]['PRED'].to_numpy() 539 | print(results['TYPE']) 540 | report1 = classification_report(t_t, p_t) 541 | print("FOR :----> ", ty) 542 | print(report1) 543 | print('-'*100) 544 | print('#'*100) 545 | print('#'*100) 546 | sys.stdout = original 547 | 548 | 549 | f.close() -------------------------------------------------------------------------------- /MPP_Code/training/execute_regression.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | The file contains code to perform hyperparameter tuning (grid-search) and training the model. 
4 | It includes log statements that get stored in the logs folder. 5 | The regression scores and corresponding hyperparameters get stored in the stats folder. 6 | It utilizes features saved in the form of pickles in the data/extracted_features folder. 7 | The extracted_features folder contains pickles with BART, T5, RoBERTa features for text. Use as necessary. 8 | 9 | ''' 10 | 11 | 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import time 16 | import sys 17 | import random 18 | import pickle 19 | import argparse 20 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 21 | from collections import defaultdict 22 | 23 | import torch 24 | import torch.nn as nn 25 | import torch.optim as optim 26 | 27 | import torchvision 28 | from torchvision import datasets, models 29 | from torch.utils.data import Dataset, DataLoader 30 | 31 | from models import emotion_regression_model 32 | 33 | """ 34 | This script is used for hyper-parameter tuning and training the model. 35 | """ 36 | class Tee(object): 37 | """ 38 | This class is used for printing the log to a file as well as on the Terminal 39 | """ 40 | def __init__(self, *files): 41 | self.files = files 42 | 43 | def write(self, obj): 44 | for f in self.files: 45 | f.write(obj) 46 | f.flush() # If you want the output to be visible immediately 47 | 48 | def flush(self): 49 | for f in self.files: 50 | f.flush() 51 | 52 | 53 | def seed(): 54 | """ This method is used for seeding the code at different points""" 55 | np.random.seed(42) 56 | random.seed(42) 57 | torch.manual_seed(42) 58 | torch.cuda.manual_seed(42) 59 | torch.backends.cudnn.enabled = False 60 | torch.backends.cudnn.deterministic = True 61 | 62 | 63 | def seed_worker(worker_id): 64 | """ This method is used for seeding the worker in the dataloader""" 65 | worker_seed = 42 66 | np.random.seed(worker_seed) 67 | random.seed(worker_seed) 68 | 69 | 70 | seed() 71 | """ The argument parser allows the script to be run from the terminal with different configurations """ 72 | argParser = argparse.ArgumentParser() 73 | argParser.add_argument("-s", "--speaker", required=True, 74 | help="Enter y/Y for Speaker Dependent else n/N") 75 | 76 | argParser.add_argument("-m", "--mode", required=True, 77 | help="VTA for Video, Text, Audio respectively") 78 | argParser.add_argument("-c", "--context", required=True, 79 | help="y/Y for Context Dependent else n/N") 80 | argParser.add_argument("-e", "--epooch", default=500, help="Number of epochs") 81 | argParser.add_argument("-l", "--learning_rate", 82 | default=0.001, help="Learning rate") 83 | argParser.add_argument("-p", "--patience", default=5, help="Patience") 84 | argParser.add_argument("-b", "--batch_size", default=64, help="Batch Size") 85 | argParser.add_argument("-cr", "--classification_report", default='n', 86 | help="Prints Classification report of Validation Set") 87 | argParser.add_argument("-gpu", "--gpu", default=0, 88 | help="Which GPU to use") 89 | argParser.add_argument("-seed", "--seed", default=42, 90 | help="SEED value") 91 | argParser.add_argument("-d", "--dropout", default=0.3, 92 | help="Dropout value") 93 | 94 | args = argParser.parse_args() 95 | 96 | ''' 97 | Loading data.
Modify the path to point to data folder 98 | Data folder would hold the text utterances in csv files, features pre-extracted, 99 | 100 | ''' 101 | path = "MPP_Code/data/" 102 | mustard_input = pd.read_csv(path+'mustard_PP_utterance.csv') 103 | print(mustard_input.columns) 104 | 105 | temp = open(path+'extracted_features/an_merged/features_Tbart_Vkey_Audio.pickle', 'rb') 106 | data = pickle.load(temp) 107 | 108 | # Normalizing class 109 | for key in list(data.keys()): 110 | for idx in ['cText', 'uText', 'cAudio', 'uAudio', 'cVideo', 'uVideo']: 111 | data[key][idx] /= np.max(abs(data[key][idx])) 112 | 113 | # Dataset class 114 | 115 | 116 | class ContentDataset(Dataset): 117 | 118 | def __init__(self, mapping, dataset, speaker_list): 119 | self.mapping = mapping 120 | self.dataset = dataset 121 | self.speakers_mapping = speaker_list 122 | 123 | def __len__(self): 124 | return len(self.mapping) 125 | 126 | def __getitem__(self, idx): 127 | if torch.is_tensor(idx): 128 | idx = idx.tolist() 129 | 130 | index = self.mapping.loc[idx, 'SCENE'] 131 | data = self.dataset[index] 132 | aro = self.mapping.loc[idx, 'ARO'] 133 | val = self.mapping.loc[idx, 'VAL'] 134 | spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index( 135 | self.mapping.loc[idx, 'SPEAKER'])] 136 | 137 | return data['uText'], data['cText'], data['uAudio'], data['cAudio'], data['uVideo'], data['cVideo'], spkr, val, aro 138 | 139 | 140 | device = torch.device("cuda:"+str(args.gpu)) 141 | 142 | 143 | def regression_report(t, p): 144 | """Args: 145 | t: 146 | True labels 147 | p: 148 | Predicted labels 149 | """ 150 | dic = defaultdict(list) 151 | 152 | t = np.array(t) 153 | p = np.array(p) 154 | # emo_map = {} # this for class wise analysis 155 | # emo_map[str(-0.5*0.89999998)[:5]] = 'Ang' 156 | # emo_map[str(-0.89999988 * -0.40000001)[:5]] = 'Sad' 157 | # emo_map[str(-0.5 * 0.40000001)[:5]] = 'Fru' 158 | # emo_map[str(-0.55000001 * 0.64999998)[:5]] = 'Rid' 159 | # emo_map[str(-0.80000001 * 0.5)[:5]] = 'Dis' 160 | 161 | for e, i in enumerate(t): 162 | dic[str(i[0]*i[1])[:5]].append(e) 163 | 164 | 165 | """ Formating the output""" 166 | print('_'*73) 167 | print('|\t|\t\tValence\t ||\t\tArousal\t\t|') 168 | print('|'+'-'*71+'|') 169 | print(f'|\t| MSE\t | MAE | R2 || MSE | MAE | R2 | ') 170 | print('|'+'-'*71+'|') 171 | r2 = 0.0000001, 0.0000001 172 | for key in list(dic.keys()): 173 | true = t[dic[str(key)[:5]]] 174 | pred = p[dic[str(key)[:5]]] 175 | mse = mean_squared_error(true, pred, multioutput='raw_values',squared=False) 176 | mae = mean_absolute_error(true, pred, multioutput='raw_values') 177 | # r2 = r2_score(true, pred, multioutput='raw_values') 178 | print( 179 | f'| { mse[0]:7.3f} |{ mae[0]:7.3f} | { r2[0]:7.3f}|| { mse[1]:7.3f}| { mae[1]:7.3f}| { r2[1]:7.3f} | ') 180 | print('-'*73) 181 | mse = mean_squared_error(t, p, multioutput='raw_values',squared=False) 182 | mae = mean_absolute_error(t, p, multioutput='raw_values') 183 | r2 = r2_score(t, p, multioutput='raw_values') 184 | print( 185 | f'| Avg\t|{ mse[0]:7.3f} |{ mae[0]:7.3f} | { r2[0]:7.3f}|| { mse[1]:7.3f}| { mae[1]:7.3f}| { r2[0]:7.3f} | ') 186 | print('|'+'_'*71+'|') 187 | 188 | 189 | # flag is for returning the predictions 190 | def evaluation(loader, mod, call, report=False, flag=False): 191 | """Args: 192 | loader: 193 | It is the validation dataloader 194 | mod: 195 | It is the best model, which we have to evaluate 196 | call: 197 | call is the COMMAND to be executed to run the forward method of the model 198 | it changed as per the modality and 
other possible input 199 | report: 200 | It True then the classification report for the validation set is printed 201 | flag: 202 | if True the instead of evaluation metrics, method returns the calss labels 203 | """ 204 | with torch.no_grad(): 205 | pred = [] 206 | true = [] 207 | 208 | # X_test.reset_index() 209 | total_loss = [] 210 | criterion = nn.SmoothL1Loss(reduction='none') #Loss function for calculating validation loss 211 | criterion.to(device) 212 | seed() 213 | for batch in loader: 214 | uText = batch[0].float().to(device) 215 | cText = batch[1].float().to(device) 216 | uAudio = batch[2].float().to(device) 217 | cAudio = batch[3].float().to(device) 218 | uVideo = batch[4].float().to(device) 219 | cVideo = batch[5].float().to(device) 220 | speaker = batch[6].float().to(device) 221 | y_true = torch.cat( 222 | (batch[7].reshape(-1, 1), batch[8].reshape(-1, 1)), dim=1).float().to(device) 223 | del batch 224 | output = eval(call) 225 | # call is the command to be executed, sice we have different combination of input modality, this decides the input by default 226 | loss = criterion(output, y_true).mean(axis=0).sum() 227 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 228 | # with torch.cuda.device(device): 229 | # torch.cuda.empty_cache() 230 | total_loss.append(loss) 231 | pred.extend(output.detach().cpu().tolist()) 232 | true.extend(y_true.tolist()) 233 | if flag: 234 | return true, pred 235 | if report: 236 | regression_report(true, pred) 237 | return mean_absolute_error(true, pred, multioutput='raw_values'), mean_squared_error(true, pred, multioutput='raw_values',squared=False), r2_score(true, pred, multioutput='raw_values'), sum(total_loss)/len(total_loss) 238 | 239 | 240 | def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False): 241 | """Args: 242 | mod : 243 | It is the mod we have to train 244 | criterion : 245 | Loss function, her we have Cross entropy loss 246 | optimizer : 247 | object of torch.optim class 248 | call: 249 | call is the COMMAND to be excuted to run the forward method of the model 250 | it changed as per the modality and other possible input 251 | train_loader: 252 | It is a instance of train dataloader 253 | valid_loader: 254 | It is a instance of validation dataloader, it is given as a input to evaluation class 255 | fold: 256 | 5 FOLD {0,1,2,3,4} 257 | e: 258 | maximum epoch 259 | patience: 260 | how many epoch to wait after the early stopping condition in satisfied 261 | report: 262 | It True then the classification report for the validation set is printed, it is given as a input to evaluation class 263 | save: 264 | If true then best model for each fold is saved 265 | 266 | """ 267 | print("#"*90) 268 | mse_losses = [0] 269 | valid_losses = [0] 270 | max_ms = 100 271 | patience_flag = 1 272 | best_epooch = 0 273 | print(fold, e, patience) 274 | print(f'| epoch\t|train-loss|valid-loss| VAL-MAE|VAL-MSE| VAL-R2 ||ARO-MAE|ARO-MSE|ARO-R2 | ') 275 | 276 | while e > 0: 277 | total_loss = [] 278 | seed() 279 | for batch in train_loader: 280 | uText = batch[0].float().to(device) 281 | cText = batch[1].float().to(device) 282 | uAudio = batch[2].float().to(device) 283 | cAudio = batch[3].float().to(device) 284 | uVideo = batch[4].float().to(device) 285 | cVideo = batch[5].float().to(device) 286 | speaker = batch[6].float().to(device) 287 | y_true = torch.cat( 288 | (batch[7].reshape(-1, 1), batch[8].reshape(-1, 1)), dim=1).float().to(device) 289 | del batch 290 | output = eval(call) 291 | loss 
= criterion(output, y_true).mean(axis=0).sum() 292 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 293 | # with torch.cuda.device(device): 294 | # torch.cuda.empty_cache() 295 | optimizer.zero_grad() 296 | total_loss.append(loss.detach().item()) 297 | loss.backward() 298 | optimizer.step() 299 | with torch.no_grad(): 300 | valid_mae, valid_mse, valid_r2, valid_loss = evaluation( 301 | valid_loader, mod, call, report, False) 302 | mse_losses.append(sum(total_loss)/len(total_loss)) 303 | valid_losses.append(valid_loss) 304 | 305 | e = e-1 306 | if max_ms > valid_loss: 307 | max_ms = valid_loss 308 | best_model = mod 309 | best_epooch = 500-e 310 | print( 311 | f'| { best_epooch}\t|{ (sum(total_loss)/len(total_loss)):7.3f} | { valid_loss:7.3f} |{ valid_mae[0]:7.3f} |{ valid_mse[0]:7.3f}|{ valid_r2[0]:7.3f}\t||{ valid_mae[1]:7.3f}|{ valid_mae[1]:7.3f}|{ valid_r2[1]:7.3f}| ') 312 | 313 | if abs(mse_losses[-2]-mse_losses[-1]) < 0.0001: 314 | 315 | if patience_flag == 1: 316 | e = patience 317 | patience_flag = 0 318 | else: 319 | patience_flag = 1 320 | return evaluation(valid_loader, best_model, call, report, True), best_epooch 321 | 322 | 323 | def get_command(input_modes, context_flag, speaker_flag): 324 | """ 325 | This method is used to create the COMMAND to execute the forward method of particular model, 326 | Depending upon the input combination 327 | Args: 328 | input_modes: 329 | Input Modality {VTA, VT, VA, TA, V, T, A} 330 | context_flag : 331 | If true then "with context" else "without context" 332 | speaker_flag: 333 | if true then Speaker dependent else Speaker INdependent 334 | """ 335 | if input_modes == 'VTA': 336 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio" 337 | if context_flag == 'y': 338 | COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio" 339 | 340 | elif input_modes == 'VT': 341 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText" 342 | if context_flag == 'y': 343 | COMMAND += ",'cA':cVideo, 'cB':cText" 344 | 345 | elif input_modes == 'VA': 346 | COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio" 347 | if context_flag == 'y': 348 | COMMAND += ",'cA':cVideo, 'cB':cAudio" 349 | 350 | elif input_modes == 'TA': 351 | COMMAND = "mod(**{'uA':uText, 'uB':uAudio" 352 | if context_flag == 'y': 353 | COMMAND += ",'cA':cText, 'cB':cAudio" 354 | 355 | elif input_modes == 'T': 356 | COMMAND = "mod(**{'uA':uText" 357 | if context_flag == 'y': 358 | COMMAND += ",'cA':cText" 359 | 360 | elif input_modes == 'V': 361 | COMMAND = "mod(**{'uA':uVideo" 362 | if context_flag == 'y': 363 | COMMAND += ",'cA':cVideo" 364 | 365 | elif input_modes == 'A': 366 | COMMAND = "mod(**{'uA':uAudio" 367 | if context_flag == 'y': 368 | COMMAND += ",'cA':cAudio" 369 | if speaker_flag == 'y': 370 | COMMAND += ",'speaker_embedding':speaker})" 371 | else: 372 | COMMAND += "})" 373 | 374 | return COMMAND 375 | 376 | 377 | def get_model_and_parameters(args): 378 | """ 379 | args is an instance of argument parser 380 | which will be used to 381 | """ 382 | # Here we are sortng VTA in descending order, in order to vae consistancy in the model 383 | input_modes = ''.join(reversed(sorted(list(args.mode.upper())))) 384 | 385 | parameters = {} 386 | MODEL_NAME = 'Speaker_' 387 | 388 | parameters['num_classes'] = 2 389 | 390 | if args.speaker.lower() == 'y': 391 | MODEL_NAME += 'Dependent_' 392 | parameters['n_speaker'] = 24 393 | else: 394 | MODEL_NAME += 'Independent_' 395 | 396 | if len(input_modes) == 3: 397 | MODEL_NAME += 'Triple_' 398 | parameters['input_embedding_A'] = 2048 399 | 
parameters['input_embedding_B'] = 1024 400 | parameters['input_embedding_C'] = 291 401 | 402 | elif len(input_modes) == 2: 403 | MODEL_NAME += 'Dual_' 404 | parameters['input_embedding_A'] = 2048 if input_modes[0] == 'V' else 1024 405 | parameters['input_embedding_B'] = 291 if input_modes[1] == 'A' else 1024 406 | else: 407 | MODEL_NAME += 'Single_' 408 | parameters['input_embedding_A'] = 2048 if input_modes == 'V' else 1024 if input_modes == 'T' else 291 409 | 410 | MODEL_NAME += 'Mode_with' 411 | MODEL_NAME += 'out' if args.context.lower() == 'n' else '' 412 | MODEL_NAME += '_Context' 413 | 414 | MODEL_NAME = 'emotion_regression_model.' + MODEL_NAME 415 | 416 | COMMAND = get_command( 417 | input_modes, args.context.lower(), args.speaker.lower()) 418 | return MODEL_NAME, parameters, COMMAND 419 | 420 | 421 | # WE ARE NOT USING THIS 422 | with open(path+'splits.pickle', "rb") as fp: 423 | split = pickle.load(fp) 424 | 425 | 426 | 427 | #just intilizing 428 | video_embedding_size = 2048 429 | audio_embedding_size = 291 430 | text_embedding_size = 1024 431 | 432 | shared_embedding_size = 1024 433 | projection_embedding_size = 512 434 | 435 | epooch = args.epooch 436 | lr = 0.001 # args.learning_rate 437 | patience = args.patience 438 | batch_size = 128 # args.batch_size 439 | dropout = 0.5 440 | 441 | 442 | 443 | # get out model name , parameters, and command as per the arguments provided in command line 444 | MODEL_NAME, parameters, COMMAND = get_model_and_parameters(args) 445 | 446 | parameters['shared_embedding'] = shared_embedding_size 447 | parameters['projection_embedding'] = projection_embedding_size 448 | parameters['dropout'] = dropout 449 | 450 | 451 | 452 | """ This filename is used further for storing storing stats and log""" 453 | filename = args.mode 454 | filename += '_context_'+args.context.upper() 455 | filename += '_speaker_'+args.speaker.upper() 456 | 457 | 458 | """ File to store log""" 459 | f = open('MPP_Code/log/lrec_regression/TBart_VKey_' + 460 | filename+'.txt', 'a+') 461 | """ Dataframe to store stats, will be saved as CSV """ 462 | stats = pd.DataFrame(columns=['dropout', 'lr', 'batch_size', 'shared_embedding_size', 463 | 'projection_embedding_size', 'epoch', 'VAL-MAE', 'VAL-MSE', 'VAL-R2', 'ARO-MAE', 'ARO-MSE', 'ARO-R2']) 464 | """ 'original' variable is mode to switch between printing area 465 | if we do not want to print on log file then we will use 'original' 466 | if we want to log then 'f' ---> it will print on both terminal and log file 467 | """ 468 | original = sys.stdout 469 | 470 | sys.stdout = Tee(sys.stdout, f) 471 | 472 | print(MODEL_NAME.split('.')[1]) 473 | 474 | sys.stdout = original 475 | 476 | """ since we are loading speaker name from dict 477 | we are sorting them in ascending order to remove randomness and make code reproducible 478 | """ 479 | speaker_list = sorted(list(mustard_input.SPEAKER.value_counts().keys())) 480 | 481 | """ 482 | These are various combination for parameters tuning (GRID SEARCH) 483 | """ 484 | for dropout in [0.2,0.3,0.4]: 485 | for lr in [0.001,0.0001]: 486 | for batch_size in [64,128]: 487 | for shared_embedding_size, projection_embedding_size in zip([2048, 1024], [1024, 256]): 488 | stat = [dropout, lr, batch_size, 489 | shared_embedding_size, projection_embedding_size] 490 | parameters['shared_embedding'] = shared_embedding_size 491 | parameters['projection_embedding'] = projection_embedding_size 492 | parameters['dropout'] = dropout 493 | 494 | pred_all = [] 495 | true_all = [] 496 | indexes = [] 497 | types = [] 
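# NOTE (editorial sketch, not part of the original script): COMMAND, built above by
# get_model_and_parameters()/get_command(), is a plain string that training() and
# evaluation() later run via eval(call); the names uVideo, uText, uAudio, cVideo,
# cText, cAudio and speaker resolve to the batch tensors that are local at that point.
# For illustration, launching the script with "-m VTA -c y -s n" produces roughly
#   mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio,'cA':cVideo, 'cB':cText, 'cC':cAudio})
# whereas "-m TA -c n -s y" produces
#   mod(**{'uA':uText, 'uB':uAudio,'speaker_embedding':speaker})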
498 | 499 | for fold in range(5): 500 | 501 | """ 502 | for 5 FOLD cross validation 503 | we have made the stratified splits explicitly 504 | this is done in order to keep consistency in different experiments (to deal with randomness) 505 | """ 506 | train = pd.read_csv( 507 | 'MPP_Code/data/val_aro_splits/train_' + str(fold)+'.csv') 508 | valid = pd.read_csv( 509 | 'MPP_Code/data/val_aro_splits/test_' + str(fold)+'.csv') 510 | seed() 511 | train_dataset = ContentDataset(train, data, speaker_list) 512 | seed() 513 | train_loader = DataLoader( 514 | train_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 515 | seed() 516 | valid_dataset = ContentDataset(valid, data, speaker_list) 517 | seed() 518 | valid_loader = DataLoader( 519 | valid_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 520 | 521 | indexes.extend(valid['SCENE'].tolist()) 522 | types.extend(valid['SAR_T'].tolist()) 523 | 524 | seed() 525 | mod = eval(MODEL_NAME)(**parameters) 526 | mod.to(device) 527 | # print(mod) 528 | seed() 529 | criterion = nn.SmoothL1Loss(reduction='none') 530 | criterion.to(device) 531 | seed() 532 | optimizer = optim.Adam( 533 | params=mod.parameters(), betas=(0.5, 0.99), lr=lr) 534 | 535 | (true, pred), epo = training(mod=mod, criterion=criterion, optimizer=optimizer, call=COMMAND, 536 | train_loader=train_loader, valid_loader=valid_loader, fold=fold, e=epooch, patience=patience) 537 | pred_all.extend(pred) 538 | true_all.extend(true) 539 | # training ends here 540 | 541 | 542 | # FOr log file emotion wisae results 543 | sys.stdout = Tee(sys.stdout, f) 544 | 545 | print(f'n_epooch:{epo} | dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 546 | regression_report(true_all, pred_all) 547 | print('-'*100) 548 | print('-'*100) 549 | sys.stdout = original 550 | mse = mean_absolute_error( 551 | true_all, pred_all, multioutput='raw_values') 552 | mae = mean_squared_error( 553 | true_all, pred_all, multioutput='raw_values',squared=False) 554 | r2 = r2_score(true_all, pred_all, multioutput='raw_values') 555 | 556 | 557 | # FOr stats file emotion wise results 558 | stat.extend( 559 | [epo, mae[0], mse[0], r2[0], mae[1], mse[1], r2[1]]) 560 | stats.loc[len(stats)] = stat 561 | stats.to_csv('MPP_Code/stats/lrec_regression/TBart_Vkey_' + 562 | filename+'.csv', index=False) 563 | 564 | 565 | """ Following code can be used to save prediction """ 566 | results = [] 567 | 568 | # print(indexes) 569 | for row in zip(indexes, types, true_all, pred_all): 570 | results.append( 571 | [[0], row[1], row[2][0], row[3][0], row[2][1], row[3][1]]) 572 | 573 | results = pd.DataFrame( 574 | results, columns=['SCENE', 'TYPE', 'TRUE_V', 'PRED_V', 'TRUE_A', 'PRED_A']) 575 | results.to_csv('MPP_Code/predictions/lrec_regression/' + args.mode+'/' + 576 | filename+'_'+str(int(dropout*10))+'_'+str(batch_size)+'_'+str(shared_embedding_size)+'_'+str(len(str(lr*100)))+'_'+'.csv', index=False) 577 | 578 | 579 | """ Following code can be used to save Sarcasm type wise analysis (will be saved in log file)""" 580 | sys.stdout = Tee(sys.stdout, f) 581 | 582 | for ty in ['PRO', 'LIK', 'ILL', 'EMB']: 583 | t_t = results[results['TYPE'] == ty][[ 584 | 'TRUE_V', 'TRUE_A']].to_numpy() 585 | p_t = results[results['TYPE'] == 586 | ty][['PRED_V', 'PRED_A']].to_numpy() 587 | 588 | print("FOR :----> ", ty) 589 | regression_report(t_t, p_t) 590 | print('-'*100) 591 | print('#'*100) 592 
| print('#'*100) 593 | sys.stdout = original 594 | 595 | 596 | f.close() 597 | -------------------------------------------------------------------------------- /MPP_Code/training/execute_sarcasm_mustard++.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | import sys 5 | import random 6 | import pickle 7 | import argparse 8 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | 14 | import torchvision 15 | from torchvision import datasets, models 16 | from torch.utils.data import Dataset, DataLoader 17 | 18 | from models import emotion_classification_model 19 | 20 | """ 21 | This script is used for hyper-parameter tuning and training the model. 22 | """ 23 | class Tee(object): 24 | """ 25 | This class is used for printing the logs in file as well as on Terminal 26 | """ 27 | def __init__(self, *files): 28 | self.files = files 29 | 30 | def write(self, obj): 31 | for f in self.files: 32 | f.write(obj) 33 | f.flush() # If you want the output to be visible immediately 34 | 35 | def flush(self): 36 | for f in self.files: 37 | f.flush() 38 | 39 | 40 | def seed(): 41 | """ This method is used for seeding the code and different points""" 42 | np.random.seed(42) 43 | random.seed(42) 44 | torch.manual_seed(42) 45 | torch.cuda.manual_seed(42) 46 | torch.backends.cudnn.enabled = False 47 | torch.backends.cudnn.deterministic = True 48 | 49 | 50 | def seed_worker(worker_id): 51 | """ This method is used for seeding the worker in the dataloader""" 52 | worker_seed = 42 53 | np.random.seed(worker_seed) 54 | random.seed(worker_seed) 55 | 56 | 57 | seed() 58 | """ argument parser is used for running the script from terminal, makes it robust """ 59 | argParser = argparse.ArgumentParser() 60 | argParser.add_argument("-s", "--speaker", required=True, 61 | help="Enter y/Y for Speaker Dependent else n/N") 62 | 63 | argParser.add_argument("-m", "--mode", required=True, 64 | help="VTA for Video, Text, Audio repectively") 65 | argParser.add_argument("-c", "--context", required=True, 66 | help="y/Y for Context Dependent else n/N") 67 | argParser.add_argument("-e", "--epoch", default=500, help="Number of epoch") 68 | argParser.add_argument("-l", "--learning_rate", 69 | default=0.001, help="Learning rate") 70 | argParser.add_argument("-p", "--patience", default=5, help="Patience") 71 | argParser.add_argument("-b", "--batch_size", default=64, help="Batch Size") 72 | argParser.add_argument("-cr", "--classification_report", default='n', 73 | help="Prints Classification report of Validation Set ") 74 | argParser.add_argument("-gpu", "--gpu", default=0, 75 | help="Which GPU to use") 76 | argParser.add_argument("-seed", "--seed", default=42, 77 | help="SEED value") 78 | argParser.add_argument("-d", "--dropout", default=0.3, 79 | help="Dropout value") 80 | argParser.add_argument("-pr", "--projection", default=256, 81 | help="Projection embedding size") 82 | argParser.add_argument("-sh", "--shared", default=1024, 83 | help="Shared embedding size") 84 | args = argParser.parse_args() 85 | 86 | # Loading data 87 | path = "MPP_Code/data/" 88 | mustard_input = pd.read_csv('MPP_Code/data/final_datasets/mustard++_sarcasm_detection.csv', index_col=0) 89 | temp = open(path+'extracted_features/an_merged/features_Tbart_Vkey_Audio_sarcasm.pickle', 'rb') 90 | data = pickle.load(temp) 91 | 92 | # Normalizing class 
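# NOTE (editorial sketch): the loop below rescales every stored feature vector by its
# maximum absolute value, so each of cText/uText/cAudio/uAudio/cVideo/uVideo ends up
# in the range [-1, 1]. A minimal illustration with toy values (not real features):
#   toy = np.array([2.0, -4.0, 1.0])
#   toy / np.max(abs(toy))   # -> array([ 0.5 , -1.  ,  0.25])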
93 | for key in list(data.keys()): 94 | for idx in ['cText', 'uText', 'cAudio', 'uAudio', 'cVideo', 'uVideo']: 95 | data[key][idx] /= np.max(abs(data[key][idx])) 96 | 97 | # Dataset class 98 | class ContentDataset(Dataset): 99 | 100 | def __init__(self, mapping, dataset, speaker_list): 101 | self.mapping = mapping 102 | self.dataset = dataset 103 | self.speakers_mapping = speaker_list 104 | 105 | def __len__(self): 106 | return len(self.mapping) 107 | 108 | def __getitem__(self, idx): 109 | if torch.is_tensor(idx): 110 | idx = idx.tolist() 111 | 112 | index = self.mapping.loc[idx, 'SCENE'] 113 | data = self.dataset[index] 114 | label = int(self.mapping.loc[idx, 'SAR']) 115 | spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index( 116 | self.mapping.loc[idx, 'SPEAKER'])] 117 | 118 | return data['uText'], data['cText'], data['uAudio'], data['cAudio'], data['uVideo'], data['cVideo'], spkr, label 119 | 120 | 121 | device = torch.device("cuda:"+str(args.gpu)) 122 | 123 | 124 | # flag is for returning the predictions 125 | def evaluation(loader, mod, call, report=False, flag=False): 126 | """Args: 127 | loader: 128 | It is the validation dataloader 129 | mod: 130 | It is the best model, which we have to evaluate 131 | call: 132 | call is the COMMAND to be excuted to run the forward method of the model 133 | it changed as per the modality and other possible input 134 | report: 135 | If True then the classification report for the validation set is printed 136 | flag: 137 | if True the instead of evaluation metrics, method returns the class labels (predictions) 138 | """ 139 | with torch.no_grad(): 140 | pred = [] 141 | true = [] 142 | total_loss = [] 143 | criterion = nn.CrossEntropyLoss() 144 | criterion.to(device) 145 | seed() 146 | for batch in loader: 147 | uText = batch[0].float().to(device) 148 | cText = batch[1].float().to(device) 149 | uAudio = batch[2].float().to(device) 150 | cAudio = batch[3].float().to(device) 151 | uVideo = batch[4].float().to(device) 152 | cVideo = batch[5].float().to(device) 153 | speaker = batch[6].float().to(device) 154 | y_true = batch[7].long().to(device) 155 | del batch 156 | output = torch.softmax(eval(call), dim=1) 157 | loss = criterion(output, y_true) 158 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 159 | # with torch.cuda.device(device): 160 | # torch.cuda.empty_cache() 161 | total_loss.append(loss) 162 | pred.extend(output.detach().cpu().tolist()) 163 | true.extend(y_true.tolist()) 164 | if flag: 165 | return true, np.argmax(pred, axis=1) 166 | if report: 167 | print(classification_report(true, np.argmax(pred, axis=1), digits=3)) 168 | 169 | 170 | return f1_score(true, np.argmax(pred, axis=1), average='macro'), sum(total_loss)/len(total_loss) 171 | 172 | 173 | # def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False): 174 | def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False,save=True): 175 | """Args: 176 | mod : 177 | It is the mod we have to train 178 | criterion : 179 | Loss function, here we have Cross entropy loss 180 | optimizer : 181 | object of torch.optim class 182 | call: 183 | call is the COMMAND to be excuted to run the forward method of the model 184 | it changed as per the modality and other possible input 185 | train_loader: 186 | It is a instance of train dataloader 187 | valid_loader: 188 | It is a instance of validation dataloader, it is given as a input to evaluation class 189 | fold: 190 | 5 
FOLD {0,1,2,3,4} 191 | e: 192 | maximum epoch 193 | patience: 194 | how many epoch to wait after the early stopping condition in satisfied 195 | report: 196 | It True then the classification report for the validation set is printed, it is given as a input to evaluation class 197 | save: 198 | If true then best model for each fold is saved 199 | 200 | """ 201 | print('-'*100) 202 | train_losses = [0] 203 | valid_losses = [0] 204 | max_f1 = 0 205 | patience_flag = 1 206 | best_epoch = 0 207 | print(fold, e, patience) 208 | 209 | while e > 0: 210 | total_loss = [] 211 | seed() 212 | for batch in train_loader: 213 | uText = batch[0].float().to(device) 214 | cText = batch[1].float().to(device) 215 | uAudio = batch[2].float().to(device) 216 | cAudio = batch[3].float().to(device) 217 | uVideo = batch[4].float().to(device) 218 | cVideo = batch[5].float().to(device) 219 | speaker = batch[6].float().to(device) 220 | y_true = batch[7].long().to(device) 221 | del batch 222 | output = eval(call) 223 | loss = criterion(output, y_true) 224 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 225 | # with torch.cuda.device(device): 226 | # torch.cuda.empty_cache() 227 | optimizer.zero_grad() 228 | total_loss.append(loss.detach().item()) 229 | loss.backward() 230 | optimizer.step() 231 | with torch.no_grad(): 232 | valid_f1, valid_loss = evaluation( 233 | valid_loader, mod, call, report, False) 234 | train_losses.append(sum(total_loss)/len(total_loss)) 235 | valid_losses.append(valid_loss) 236 | 237 | e = e-1 238 | if max_f1 < valid_f1: 239 | max_f1 = valid_f1 240 | best_model = mod 241 | best_epoch = 500-e 242 | print( 243 | f'Epoch:{best_epoch} | Train Loss: {loss.detach().item():.3f} | Valid loss: { valid_loss.detach().item():7.3f} | Valid F1: { valid_f1:7.3f}') 244 | 245 | if abs(train_losses[-2]-train_losses[-1]) < 0.0001: 246 | if patience_flag == 1: 247 | e = patience 248 | patience_flag = 0 249 | else: 250 | patience_flag = 1 251 | 252 | if save: 253 | best_model.to(device) 254 | torch.save(best_model.state_dict(), 'MPP_Code/saved_models/sarc/' + 255 | filename+'_'+str(fold)+'.pth') 256 | 257 | return evaluation(valid_loader, best_model, call, report, True), best_epoch 258 | 259 | 260 | def get_command(input_modes, context_flag, speaker_flag): 261 | """ 262 | This method is used to create the COMMAND to execute the forward methof of particular model, 263 | Depending upon the input combination 264 | Args: 265 | input_modes: 266 | Input Modality {VTA, VT, VA, TA, V, T, A} 267 | context_flag : 268 | If true then "with context" else "without context" 269 | speaker_flag: 270 | if true then Speaker dependent else Speaker INdependent 271 | """ 272 | if input_modes == 'VTA': 273 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio" 274 | if context_flag == 'y': 275 | COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio" 276 | 277 | elif input_modes == 'VT': 278 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText" 279 | if context_flag == 'y': 280 | COMMAND += ",'cA':cVideo, 'cB':cText" 281 | 282 | elif input_modes == 'VA': 283 | COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio" 284 | if context_flag == 'y': 285 | COMMAND += ",'cA':cVideo, 'cB':cAudio" 286 | 287 | elif input_modes == 'TA': 288 | COMMAND = "mod(**{'uA':uText, 'uB':uAudio" 289 | if context_flag == 'y': 290 | COMMAND += ",'cA':cText, 'cB':cAudio" 291 | 292 | elif input_modes == 'T': 293 | COMMAND = "mod(**{'uA':uText" 294 | if context_flag == 'y': 295 | COMMAND += ",'cA':cText" 296 | 297 | elif input_modes == 'V': 298 | COMMAND = 
"mod(**{'uA':uVideo" 299 | if context_flag == 'y': 300 | COMMAND += ",'cA':cVideo" 301 | 302 | elif input_modes == 'A': 303 | COMMAND = "mod(**{'uA':uAudio" 304 | if context_flag == 'y': 305 | COMMAND += ",'cA':cAudio" 306 | if speaker_flag == 'y': 307 | COMMAND += ",'speaker_embedding':speaker})" 308 | else: 309 | COMMAND += "})" 310 | 311 | return COMMAND 312 | 313 | """ since we are loading speaker name from dict 314 | we are sorting them in ascending order to remove randomness and make code reproducible 315 | """ 316 | speaker_list = sorted(list(mustard_input.SPEAKER.value_counts().keys())) 317 | 318 | 319 | def get_model_and_parameters(args): 320 | """ 321 | args is an instance of argument parser 322 | which will be used to 323 | """ 324 | # Here we are sortng VTA in descending order, in order to vae consistancy in the model 325 | input_modes = ''.join(reversed(sorted(list(args.mode.upper())))) 326 | 327 | parameters = {} 328 | MODEL_NAME = 'Speaker_' 329 | 330 | parameters['num_classes'] = 2 331 | 332 | if args.speaker.lower() == 'y': 333 | MODEL_NAME += 'Dependent_' 334 | parameters['n_speaker'] = len(speaker_list) 335 | else: 336 | MODEL_NAME += 'Independent_' 337 | 338 | if len(input_modes) == 3: 339 | MODEL_NAME += 'Triple_' 340 | parameters['input_embedding_A'] = 2048 341 | parameters['input_embedding_B'] = 1024 342 | parameters['input_embedding_C'] = 291 343 | 344 | elif len(input_modes) == 2: 345 | MODEL_NAME += 'Dual_' 346 | parameters['input_embedding_A'] = 2048 if input_modes[0] == 'V' else 1024 347 | parameters['input_embedding_B'] = 291 if input_modes[1] == 'A' else 1024 348 | else: 349 | MODEL_NAME += 'Single_' 350 | parameters['input_embedding_A'] = 2048 if input_modes == 'V' else 1024 if input_modes == 'T' else 291 351 | 352 | MODEL_NAME += 'Mode_with' 353 | MODEL_NAME += 'out' if args.context.lower() == 'n' else '' 354 | MODEL_NAME += '_Context' 355 | 356 | MODEL_NAME = 'emotion_classification_model.' 
+ MODEL_NAME 357 | 358 | COMMAND = get_command( 359 | input_modes, args.context.lower(), args.speaker.lower()) 360 | return MODEL_NAME, parameters, COMMAND 361 | 362 | # WE ARE NOT USING THIS 363 | with open(path+'splits.pickle', "rb") as fp: 364 | split = pickle.load(fp) 365 | 366 | #just intilizing 367 | video_embedding_size = 2048 368 | audio_embedding_size = 291 369 | text_embedding_size = 1024 370 | 371 | shared_embedding_size = 1024 372 | projection_embedding_size = 512 373 | epoch = args.epoch 374 | lr = 0.001 # args.learning_rate 375 | patience = args.patience 376 | batch_size = 128 # args.batch_size 377 | dropout = 0.5 378 | 379 | """ 380 | These are various hyperparameters, if being set from the execution helper script uncomment 381 | """ 382 | dropout = float(args.dropout) 383 | lr = float(args.learning_rate) 384 | batch_size = int(args.batch_size) 385 | shared_embedding_size = int(args.shared) 386 | projection_embedding_size = int(args.projection) 387 | 388 | # get out model name , parameters, and command as per the arguments provided in command line 389 | MODEL_NAME, parameters, COMMAND = get_model_and_parameters(args) 390 | 391 | parameters['shared_embedding'] = shared_embedding_size 392 | parameters['projection_embedding'] = projection_embedding_size 393 | parameters['dropout'] = dropout 394 | 395 | 396 | """ This filename is used further for storing storing stats and log""" 397 | filename = args.mode 398 | filename += '_context_'+args.context.upper() 399 | filename += '_speaker_'+args.speaker.upper() 400 | 401 | 402 | """ File to store log""" 403 | f = open('MPP_Code/log/sarcasm/an_lrec_' + 404 | filename+'.txt', 'a+') 405 | """ File to store charts of clf report""" 406 | c = open('MPP_Code/charts/sarcasm/an_lrec_'+ 407 | filename+'.txt', 'a+') 408 | # f = open('MPP_Code/rough/' + 409 | # filename+'_log.txt', 'w') 410 | # csv = open('MPP_Code/stats'+filename+'.csv', 'w') 411 | 412 | """ Dataframe to store stats, will be saved as CSV """ 413 | stats = pd.DataFrame(columns=['dropout', 'lr', 'batch_size', 'shared_embedding_size', 414 | 'projection_embedding_size', 'epoch', 'Precision', 'Recall', 'F1']) 415 | """ 'original' variable is mode to swithch between printing area 416 | if we do not want to print on log file then we will use 'original' 417 | if we want to log then 'f' ---> it will print on both termminal and log file 418 | """ 419 | original = sys.stdout 420 | 421 | sys.stdout = Tee(sys.stdout, f) 422 | 423 | print(MODEL_NAME.split('.')[1]) 424 | 425 | sys.stdout = original 426 | 427 | 428 | # for dropout in [0.2]: 429 | # for lr in [0.001]: 430 | # for batch_size in [64]: 431 | # for shared_embedding_size, projection_embedding_size in zip([2048], [1024]): 432 | """ 433 | These are various combination for parameters tuning (GRID SEARCH) 434 | """ 435 | # for dropout in [0.2, 0.3, 0.4, 0.5]: 436 | # for lr in [0.001, 0.0001, 0.00001]: 437 | # for batch_size in [50, 128]: 438 | # for shared_embedding_size, projection_embedding_size in zip([2048, 1024], [1024, 256]): 439 | # for dropout in [0.2,0.3,0.4]: 440 | # for lr in [0.001,0.0001]: 441 | # for batch_size in [64,128]: 442 | # for shared_embedding_size, projection_embedding_size in zip([2048,1024], [1024,256]): 443 | stat = [dropout, lr, batch_size, shared_embedding_size, projection_embedding_size] 444 | parameters['shared_embedding'] = shared_embedding_size 445 | parameters['projection_embedding'] = projection_embedding_size 446 | parameters['dropout'] = dropout 447 | 448 | 449 | # Following lists are used for 
canculating overall results 450 | pred_all = [] 451 | true_all = [] 452 | indexes = [] 453 | types = [] 454 | 455 | for fold in range(5): 456 | """ 457 | for 5 FOLS cross validation 458 | we have made the stritifued splits explicitly 459 | this is done in order to keep consistency in different experiments (to deal with randomness) 460 | """ 461 | train = pd.read_csv( 462 | 'MPP_Code/data/split_mustard_pp_sarcasm/train_' + str(fold)+'.csv') 463 | valid = pd.read_csv( 464 | 'MPP_Code/data/split_mustard_pp_sarcasm/test_' + str(fold)+'.csv') 465 | seed() 466 | train_dataset = ContentDataset(train, data, speaker_list) 467 | seed() 468 | train_loader = DataLoader( 469 | train_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 470 | seed() 471 | valid_dataset = ContentDataset(valid, data, speaker_list) 472 | seed() 473 | valid_loader = DataLoader( 474 | valid_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 475 | 476 | indexes.extend(valid['SCENE'].tolist()) 477 | # types.extend(valid['SAR_T'].tolist()) 478 | 479 | seed() 480 | mod = eval(MODEL_NAME)(**parameters) 481 | mod.to(device) 482 | # print(mod) 483 | seed() 484 | criterion = nn.CrossEntropyLoss() 485 | criterion.to(device) 486 | seed() 487 | optimizer = optim.Adam( 488 | params=mod.parameters(), betas=(0.5, 0.99), lr=lr) 489 | 490 | (true, pred), epo = training(mod=mod, criterion=criterion, optimizer=optimizer, call=COMMAND, 491 | train_loader=train_loader, valid_loader=valid_loader, fold=fold, e=epoch, patience=patience) 492 | pred_all.extend(pred) 493 | true_all.extend(true) 494 | 495 | # training ends here 496 | 497 | 498 | # FOr log file emotion wisae results 499 | sys.stdout = Tee(sys.stdout, f) 500 | 501 | print(f'n_epoch:{epo} | dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 502 | report_dict = classification_report(true_all, pred_all, output_dict=True) 503 | 504 | report = classification_report(true_all, pred_all, digits=3) 505 | print(report) 506 | print('-'*100) 507 | sys.stdout = Tee(sys.stdout, c) 508 | # print(sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True)) 509 | print(f'dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 510 | 511 | clf_df = pd.DataFrame(report_dict).transpose() 512 | print(clf_df) 513 | print('-'*100) 514 | 515 | sys.stdout = original 516 | 517 | 518 | # FOr stats file emotion wise results 519 | stat.append(epo) 520 | stat.extend( 521 | list(map(float, report.split('\n')[-2].split(' ')[1:-1]))) 522 | stats.loc[len(stats)] = stat 523 | stats = stats.sort_values(by='F1') 524 | stats.to_csv('MPP_Code/stats/sarcasm/an_lrec_' + 525 | filename+'.csv', index=False) 526 | 527 | 528 | """ Following code can be used to save prediction """ 529 | results = [] 530 | # print(indexes) 531 | for row in zip(indexes, true_all, pred_all): 532 | results.append(row) 533 | 534 | results = pd.DataFrame( 535 | results, columns=['KEY', 'TRUE', 'PRED']) 536 | results.to_csv('MPP_Code/predictions/sarcasm/' +args.mode+'/best_an_lrec_' + 537 | filename+'_'+str(int(dropout*10))+'_'+str(batch_size)+'_'+str(shared_embedding_size)+'_'+str(len(str(lr*100)))+'_'+'.csv', index=False) 538 | 539 | sys.stdout = Tee(sys.stdout, f) 540 | 541 | print('#'*100) 542 | sys.stdout = original 543 | 544 | 545 | f.close() 546 | 
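# NOTE (editorial sketch; flag values below are illustrative, not prescribed by the script):
# a typical invocation from the repository root could look like
#   python MPP_Code/training/execute_sarcasm_mustard++.py -s n -m VTA -c y \
#       -e 500 -p 5 -b 128 -l 0.001 -d 0.2 -sh 1024 -pr 256 -gpu 0
# Because training() is called with save=True by default in this script, the best model
# for each fold is written to MPP_Code/saved_models/sarc/<filename>_<fold>.pth. It can
# later be restored for inference roughly as follows (reusing the same MODEL_NAME,
# parameters, filename and device defined above):
#   mod = eval(MODEL_NAME)(**parameters)
#   mod.load_state_dict(torch.load('MPP_Code/saved_models/sarc/' + filename + '_0.pth',
#                                  map_location=device))
#   mod.to(device).eval()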
-------------------------------------------------------------------------------- /MPP_Code/training/execute_sarcasm_mustard.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import sys 7 | import random 8 | import pickle 9 | import argparse 10 | from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | 16 | import torchvision 17 | from torchvision import datasets, models 18 | from torch.utils.data import Dataset, DataLoader 19 | 20 | from models import emotion_classification_model 21 | 22 | """ 23 | This script is used for hyper-parameter tuning and training the model. 24 | """ 25 | 26 | class Tee(object): 27 | """ 28 | This class is used for printing the logs in the file as well as on the Terminal 29 | """ 30 | def __init__(self, *files): 31 | self.files = files 32 | 33 | def write(self, obj): 34 | for f in self.files: 35 | f.write(obj) 36 | f.flush() # If you want the output to be visible immediately 37 | 38 | def flush(self): 39 | for f in self.files: 40 | f.flush() 41 | 42 | 43 | def seed(): 44 | """ This method is used for seeding the code and different points""" 45 | np.random.seed(42) 46 | random.seed(42) 47 | torch.manual_seed(42) 48 | torch.cuda.manual_seed(42) 49 | torch.backends.cudnn.enabled = False 50 | torch.backends.cudnn.deterministic = True 51 | 52 | 53 | def seed_worker(worker_id): 54 | """ This method is used for seeding the worker in the dataloader""" 55 | worker_seed = 42 56 | np.random.seed(worker_seed) 57 | random.seed(worker_seed) 58 | 59 | 60 | seed() 61 | """ argument parser is used for running the script from terminal, makes it robust """ 62 | argParser = argparse.ArgumentParser() 63 | argParser.add_argument("-s", "--speaker", required=True, 64 | help="Enter y/Y for Speaker Dependent else n/N") 65 | 66 | argParser.add_argument("-m", "--mode", required=True, 67 | help="VTA for Video, Text, Audio repectively") 68 | argParser.add_argument("-c", "--context", required=True, 69 | help="y/Y for Context Dependent else n/N") 70 | argParser.add_argument("-e", "--epooch", default=500, help="Number of epooch") 71 | argParser.add_argument("-l", "--learning_rate", 72 | default=0.001, help="Learning rate") 73 | argParser.add_argument("-p", "--patience", default=5, help="Patience") 74 | argParser.add_argument("-b", "--batch_size", default=64, help="Batch Size") 75 | argParser.add_argument("-cr", "--classification_report", default='n', 76 | help="Prints Classification report of Validation Set ") 77 | argParser.add_argument("-gpu", "--gpu", default=0, 78 | help="Which GPU to use") 79 | argParser.add_argument("-seed", "--seed", default=42, 80 | help="SEED value") 81 | argParser.add_argument("-d", "--dropout", default=0.3, 82 | help="Dropout value") 83 | 84 | args = argParser.parse_args() 85 | 86 | # Loading data 87 | path = "MPP_Code/data/" 88 | xx = pd.read_csv("/home/development/apoorvan/AN_MTP/SER/data/" + 89 | 'MUStARD-Final.csv') 90 | dd = [] 91 | for group in xx.groupby('KEY'): 92 | dd.append(group[1].values[-1]) 93 | mustard_input = pd.DataFrame(dd, columns=xx.columns) 94 | print(mustard_input.columns) 95 | 96 | temp = open(path+'extracted_features/mustard_all_features.pickle', 'rb') 97 | data = pickle.load(temp) 98 | 99 | # Normalizing class 100 | for key in list(data.keys()): 101 | for idx in ['cText', 'uText', 'cAudio', 'uAudio', 'cVideo', 
'uVideo']: 102 | data[key][idx] /= np.max(abs(data[key][idx])) 103 | 104 | # Dataset class 105 | 106 | 107 | class ContentDataset(Dataset): 108 | 109 | def __init__(self, mapping, dataset, speaker_list): 110 | self.mapping = mapping 111 | self.dataset = dataset 112 | self.speakers_mapping = speaker_list 113 | 114 | def __len__(self): 115 | return len(self.mapping) 116 | 117 | def __getitem__(self, idx): 118 | if torch.is_tensor(idx): 119 | idx = idx.tolist() 120 | 121 | index = self.mapping.loc[idx, 'KEY'] 122 | data = self.dataset[index] 123 | label = int(self.mapping.loc[idx, 'SARCASM']) 124 | spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index( 125 | self.mapping.loc[idx, 'SPEAKER'])] 126 | 127 | return data['uText'], data['cText'], data['uAudio'], data['cAudio'], data['uVideo'], data['cVideo'], spkr, label 128 | 129 | 130 | device = torch.device("cuda:"+str(args.gpu)) 131 | 132 | 133 | # flag is for returning the predictions 134 | def evaluation(loader, mod, call, report=False, flag=False): 135 | """Args: 136 | loader: 137 | It is the validation dataloader 138 | mod: 139 | It is the best model, which we have to evaluate 140 | call: 141 | call is the COMMAND to be executed to run the forward method of the model 142 | it changes as per the modality and other possible inputs 143 | report: 144 | If True then the classification report for the validation set is printed 145 | flag: 146 | if True then instead of evaluation metrics, the method returns the class labels 147 | """ 148 | with torch.no_grad(): 149 | pred = [] 150 | true = [] 151 | 152 | # X_test.reset_index() 153 | total_loss = [] 154 | criterion = nn.CrossEntropyLoss() 155 | criterion.to(device) 156 | seed() 157 | for batch in loader: 158 | uText = batch[0].float().to(device) 159 | cText = batch[1].float().to(device) 160 | uAudio = batch[2].float().to(device) 161 | cAudio = batch[3].float().to(device) 162 | uVideo = batch[4].float().to(device) 163 | cVideo = batch[5].float().to(device) 164 | speaker = batch[6].float().to(device) 165 | y_true = batch[7].long().to(device) 166 | del batch 167 | output = torch.softmax(eval(call), dim=1) 168 | loss = criterion(output, y_true) 169 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 170 | total_loss.append(loss) 171 | pred.extend(output.detach().cpu().tolist()) 172 | true.extend(y_true.tolist()) 173 | if flag: 174 | return true, np.argmax(pred, axis=1) 175 | if report: 176 | print(classification_report(true, np.argmax(pred, axis=1), digits=3)) 177 | # plot(true, np.argmax(pred, axis=1), '5FOLD_MULTITASK') 178 | return f1_score(true, np.argmax(pred, axis=1), average='macro'), sum(total_loss)/len(total_loss) 179 | 180 | 181 | def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False, save=False): 182 | """Args: 183 | mod : 184 | It is the model we have to train 185 | criterion : 186 | Loss function, here we use cross-entropy loss 187 | optimizer : 188 | object of torch.optim class 189 | call: 190 | call is the COMMAND to be executed to run the forward method of the model 191 | it changes as per the modality and other possible inputs 192 | train_loader: 193 | It is an instance of the train dataloader 194 | valid_loader: 195 | It is an instance of the validation dataloader, it is given as an input to the evaluation method 196 | fold: 197 | 5 FOLD {0,1,2,3,4} 198 | e: 199 | maximum epoch 200 | patience: 201 | how many epochs to wait after the early stopping condition is satisfied 202 | report: 203 | If True then the classification report for
the validation set is printed, it is given as an input to the evaluation method 204 | save: 205 | If True then the best model for each fold is saved 206 | 207 | """ 208 | print('-'*100) 209 | train_losses = [0] 210 | valid_losses = [0] 211 | max_f1 = 0 212 | patience_flag = 1 213 | best_epooch = 0 214 | print(fold, e, patience) 215 | 216 | while e > 0: 217 | total_loss = [] 218 | seed() 219 | for batch in train_loader: 220 | uText = batch[0].float().to(device) 221 | cText = batch[1].float().to(device) 222 | uAudio = batch[2].float().to(device) 223 | cAudio = batch[3].float().to(device) 224 | uVideo = batch[4].float().to(device) 225 | cVideo = batch[5].float().to(device) 226 | speaker = batch[6].float().to(device) 227 | y_true = batch[7].long().to(device) 228 | del batch 229 | output = eval(call) 230 | loss = criterion(output, y_true) 231 | del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker 232 | optimizer.zero_grad() 233 | total_loss.append(loss.detach().item()) 234 | loss.backward() 235 | optimizer.step() 236 | with torch.no_grad(): 237 | valid_f1, valid_loss = evaluation( 238 | valid_loader, mod, call, report, False) 239 | train_losses.append(sum(total_loss)/len(total_loss)) 240 | valid_losses.append(valid_loss) 241 | 242 | e = e-1 243 | if max_f1 < valid_f1: 244 | max_f1 = valid_f1 245 | best_model = mod 246 | best_epooch = 500-e 247 | # torch.save(mod.state_dict( 248 | # ), '/home/development/apoorvan/AN_MTP/SER/saved_models/rough_saved_models/V_T_A_average_5FOLD_FOLD'+str(fold)+'_'+str(e)+'.pth') 249 | print( 250 | f'Epoch:{best_epooch} | Train Loss: {loss.detach().item():.3f} | Valid loss: { valid_loss.detach().item():7.3f} | Valid F1: { valid_f1:7.3f}') 251 | 252 | if abs(train_losses[-2]-train_losses[-1]) < 0.0001: 253 | if patience_flag == 1: 254 | e = patience 255 | patience_flag = 0 256 | else: 257 | patience_flag = 1 258 | return evaluation(valid_loader, best_model, call, report, True), best_epooch 259 | 260 | 261 | def get_command(input_modes, context_flag, speaker_flag): 262 | """ 263 | This method is used to create the COMMAND to execute the forward method of a particular model, 264 | depending upon the input combination 265 | Args: 266 | input_modes: 267 | Input Modality {VTA, VT, VA, TA, V, T, A} 268 | context_flag : 269 | If True then "with context" else "without context" 270 | speaker_flag: 271 | if True then Speaker Dependent else Speaker Independent 272 | """ 273 | if input_modes == 'VTA': 274 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio" 275 | if context_flag == 'y': 276 | COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio" 277 | 278 | elif input_modes == 'VT': 279 | COMMAND = "mod(**{'uA':uVideo, 'uB':uText" 280 | if context_flag == 'y': 281 | COMMAND += ",'cA':cVideo, 'cB':cText" 282 | 283 | elif input_modes == 'VA': 284 | COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio" 285 | if context_flag == 'y': 286 | COMMAND += ",'cA':cVideo, 'cB':cAudio" 287 | 288 | elif input_modes == 'TA': 289 | COMMAND = "mod(**{'uA':uText, 'uB':uAudio" 290 | if context_flag == 'y': 291 | COMMAND += ",'cA':cText, 'cB':cAudio" 292 | 293 | elif input_modes == 'T': 294 | COMMAND = "mod(**{'uA':uText" 295 | if context_flag == 'y': 296 | COMMAND += ",'cA':cText" 297 | 298 | elif input_modes == 'V': 299 | COMMAND = "mod(**{'uA':uVideo" 300 | if context_flag == 'y': 301 | COMMAND += ",'cA':cVideo" 302 | 303 | elif input_modes == 'A': 304 | COMMAND = "mod(**{'uA':uAudio" 305 | if context_flag == 'y': 306 | COMMAND += ",'cA':cAudio" 307 | if speaker_flag == 'y': 308 | COMMAND +=
",'speaker_embedding':speaker})" 309 | else: 310 | COMMAND += "})" 311 | 312 | return COMMAND 313 | 314 | 315 | speaker_list = sorted(list(mustard_input.SPEAKER.value_counts().keys())) 316 | print(len(speaker_list)) 317 | 318 | 319 | def get_model_and_parameters(args): 320 | input_modes = ''.join(reversed(sorted(list(args.mode.upper())))) 321 | 322 | parameters = {} 323 | MODEL_NAME = 'Speaker_' 324 | 325 | parameters['num_classes'] = 2 326 | 327 | if args.speaker.lower() == 'y': 328 | MODEL_NAME += 'Dependent_' 329 | parameters['n_speaker'] = len(speaker_list) 330 | else: 331 | MODEL_NAME += 'Independent_' 332 | 333 | if len(input_modes) == 3: 334 | MODEL_NAME += 'Triple_' 335 | parameters['input_embedding_A'] = 2048 336 | parameters['input_embedding_B'] = 1024 337 | parameters['input_embedding_C'] = 291 338 | 339 | elif len(input_modes) == 2: 340 | MODEL_NAME += 'Dual_' 341 | parameters['input_embedding_A'] = 2048 if input_modes[0] == 'V' else 1024 342 | parameters['input_embedding_B'] = 291 if input_modes[1] == 'A' else 1024 343 | else: 344 | MODEL_NAME += 'Single_' 345 | parameters['input_embedding_A'] = 2048 if input_modes == 'V' else 1024 if input_modes == 'T' else 291 346 | 347 | MODEL_NAME += 'Mode_with' 348 | MODEL_NAME += 'out' if args.context.lower() == 'n' else '' 349 | MODEL_NAME += '_Context' 350 | 351 | MODEL_NAME = 'emotion_classification_model.' + MODEL_NAME 352 | 353 | COMMAND = get_command( 354 | input_modes, args.context.lower(), args.speaker.lower()) 355 | return MODEL_NAME, parameters, COMMAND 356 | 357 | 358 | #just initializing 359 | video_embedding_size = 2048 360 | audio_embedding_size = 291 361 | text_embedding_size = 1024 362 | shared_embedding_size = 1024 363 | projection_embedding_size = 512 364 | 365 | epooch = args.epooch 366 | lr = 0.001 # args.learning_rate 367 | patience = args.patience 368 | batch_size = 128 # args.batch_size 369 | dropout = 0.5 370 | 371 | 372 | # get out model name , parameters, and command as per the arguments provided in command line 373 | MODEL_NAME, parameters, COMMAND = get_model_and_parameters(args) 374 | 375 | parameters['shared_embedding'] = shared_embedding_size 376 | parameters['projection_embedding'] = projection_embedding_size 377 | parameters['dropout'] = dropout 378 | 379 | 380 | """ This filename is used further for storing storing stats and log""" 381 | filename = args.mode 382 | filename += '_context_'+args.context.upper() 383 | filename += '_speaker_'+args.speaker.upper() 384 | 385 | """ File to store log""" 386 | f = open('MPP_Code/log/sarcasm_mutard/' + 387 | filename+'.txt', 'w') 388 | """ Dataframe to store stats, will be saved as CSV """ 389 | stats = pd.DataFrame(columns=['dropout', 'lr', 'batch_size', 'shared_embedding_size', 390 | 'projection_embedding_size', 'epoch', 'Precision', 'Recall', 'F1']) 391 | 392 | 393 | """ 'original' variable is mode to swithch between printing area 394 | if we do not want to print on log file then we will use 'original' 395 | if we want to log then 'f' ---> it will print on both termminal and log file 396 | """ 397 | original = sys.stdout 398 | 399 | sys.stdout = Tee(sys.stdout, f) 400 | 401 | print(MODEL_NAME.split('.')[1]) 402 | 403 | sys.stdout = original 404 | 405 | 406 | # for dropout in [0.2]: 407 | # for lr in [0.001]: 408 | # for batch_size in [64]: 409 | # for shared_embedding_size, projection_embedding_size in zip([2048], [1024]): 410 | """ 411 | These are various combination for parameters tuning (GRID SEARCH) 412 | """ 413 | for dropout in [0.2, 0.3, 0.4, 0.5]: 414 | 
for lr in [0.001, 0.0001, 0.00001]: 415 | for batch_size in [128, 64]: 416 | for shared_embedding_size, projection_embedding_size in zip([2048, 1024], [1024, 512]): 417 | stat = [dropout, lr, batch_size, 418 | shared_embedding_size, projection_embedding_size] 419 | parameters['shared_embedding'] = shared_embedding_size 420 | parameters['projection_embedding'] = projection_embedding_size 421 | parameters['dropout'] = dropout 422 | 423 | pred_all = [] 424 | true_all = [] 425 | indexes = [] 426 | types = [] 427 | 428 | for fold in range(5): 429 | 430 | 431 | """ 432 | for 5 FOLD cross validation 433 | we have made the stratified splits explicitly 434 | this is done in order to keep consistency in different experiments (to deal with randomness) 435 | """ 436 | train = pd.read_csv( 437 | 'MPP_Code/data/split_mustard/train_' + str(fold)+'.csv') 438 | valid = pd.read_csv( 439 | 'MPP_Code/data/split_mustard/test_' + str(fold)+'.csv') 440 | seed() 441 | train_dataset = ContentDataset(train, data, speaker_list) 442 | seed() 443 | train_loader = DataLoader( 444 | train_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 445 | seed() 446 | valid_dataset = ContentDataset(valid, data, speaker_list) 447 | seed() 448 | valid_loader = DataLoader( 449 | valid_dataset, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker) 450 | 451 | indexes.extend(valid['KEY'].tolist()) 452 | # types.extend(valid['SAR_T'].tolist()) 453 | 454 | seed() 455 | mod = eval(MODEL_NAME)(**parameters) 456 | mod.to(device) 457 | # print(mod) 458 | seed() 459 | criterion = nn.CrossEntropyLoss() 460 | criterion.to(device) 461 | seed() 462 | optimizer = optim.Adam( 463 | params=mod.parameters(), betas=(0.5, 0.99), lr=lr) 464 | 465 | (true, pred), epo = training(mod=mod, criterion=criterion, optimizer=optimizer, call=COMMAND, 466 | train_loader=train_loader, valid_loader=valid_loader, fold=fold, e=epooch, patience=patience) 467 | pred_all.extend(pred) 468 | true_all.extend(true) 469 | 470 | # training ends here 471 | 472 | 473 | # For log file class-wise results 474 | sys.stdout = Tee(sys.stdout, f) 475 | 476 | print(f'n_epoch:{epo} | dropout:{dropout} | lr:{lr} | batch_size:{batch_size} | shared_embedding_size:{shared_embedding_size} | projection_embedding_size:{projection_embedding_size}') 477 | report = classification_report(true_all, pred_all, digits=3) 478 | print(report) 479 | print('-'*100) 480 | sys.stdout = original 481 | 482 | # For stats file class-wise results 483 | stat.append(epo) 484 | stat.extend( 485 | list(map(float, report.split('\n')[-2].split(' ')[1:-1]))) 486 | stats.loc[len(stats)] = stat 487 | stats.to_csv('MPP_Code/stats/sarcasm_mutard/' + 488 | filename+'.csv', index=False) 489 | 490 | 491 | """ The following code can be used to save predictions """ 492 | results = [] 493 | # print(indexes) 494 | for row in zip(indexes, true_all, pred_all): 495 | results.append(row) 496 | 497 | results = pd.DataFrame( 498 | results, columns=['KEY', 'TRUE', 'PRED']) 499 | results.to_csv('MPP_Code/predictions/sarcasm_mutard/' + args.mode+'/' + 500 | filename+'_'+str(int(dropout*10))+'_'+str(batch_size)+'_'+str(shared_embedding_size)+'_'+str(len(str(lr*100)))+'_'+'.csv', index=False) 501 | 502 | sys.stdout = Tee(sys.stdout, f) 503 | print('#'*100) 504 | sys.stdout = original 505 | 506 | 507 | f.close() 508 | -------------------------------------------------------------------------------- /README.md:
--------------------------------------------------------------------------------  1 | # MUStARD++ 2 | 3 | This repository was created as part of our submission 'A Multimodal Corpus for Emotion Recognition in Sarcasm' to LREC-2022 4 | 5 | Our multimodal dataset consists of dialogs from sitcoms, each of which is presented as a combination of the main 'utterance' and the 'context' in which it was uttered. There are 1202 instances (utterance+context), out of which 601 are sarcastic and 601 are non-sarcastic. Each utterance is annotated with the following information: 6 | 7 | 8 | | Column | Description | 9 | | ------------- | --------------------------------------------------------------- | 10 | | Sarcasm | 1 or 0 indicating the presence or absence of sarcasm | 11 | | Sarcasm_Type | If sarcastic, indicates the type of sarcasm; else None | 12 | | Implicit_Emotion | The perceived hidden emotion associated with an instance | 13 | | Explicit_Emotion | The surface emotion associated with an instance | 14 | | Valence | Level of pleasantness (1-9) | 15 | | Arousal | Level of perceived intensity (1-9) | 16 | 17 | 18 | The textual transcripts of this data, along with the associated annotations, are made available in the form of a CSV file uploaded in the repo. To access the videos associated with the utterances and their corresponding contexts, visit this [link](https://drive.google.com/drive/folders/1kUdT2yU7ERJ5KdauObTj5oQsBlSrvTlW?usp=sharing). 19 | --------------------------------------------------------------------------------
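A minimal sketch of how the released transcript file can be inspected with pandas. The header names (e.g. 'Sarcasm') are assumed to match the table above and may differ in `mustard++_text.csv`, so the snippet first prints the actual columns:

```python
# Quick look at the released transcripts (a sketch; header names such as
# 'Sarcasm' are assumed from the table above and may differ in the CSV).
import pandas as pd

df = pd.read_csv('mustard++_text.csv')
print(df.columns.tolist())   # inspect the actual column headers
print(len(df))               # number of annotated rows
if 'Sarcasm' in df.columns:
    print(df['Sarcasm'].value_counts())  # sarcastic vs non-sarcastic counts
```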