├── Images └── framework.pdf ├── README.md ├── loss.py ├── main.py ├── utils.py ├── dataset.py └── model.py /Images/framework.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HCPLab-SYSU/AUE-CRL/HEAD/Images/framework.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AU-Expression Knowledge Constrained Representation Learning for Facial Expression Recognition 2 | 3 | Implementation of the paper: 4 | 5 | - [AU-Expression Knowledge Constrained Representation Learning for Facial Expression Recognition](https://ieeexplore.ieee.org/document/9561252) 6 | IEEE International Conference on Robotics and Automation (ICRA), 2021. 7 | Tao Pu, Tianshui Chen, Yuan Xie, Hefeng Wu, and Liang Lin. 8 | 9 | ![Pipeline](./Images/framework.pdf) 10 | 11 | ## Environment 12 | Ubuntu 16.04 LTS, Python 3.5, PyTorch 1.3 13 | 14 | ## Usage 15 | 16 | ``` 17 | # Step 1: Train the facial expression recognition branch 18 | python main.py --Model ResNet-101 --Experiment EM 19 | # Step 2: Train the facial AU recognition branch 20 | python main.py --Model ResNet-101 --Experiment AU --Resume_Model <checkpoint_from_step_1>.pkl 21 | # Step 3: Train the whole model 22 | python main.py --Model ResNet-101 --Experiment Fuse --Resume_Model <checkpoint_from_step_2>.pkl 23 | ``` 24 | **Note:** At steps 2 and 3, pass the checkpoint saved by the previous step to `--Resume_Model` (checkpoints are saved as `{Log_Name}.pkl` by `main.py`). 25 | 26 | ## Results 27 | 28 | ### Results on RAF-DB 29 | 30 | | Methods | Angry | Disgust | Fear | Happy | Neutral | Sad | Surprised | Ave. acc | 31 | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | 32 | | **DCNN-DA** | 78.4 | 64.4 | 62.2 | 91.1 | 80.6 | 81.2 | 84.5 | 77.5 | 33 | | **WSLGRN** | 75.3 | 56.9 | 63.5 | 93.8 | 85.4 | 83.5 | 85.4 | 77.7 | 34 | | **CP** | 80.0 | 61.0 | 61.0 | 93.0 | **89.0** | **86.0** | 86.0 | 79.4 | 35 | | **CompactDLM** | 74.5 | 67.6 | 46.9 | 82.3 | 59.1 | 58.0 | 84.6 | 67.6 | 36 | | **FSN** | 72.8 | 46.9 | 56.8 | 90.5 | 76.9 | 81.6 | 81.8 | 72.5 | 37 | | **DLP-CNN** | 71.6 | 52.2 | 62.2 | 92.8 | 80.3 | 80.1 | 81.2 | 74.2 | 38 | | **MRE-CNN** | **84.0** | 57.5 | 60.8 | 88.8 | 80.2 | 79.9 | 86.0 | 76.7 | 39 | | **Ours** | 80.5 | **67.6** | **68.9** | **94.1** | 85.8 | 83.6 | **86.4** | **81.0** | 40 | 41 | 42 | ### Results on SFEW2.0 43 | 44 | | Methods | Angry | Disgust | Fear | Happy | Neutral | Sad | Surprised | Ave.
acc | 45 | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | 46 | | **CP** | 66.0 | 0.0 | 14.0 | **90.0** | 86.0 | **66.0** | 29.0 | 50.1 | 47 | | **DLP-CNN** | - | - | - | - | - | - | - | 51.1 | 48 | | **IA-CNN** | 70.7 | 0.0 | 8.9 | 70.4 | 60.3 | 58.8 | 28.9 | 42.6 | 49 | | **IL** | 61.0 | 0.0 | 6.4 | 89.0 | 66.2 | 48.0 | 33.3 | 43.4 | 50 | | **Ours** | **75.3** | **17.4** | **25.5** | 86.3 | **72.1** | 50.7 | **42.1** | **52.8** | 51 | 52 | 53 | ## Citation 54 | 55 | ``` 56 | @inproceedings{Pu2021AUE-CRL, 57 | author={Pu, Tao and Chen, Tianshui and Xie, Yuan and Wu, Hefeng and Lin, Liang}, 58 | title={Au-expression knowledge constrained representation learning for facial expression recognition}, 59 | booktitle={2021 IEEE international conference on robotics and automation (ICRA)}, 60 | year={2021}, 61 | pages={11154--11161}, 62 | publisher={IEEE}, 63 | doi={10.1109/ICRA48506.2021.9561252} 64 | } 65 | ``` 66 | 67 | ## Contributors 68 | For any questions, feel free to open an issue or contact us: 69 | 70 | * putao537@gmail.com 71 | * tianshuichen@gmail.com 72 | * phoenixsysu@gmail.com 73 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | # Web Link : https://github.com/kapsdeep/FER/blob/master/prior_probability.ipynb 6 | class Expression_Independent_AU_Loss(nn.Module): 7 | def __init__(self, size_average=True): 8 | super(Expression_Independent_AU_Loss, self).__init__() 9 | 10 | self.size_average = size_average 11 | 12 | # self.positive_au_pairs = [(1,2), (4,7), (4,9), (7,9), (6,12), (9,17), (15,17), (15,24), (17,24), (23,24)] 13 | # self.negative_au_pairs = [(2,6), (2,7), (12,15), (12,17)] 14 | self.positive_au_pairs = [(0,1), (2,5), (2,6), (5,6), (4,8), (6,11), (9,11), (9,14), (11,14), (13,14)] 15 | self.negative_au_pairs = [(1,4), (1,5), (8,9), (8,11)] 16 | 17 | def forward(self, pred, target): 18 | 19 | positive_loss = torch.zeros(1).cuda() 20 | negative_loss = torch.zeros(1).cuda() 21 | 22 | # Positive Loss 23 | for i, j in self.positive_au_pairs: 24 | positive_loss += torch.clamp(self.get_pos_prob(pred, i) * self.get_pos_prob(pred, j) - self.get_pos_pos_prob(pred, i, j), min=0.0) + \ 25 | torch.clamp(self.get_neg_prob(pred, i) * self.get_pos_prob(pred, j) - self.get_pos_pos_prob(pred, i, j), min=0.0) + \ 26 | torch.clamp(self.get_pos_prob(pred, i) * self.get_neg_prob(pred, j) - self.get_pos_pos_prob(pred, i, j), min=0.0) 27 | 28 | # Negative Loss 29 | for i, j in self.negative_au_pairs: 30 | negative_loss += torch.clamp(self.get_pos_prob(pred, i) * self.get_pos_prob(pred, j) - self.get_pos_pos_prob(pred, i, j), min=0.0) + \ 31 | torch.clamp(self.get_pos_pos_prob(pred, i, j) - self.get_neg_prob(pred, i) * self.get_pos_prob(pred, j), min=0.0) + \ 32 | torch.clamp(self.get_pos_pos_prob(pred, i, j) - self.get_pos_prob(pred, i) * self.get_neg_prob(pred, j), min=0.0) 33 | 34 | # Batch Loss 35 | batch_loss = positive_loss + negative_loss 36 | 37 | if self.size_average: 38 | return batch_loss.mean() 39 | 40 | return batch_loss 41 | 42 | # Formula : P^{i_1} 43 | def get_pos_prob(self, pred, index): 44 | result = torch.zeros(1).cuda() 45 | for i in range(pred.size(0)): 46 | if pred[i, index] >= 0.5: 47 | result += pred[i, index] 48 | return result / pred.size(0) 49 | 50 | # Formula : P^{i_0} 51 | def get_neg_prob(self, pred, index): 52 | result = torch.zeros(1).cuda() 53 | for i in range(pred.size(0)): 54 | if 
pred[i, index] < 0.5: 55 | result += pred[i, index] 56 | return result / pred.size(0) 57 | 58 | # Formula : P^{(i_1)(j_0)} 59 | def get_pos_neg_prob(self, pred, index_1, index_2): 60 | result = torch.zeros(1).cuda() 61 | for i in range(pred.size(0)): 62 | if pred[i, index_1] >= 0.5 and pred[i, index_2] < 0.5: 63 | result += pred[i, index_1] * pred[i, index_2] 64 | return result / pred.size(0) 65 | 66 | # Formula : P^{(i_0)(j_1)} 67 | def get_neg_pos_prob(self, pred, index_1, index_2): 68 | result = torch.zeros(1).cuda() 69 | for i in range(pred.size(0)): 70 | if pred[i, index_1] < 0.5 and pred[i, index_2] >= 0.5: 71 | result += pred[i, index_1] * pred[i, index_2] 72 | return result / pred.size(0) 73 | 74 | # Formula : P^{(i_1)(j_1)} 75 | def get_pos_pos_prob(self, pred, index_1, index_2): 76 | result = torch.zeros(1).cuda() 77 | for i in range(pred.size(0)): 78 | if pred[i, index_1] >= 0.5 and pred[i, index_2] >= 0.5: 79 | result += pred[i, index_1] * pred[i, index_2] 80 | return result / pred.size(0) 81 | 82 | class Generate_AU_Loss(nn.Module): 83 | def __init__(self, size_average=True): 84 | super(Generate_AU_Loss, self).__init__() 85 | 86 | self.size_average = size_average 87 | 88 | self.Mask_A = torch.Tensor([ 89 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 90 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], 91 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 92 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 93 | [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 94 | [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], 95 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).float().cuda() 96 | 97 | self.Mask_B = torch.Tensor([ 98 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 99 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0], 100 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 101 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 102 | [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 103 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 104 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).float().cuda() 105 | 106 | self.Mask_C = torch.Tensor([ 107 | [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0], 108 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0], 109 | [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0], 110 | [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], 111 | [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], 112 | [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], 113 | [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]).float().cuda() 114 | 115 | def forward(self, target): 116 | batch_loss = self.Mask_A * torch.sqrt((target-(0.75 + 1.00)/2).pow(2)) + \ 117 | self.Mask_B * torch.sqrt((target-(0.50 + 0.75)/2).pow(2)) + \ 118 | self.Mask_C * torch.sqrt((target-(0.00 + 
0.25)/2).pow(2)) 119 | 120 | if self.size_average: 121 | return batch_loss.mean() 122 | 123 | return batch_loss.sum() 124 | 125 | class MSELoss(nn.Module): 126 | def __init__(self, size_average=True): 127 | super(MSELoss, self).__init__() 128 | 129 | self.size_average = size_average 130 | 131 | def forward(self, pred, target): 132 | loss = (pred-target).pow(2) 133 | 134 | weight = target.clone() 135 | weight[weight >= 0.5] = 3 136 | weight[weight < 0.5] = 1 137 | 138 | loss = loss * weight 139 | 140 | if self.size_average: 141 | return loss.mean() 142 | 143 | return loss.sum() 144 | 145 | class BCELoss(nn.Module): 146 | def __init__(self, size_average=True): 147 | super(BCELoss, self).__init__() 148 | 149 | self.size_average = size_average 150 | 151 | def forward(self, pred, target): 152 | loss = - target * torch.log(pred) 153 | 154 | weight = target.clone() 155 | weight[weight >= 0.5] = 3 156 | weight[weight < 0.5] = 1 157 | 158 | loss = loss * weight 159 | 160 | if self.size_average: 161 | return loss.mean() 162 | 163 | return loss.sum() -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | import warnings 3 | 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.utils.tensorboard import SummaryWriter 8 | 9 | from utils import * 10 | from loss import * 11 | 12 | warnings.filterwarnings("ignore") 13 | 14 | parser = argparse.ArgumentParser(description='Facial Expression Recognition Experiment') 15 | 16 | parser.add_argument('--Log_Name', type=str, help='Naming Format: date_experiment_model') 17 | parser.add_argument('--Experiment', default='EM', type=str, choices=['EM', 'AU', 'Fuse'], 18 | help='1->Expression Recognition Experiment, 2->AU Recognition Experiment, 3->Feature Fuse Experiment') 19 | 20 | parser.add_argument('--Dataset', default='RAF', type=str, choices=['RAF', 'SFEW', 'MMI', 'ExpW', 'BP4D'], help='Value Range: RAF, BP4D, SFEW, MMI, ExpW') 21 | parser.add_argument('--Distribute', default='Basic', type=str, choices=['Basic', 'Compound'], help='Value Range: Basic, Compound') 22 | parser.add_argument('--Aligned', default=False, type=str2bool, help='whether to Aligned Image') 23 | 24 | parser.add_argument('--Model', default='ResNet-101', type=str, choices=['ResNet-101', 'ResNet-50', 'ResNet-18'], 25 | help='1->ResNet-101(pre-trained on ImageNet), 2->ResNet-50(pre-trained on ImageNet), 3->ResNet-18(pre-trained on ImageNet)') 26 | parser.add_argument('--Resume_Model', default='None', type=str, help='if Resume_Model == none, then load pre-trained on ImageNet from PyTorch') 27 | 28 | parser.add_argument('--Dim', default=1024, type=int, help='Dim Of Fuse Feature') 29 | parser.add_argument('--numOfAU', default=17, type=int, help='Number of Action Units') 30 | parser.add_argument('--numOfLabel', default=7, type=int, help='Number of Expression Labels') 31 | 32 | parser.add_argument('--Epoch', default=40, type=int, help='Epoch') 33 | parser.add_argument('--LearnRate', default=0.01, type=float, help='Learning Rate') 34 | parser.add_argument('--Train_Batch_Size', default=64, type=int, help='Batch Size during training') 35 | parser.add_argument('--Test_Batch_Size', default=64, type=int, help='Batch Size during testing') 36 | 37 | parser.add_argument('--GPU_ID', default='0', type=str, help='CUDA_VISIBLE_DEVICES') 38 | parser.add_argument('--Num_Workers', default=12, type=int, help='Number of Workers') 39 | parser.add_argument('--DataParallel', 
default=False, type=str2bool, help='Data Parallel') 40 | 41 | def Train(args, model, criterion, optimizer, train_loader, writer, epoch): 42 | 43 | numOfClass = args.numOfAU if args.Experiment == 'AU' else args.numOfLabel 44 | acc_1, acc_2, prec, recall = [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)] 45 | loss, data_time, batch_time = AverageMeter(), AverageMeter(), AverageMeter() 46 | 47 | model.train() 48 | if args.Experiment == 'Fuse': 49 | model.backbone.eval() 50 | 51 | for i in range(numOfClass): 52 | acc_1[i].reset() 53 | acc_2[i].reset() 54 | prec[i].reset() 55 | recall[i].reset() 56 | 57 | loss.reset() 58 | data_time.reset() 59 | batch_time.reset() 60 | 61 | optimizer, lr = Adjust_Learning_Rate(optimizer, epoch, args.LearnRate) 62 | 63 | end = time.time() 64 | for step, (input, au_loc, target) in enumerate(train_loader, start=1): 65 | 66 | input, imgPath = input 67 | input, target = input.cuda(), target.cuda() 68 | data_time.update(time.time()-end) 69 | 70 | if args.Experiment in ['AU', 'Fuse']: 71 | au_loc = au_loc.cuda() 72 | au_target = model.get_au_target(target.cpu()) # generate au label 73 | 74 | # forward 75 | if args.Experiment == 'EM': 76 | pred = model(input, args) 77 | loss_ = criterion(pred, target) 78 | 79 | elif args.Experiment == 'AU': 80 | pred = model((input, au_loc), args) 81 | loss_ = criterion(pred, au_target) + 0.5 * Expression_Independent_AU_Loss()(pred, au_target) + Generate_AU_Loss()(model.PriorKnowledgeTable) 82 | 83 | elif args.Experiment == 'Fuse': 84 | pred1, pred2, au_prob = model((input, au_loc), args) 85 | loss_ = criterion(pred1, target) + criterion(pred2, target) + 0.05 * Expression_Independent_AU_Loss()(au_prob, au_target) 86 | 87 | # backward 88 | optimizer.zero_grad() 89 | loss_.backward() 90 | optimizer.step() 91 | 92 | # compute accuracy, recall and loss 93 | if args.Experiment == 'EM': 94 | Compute_Accuracy_Expression(args, pred, target, loss_, acc_1, acc_2, prec, recall, loss) 95 | 96 | elif args.Experiment == 'AU': 97 | Compute_Accuracy_AU(args, pred, au_target, loss_, acc_1, acc_2, prec, recall, loss) 98 | 99 | elif args.Experiment == 'Fuse': 100 | Compute_Accuracy_Expression(args, pred2, target, loss_, acc_1, acc_2, prec, recall, loss) 101 | 102 | batch_time.update(time.time()-end) 103 | end = time.time() 104 | 105 | Accuracy_Info, acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg = Show_Accuracy(acc_1, acc_2, prec, recall, numOfClass=numOfClass) 106 | 107 | # writer 108 | writer.add_scalar('Accuracy_1', acc_1_avg, epoch) 109 | writer.add_scalar('Accuracy_2', acc_2_avg, epoch) 110 | writer.add_scalar('Precision', prec_avg, epoch) 111 | writer.add_scalar('Recall', recall_avg, epoch) 112 | writer.add_scalar('F1', f1_avg, epoch) 113 | writer.add_scalar('Loss', loss.avg, epoch) 114 | 115 | LogInfo = ''' 116 | [Tain ({exp})]: 117 | Epoch {0} 118 | Data Time {data_time.sum:.4f} ({data_time.avg:.4f}) 119 | Batch Time {batch_time.sum:.4f} ({batch_time.avg:.4f}) 120 | Learning Rate {1}\n'''.format(epoch, lr, data_time=data_time, batch_time=batch_time, exp=args.Experiment) 121 | 122 | LogInfo += Accuracy_Info 123 | 124 | LogInfo += ''' Acc_avg(1) {0:.4f} Acc_avg(2) {1:.4f} Prec_avg {2:.4f} Recall_avg {3:.4f} F1_avg {4:.4f} 125 | Loss {loss.avg:.4f}'''.format(acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg, loss=loss) 126 | 127 | print(LogInfo) 128 | 129 | def Test(args, model, criterion, optimizer, test_loader, 
writer, epoch, Best_Accuracy): 130 | 131 | numOfClass = args.numOfAU if args.Experiment == 'AU' else args.numOfLabel 132 | 133 | acc_1, acc_2, prec, recall = [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)], [AverageMeter() for i in range(numOfClass)] 134 | loss, data_time, batch_time = AverageMeter(), AverageMeter(), AverageMeter() 135 | 136 | # Test Model 137 | model.eval() 138 | 139 | for i in range(numOfClass): 140 | acc_1[i].reset() 141 | acc_2[i].reset() 142 | prec[i].reset() 143 | recall[i].reset() 144 | 145 | loss.reset() 146 | data_time.reset() 147 | batch_time.reset() 148 | 149 | end = time.time() 150 | for step, (input, au_loc, target) in enumerate(test_loader, start=1): 151 | 152 | input, imgPath = input 153 | input, target = input.cuda(), target.cuda() 154 | data_time.update(time.time()-end) 155 | 156 | if args.Experiment in ['AU', 'Fuse']: 157 | au_loc = au_loc.cuda() 158 | au_target = model.get_au_target(target.cpu()) # generate au label 159 | 160 | with torch.no_grad(): 161 | 162 | # forward 163 | if args.Experiment == 'EM': 164 | pred = model(input, args) 165 | loss_ = criterion(pred, target) 166 | 167 | elif args.Experiment == 'AU': 168 | pred = model((input, au_loc), args) 169 | loss_ = criterion(pred, au_target) + 0.5 * Expression_Independent_AU_Loss()(pred, au_target) + Generate_AU_Loss()(model.PriorKnowledgeTable) 170 | 171 | elif args.Experiment == 'Fuse': 172 | pred1, pred2, au_prob = model((input, au_loc), args) 173 | loss_ = criterion(pred1, target) + criterion(pred2, target) + 0.05 * Expression_Independent_AU_Loss()(au_prob, au_target) 174 | 175 | # compute accuracy, recall and loss 176 | if args.Experiment == 'EM': 177 | Compute_Accuracy_Expression(args, pred, target, loss_, acc_1, acc_2, prec, recall, loss) 178 | 179 | elif args.Experiment == 'AU': 180 | Compute_Accuracy_AU(args, pred, au_target, loss_, acc_1, acc_2, prec, recall, loss) 181 | 182 | elif args.Experiment == 'Fuse': 183 | Compute_Accuracy_Expression(args, pred2, target, loss_, acc_1, acc_2, prec, recall, loss) 184 | 185 | batch_time.update(time.time()-end) 186 | end = time.time() 187 | 188 | Accuracy_Info, acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg = Show_Accuracy(acc_1, acc_2, prec, recall, numOfClass=numOfClass) 189 | 190 | # writer 191 | writer.add_scalar('Accuracy_1', acc_1_avg, epoch) 192 | writer.add_scalar('Accuracy_2', acc_2_avg, epoch) 193 | writer.add_scalar('Precision', prec_avg, epoch) 194 | writer.add_scalar('Recall', recall_avg, epoch) 195 | writer.add_scalar('F1', f1_avg, epoch) 196 | writer.add_scalar('Loss', loss.avg, epoch) 197 | 198 | LogInfo = ''' 199 | [Test ({exp})]: 200 | Epoch {0} 201 | Data Time {data_time.sum:.4f} ({data_time.avg:.4f}) 202 | Batch Time {batch_time.sum:.4f} ({batch_time.avg:.4f})\n'''.format(epoch, data_time=data_time, batch_time=batch_time, exp=args.Experiment) 203 | 204 | LogInfo += Accuracy_Info 205 | 206 | LogInfo += ''' Acc_avg(1) {0:.4f} Acc_avg(2) {1:.4f} Prec_avg {2:.4f} Recall_avg {3:.4f} F1_avg {4:.4f} 207 | Loss {loss.avg:.4f}'''.format(acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg, loss=loss) 208 | 209 | print(LogInfo) 210 | 211 | # Save Checkpoints 212 | if acc_2_avg > Best_Accuracy: 213 | 214 | Best_Accuracy, Best_Epoch = acc_2_avg, epoch 215 | print('[Save] Best Acc: %.4f, Best Epoch: %d' % (Best_Accuracy, Best_Epoch)) 216 | 217 | if isinstance(model, nn.DataParallel): 218 | torch.save(model.module.state_dict(), '{}.pkl'.format(args.Log_Name)) 
219 | else: 220 | torch.save(model.state_dict(), '{}.pkl'.format(args.Log_Name)) 221 | 222 | return Best_Accuracy 223 | 224 | def main(): 225 | '''main''' 226 | 227 | # Parse Argument 228 | args = parser.parse_args() 229 | 230 | # Experiment Information 231 | print('Log Name: %s' % args.Log_Name) 232 | print('Backbone: %s' % args.Model) 233 | print('Experiment: %s' % args.Experiment) 234 | print('Resume_Model: %s' % args.Resume_Model) 235 | print('CUDA_VISIBLE_DEVICES: %s' % args.GPU_ID) 236 | 237 | print('================================================') 238 | 239 | print('Dataset: %s' % args.Dataset) 240 | print('Distribute: %s' % args.Distribute) 241 | print('Use Aligned Image' if args.Aligned else 'Don\'t use Aligned Image') 242 | 243 | print('================================================') 244 | 245 | if args.Distribute == 'Basic': 246 | args.numOfLabel = 7 247 | elif args.Distribute == 'Compound': 248 | args.numOfLabel = 11 249 | 250 | print('Dim: %d' % args.Dim) 251 | print('Number Of Action Units: %d' % args.numOfAU) 252 | print('Number Of Expression Labels: %d' % args.numOfLabel) 253 | 254 | print('================================================') 255 | 256 | print('Number of Workers: %d' % args.Num_Workers) 257 | print('Use Data Parallel' if args.DataParallel else 'Dont\'t use Data Parallel') 258 | print('Epoch: %d' % args.Epoch) 259 | print('Train Batch Size: %d' % args.Train_Batch_Size) 260 | print('Test Batch Size: %d' % args.Test_Batch_Size) 261 | 262 | print('================================================') 263 | 264 | # Bulid Model 265 | print('Load Model...') 266 | model = Bulid_Model(args) 267 | print('Done!') 268 | 269 | print('================================================') 270 | 271 | # Set Optimizer 272 | print('Building Optimizer...') 273 | param_optim = Set_Param_Optim(args, model) 274 | optimizer = Set_Criterion_Optimizer(args, param_optim) 275 | print('Done!') 276 | 277 | print('================================================') 278 | 279 | # Bulid Dataloader 280 | print("Buliding Train and Test Dataloader...") 281 | if args.Dataset == 'ExpW': 282 | train_loader, test_loader = BulidDataloader(args) 283 | else: 284 | train_loader = BulidDataloader(args, flag='train') 285 | test_loader = BulidDataloader(args, flag='test') 286 | print('Done!') 287 | 288 | print('================================================') 289 | 290 | Best_Accuracy = 0 291 | 292 | if args.Experiment in ['EM', 'Fuse']: 293 | criterion = nn.CrossEntropyLoss() 294 | elif args.Experiment == 'AU': 295 | criterion = MSELoss() 296 | 297 | # Running Experiment 298 | print("Run Experiment...") 299 | writer = SummaryWriter('{}'.format(args.Log_Name)) 300 | 301 | for epoch in range(1, args.Epoch + 1): 302 | 303 | Train(args, model, criterion, optimizer, train_loader, writer, epoch) 304 | Best_Accuracy = Test(args, model, criterion, optimizer, test_loader, writer, epoch, Best_Accuracy) 305 | 306 | torch.cuda.empty_cache() 307 | 308 | writer.close() 309 | 310 | if __name__=='__main__': 311 | main() 312 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | 10 | import torch.utils.data as data 11 | import torchvision.transforms as transforms 12 | 13 | from dataset import get_au_loc, MyDataset_EM, MyDataset_AU, 
MyDataset_EM_Artificial, MyDataset_EM_Artificial_Compound, MyDataset_EM_Artificial_noBbox 14 | from model import ResNet101, ResNet101_Compound 15 | 16 | numOfAU = 17 17 | 18 | 19 | class AverageMeter(object): 20 | '''Computes and stores the sum, count and average''' 21 | 22 | def __init__(self): 23 | self.reset() 24 | 25 | def reset(self): 26 | self.val = 0 27 | self.avg = 0 28 | self.sum = 0 29 | self.count = 0 30 | 31 | def update(self, val, count=1): 32 | self.val = val 33 | self.sum += val 34 | self.count += count 35 | if self.count==0: 36 | self.avg = 0 37 | else: 38 | self.avg = float(self.sum) / self.count 39 | 40 | 41 | def str2bool(input): 42 | if isinstance(input, bool): 43 | return input 44 | if input.lower() in ('yes', 'true', 't', 'y', '1'): 45 | return True 46 | elif input.lower() in ('no', 'false', 'f', 'n', '0'): 47 | return False 48 | else: 49 | raise argparse.ArgumentTypeError('Boolean value expected.') 50 | 51 | 52 | def BulidDataloader(args, flag='train'): 53 | '''Bulid Datadloader''' 54 | 55 | # Set Transform 56 | print('Use 224 * 224 Image') 57 | trans = transforms.Compose([ 58 | transforms.Resize((224, 224)), 59 | transforms.ToTensor(), 60 | transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]), 61 | ]) 62 | target_trans = None 63 | 64 | # Set Data Path 65 | dataPath_prefix = 'your-data-path' 66 | ImagePath, BBoxPath, LandmarkPath, Train_List, Test_List = '', '', '', '', '' 67 | 68 | if args.Dataset == 'RAF': 69 | 70 | if args.Distribute == 'Basic': 71 | if args.Aligned: 72 | ImagePath = dataPath_prefix + '/RAF/basic/Image/aligned/' 73 | LandmarkPath = dataPath_prefix + '/RAF/basic/Annotation/Landmarks_68_FAN_aligned/' 74 | else: 75 | ImagePath = dataPath_prefix + '/RAF/basic/Image/original/' 76 | LandmarkPath = dataPath_prefix + '/RAF/basic/Annotation/Landmarks_68_FAN_bbox/' 77 | 78 | Train_List = dataPath_prefix + '/RAF/basic/EmoLabel/list_patition_label.txt' 79 | Test_List = dataPath_prefix + '/RAF/basic/EmoLabel/list_patition_label.txt' 80 | BBoxsPath = dataPath_prefix + '/RAF/basic/Annotation/boundingbox/' 81 | 82 | elif args.Distribute == 'Compound': 83 | if args.Aligned: 84 | ImagePath = dataPath_prefix + '/RAF/compound/Image/aligned/' 85 | LandmarkPath = dataPath_prefix + '/RAF/compound/Annotation/Landmarks_68_FAN_bbox/' 86 | else: 87 | ImagePath = dataPath_prefix + '/RAF/compound/Image/original/' 88 | LandmarkPath = dataPath_prefix + '/RAF/compound/Annotation/Landmarks_68_FAN_bbox/' 89 | 90 | Train_List = dataPath_prefix + '/RAF/compound/EmoLabel/list_patition_label.txt' 91 | Test_List = dataPath_prefix + '/RAF/compound/EmoLabel/list_patition_label.txt' 92 | BBoxsPath = dataPath_prefix + '/RAF/compound/Annotation/boundingbox/' 93 | 94 | elif args.Dataset=='BP4D': 95 | 96 | ImagePath = dataPath_prefix + '/BP4D/2D_img/' 97 | LandmarkPath = dataPath_prefix + '/BP4D/Annotations/Landmarks/' 98 | 99 | Train_List = dataPath_prefix + '/BP4D/list_experiment/id_fold1_2.txt' 100 | Test_List = dataPath_prefix + '/BP4D/list_experiment/id_fold3.txt' 101 | 102 | elif args.Dataset=='SFEW': 103 | 104 | ImagePath = dataPath_prefix + '/SFEW/' 105 | LandmarkPath = dataPath_prefix + '/SFEW/' 106 | 107 | Train_List = dataPath_prefix + '/SFEW/list_experiment/id_train_list.txt' 108 | Test_List = dataPath_prefix + '/SFEW/list_experiment/id_val_list.txt' 109 | 110 | elif args.Dataset=='MMI': 111 | 112 | ImagePath = dataPath_prefix + '/MMI/Select_Frames/Imgs_hyuan-cvpr18/' 113 | LandmarkPath = dataPath_prefix + '/MMI/Select_Frames/Landmarks_68_FAN/' 114 | 115 | 
Train_List = dataPath_prefix + '/MMI/Select_Frames/list/id_train_list_crossval0.txt' 116 | Test_List = dataPath_prefix + '/MMI/Select_Frames/list/id_val_list_crossval0.txt' 117 | 118 | elif args.Dataset=='ExpW': 119 | 120 | ImagePath = dataPath_prefix + '/ExpW/data/image/origin/' 121 | LandmarkPath = dataPath_prefix + '/ExpW/landmarks/ExpW_landmarks.txt' 122 | 123 | Train_List = dataPath_prefix + '/ExpW/data/label/label.lst' 124 | Test_List = dataPath_prefix + '/ExpW/data/label/label.lst' 125 | 126 | # Load Dataset 127 | data_imgs, data_labels, data_landmarks, data_bboxs = [], [], [], [] 128 | 129 | if args.Dataset=='RAF': 130 | 131 | # Basic Notes: { 1: Surprise, 2: Fear, 3: Disgust, 4: Happiness, 5: Sadness, 6: Anger, 7: Neutral} 132 | # Compound Notes: { 1: Happily Surprised, 2: Happily Disgusted, 3: Sadly Fearful, 4: Sadly Angry, 5: Sadly Surprised, 6: Sadly Disgusted, 7: Fearfully Angry, 8: Fearfully Surprised, 9: Angrily Surprised, 10: Angrily Disgusted, 11: Disgustedly Surprised} 133 | 134 | if flag == 'train': 135 | 136 | list_patition_label = pd.read_csv(Train_List, header=None, delim_whitespace=True) 137 | list_patition_label = np.array(list_patition_label) 138 | 139 | for i in range(list_patition_label.shape[0]): 140 | if list_patition_label[i, 0][:5] == "train": 141 | 142 | if not os.path.exists(LandmarkPath+list_patition_label[i, 0][:-4]+'.txt'): 143 | continue 144 | landmark = np.loadtxt(LandmarkPath+list_patition_label[i, 0][:-4]+'.txt') 145 | if landmark.ndim < 2: 146 | continue 147 | 148 | bbox = np.loadtxt(BBoxsPath+list_patition_label[i, 0][:-4]+'.txt') 149 | landmark[:, 0] += bbox[0] 150 | landmark[:, 1] += bbox[1] 151 | 152 | data_imgs.append(ImagePath+list_patition_label[i, 0]) 153 | data_labels.append(list_patition_label[i, 1]-1) 154 | data_landmarks.append(landmark) 155 | data_bboxs.append(bbox) 156 | 157 | elif flag == 'test': 158 | 159 | list_patition_label = pd.read_csv(Test_List, header=None, delim_whitespace=True) 160 | list_patition_label = np.array(list_patition_label) 161 | 162 | for i in range(list_patition_label.shape[0]): 163 | if list_patition_label[i, 0][:4] == "test": 164 | 165 | if not os.path.exists(LandmarkPath+list_patition_label[i, 0][:-4]+'.txt'): 166 | continue 167 | landmark = np.loadtxt(LandmarkPath+list_patition_label[i, 0][:-4]+'.txt') 168 | if landmark.ndim < 2: 169 | continue 170 | 171 | bbox = np.loadtxt(BBoxsPath+list_patition_label[i, 0][:-4]+'.txt') 172 | landmark[:, 0] += bbox[0] 173 | landmark[:, 1] += bbox[1] 174 | 175 | data_imgs.append(ImagePath + list_patition_label[i, 0]) 176 | data_labels.append(list_patition_label[i, 1]-1) 177 | data_landmarks.append(landmark) 178 | data_bboxs.append(bbox) 179 | 180 | # Dataset Distribute 181 | if flag == 'train': 182 | print('The train dataset distribute: %d, %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks), len(data_bboxs)) ) 183 | elif flag == 'test': 184 | print('The test dataset distribute: %d, %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks), len(data_bboxs)) ) 185 | 186 | # Dataset 187 | needAU = False if args.Experiment == 'EM' else True 188 | dataset = MyDataset_EM(data_imgs, data_labels, data_landmarks, data_bboxs, flag, needAU, args.Model, trans, target_trans) 189 | 190 | # DataLoader 191 | if flag=='train': 192 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Train_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=True) 193 | elif flag=='test': 194 | data_loader = data.DataLoader(dataset=dataset, 
batch_size=args.Test_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=False) 195 | 196 | return data_loader 197 | 198 | elif args.Dataset=='BP4D': 199 | 200 | List_Path = Train_List if flag == 'train' else Test_List 201 | 202 | numOfData = 0 203 | 204 | with open(List_Path,'r') as f: 205 | lines = f.readlines() 206 | 207 | for line in lines: 208 | id = line[:-1] 209 | if not (os.path.exists(ImagePath + id + '.jpg') and os.path.exists(dataPath_prefix + '/BP4D/Annotations/AUs/' + id + '.txt') and os.path.exists(LandmarkPath + id + '.txt')): 210 | continue 211 | 212 | landmark = np.loadtxt(LandmarkPath + id + '.txt') 213 | if landmark.ndim < 2: 214 | continue 215 | 216 | numOfData+=1 217 | if numOfData%10000==0: 218 | print('Load Data Num: %d' % numOfData) 219 | 220 | # load img path 221 | data_imgs.append(ImagePath+id+'.jpg') 222 | 223 | # load label 224 | label_txt = np.loadtxt(dataPath_prefix + '/BP4D/Annotations/AUs/'+ id + '.txt') 225 | label = np.zeros(12, dtype=np.float32) 226 | 227 | if label_txt.size == 0: 228 | data_labels.append(label) 229 | else: 230 | if label_txt.ndim == 1: # only one au 231 | label_txt = label_txt[np.newaxis, :] 232 | 233 | for au in label_txt: # au: [au_idx, au_value] 234 | au_idx = au[0] 235 | label[int(au_idx)] = 1 236 | 237 | data_labels.append(label) 238 | 239 | # load landmark 240 | data_landmarks.append(landmark) 241 | 242 | # Dataset Distribute 243 | if flag == 'train': 244 | print('The train dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 245 | elif flag == 'test': 246 | print('The test dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 247 | 248 | # Dataset with Bbox 249 | dataset = MyDataset_AU(data_imgs, data_labels, data_landmarks, flag, args.Model, trans, target_trans) 250 | 251 | # DataLoader 252 | if flag=='train': 253 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Train_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=True) 254 | elif flag=='test': 255 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Test_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=False) 256 | 257 | return data_loader 258 | 259 | elif args.Dataset=='SFEW': 260 | 261 | # Basic Notes: {1: Surprise, 2: Fear, 3: Disgust, 4: Happiness, 5: Sadness, 6: Anger, 7: Neutral} 262 | Label = {'Surprise':0, 'Fear':1, 'Disgust':2, 'Happy':3, 'Sad':4, 'Angry':5, 'Neutral':6} 263 | 264 | if flag == 'train': 265 | list_patition_label = pd.read_csv(Train_List, header=None, delim_whitespace=True) 266 | list_patition_label = np.array(list_patition_label) 267 | list_patition_label = list_patition_label.reshape(list_patition_label.shape[0],) 268 | 269 | for i in range(list_patition_label.shape[0]): 270 | if not os.path.exists(LandmarkPath + 'Train/Annotations/Landmarks_68_FAN/' + list_patition_label[i] + '.txt'): 271 | continue 272 | 273 | landmark = np.loadtxt(LandmarkPath + 'Train/Annotations/Landmarks_68_FAN/' + list_patition_label[i] + '.txt') 274 | if landmark.ndim < 2: 275 | continue 276 | 277 | if os.path.exists(ImagePath + 'Train/imgs/' + list_patition_label[i] + '.jpg'): 278 | data_imgs.append(ImagePath + 'Train/imgs/' + list_patition_label[i] + '.jpg') 279 | 280 | elif os.path.exists(ImagePath + 'Train/imgs/' + list_patition_label[i] + '.png'): 281 | data_imgs.append(ImagePath + 'Train/imgs/' + list_patition_label[i] + '.png') 282 | 283 | data_labels.append(Label[list_patition_label[i].split('/',1)[0]]) 284 | 
data_landmarks.append(landmark) 285 | 286 | elif flag == 'test': 287 | 288 | list_patition_label = pd.read_csv(Test_List, header=None, delim_whitespace=True) 289 | list_patition_label = np.array(list_patition_label) 290 | list_patition_label = list_patition_label.reshape(list_patition_label.shape[0],) 291 | 292 | for i in range(list_patition_label.shape[0]): 293 | if not os.path.exists(LandmarkPath + 'Val/Annotations/Landmarks_68_FAN/' + list_patition_label[i] + '.txt'): 294 | continue 295 | landmark = np.loadtxt(LandmarkPath + 'Val/Annotations/Landmarks_68_FAN/' + list_patition_label[i] + '.txt') 296 | if landmark.ndim<2: 297 | continue 298 | 299 | if os.path.exists(ImagePath + 'Val/imgs/' + list_patition_label[i] + '.jpg'): 300 | data_imgs.append(ImagePath + 'Val/imgs/' + list_patition_label[i] + '.jpg') 301 | elif os.path.exists(ImagePath + 'Val/imgs/' + list_patition_label[i] + '.png'): 302 | data_imgs.append(ImagePath + 'Val/imgs/' + list_patition_label[i] + '.png') 303 | 304 | data_labels.append(Label[list_patition_label[i].split('/',1)[0]]) 305 | data_landmarks.append(landmark) 306 | 307 | # Dataset Distribute 308 | if flag == 'train': 309 | print('The train dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 310 | elif flag == 'test': 311 | print('The test dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 312 | 313 | # Dataset 314 | needAU = False if args.Experiment == 'EM' else True 315 | dataset = MyDataset_EM_Artificial_noBbox(data_imgs, data_labels, data_landmarks, flag, needAU, args.Model, args.Experiment, trans, target_trans) 316 | 317 | # DataLoader 318 | if flag == 'train': 319 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Train_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=True) 320 | elif flag == 'test': 321 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Test_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=False) 322 | 323 | return data_loader 324 | 325 | elif args.Dataset=='MMI': 326 | 327 | # Basic Notes: {1: Surprise, 2: Fear, 3: Disgust, 4: Happiness, 5: Sadness, 6: Anger, 7: Neutral} 328 | Label = { 5:0, 2:1, 1:2, 3:3, 4:4, 0:5} 329 | 330 | if flag == 'train': 331 | 332 | list_patition_label = pd.read_csv(Train_List, header=None, delim_whitespace=True) 333 | list_patition_label = np.array(list_patition_label) 334 | list_patition_label = list_patition_label.reshape(list_patition_label.shape[0],) 335 | 336 | for i in range(list_patition_label.shape[0]): 337 | if not os.path.exists(LandmarkPath + list_patition_label[i] + '.txt'): 338 | continue 339 | landmark = np.loadtxt(LandmarkPath + list_patition_label[i] + '.txt') 340 | if landmark.ndim<2: 341 | continue 342 | 343 | data_imgs.append(ImagePath + list_patition_label[i] + '.jpg') 344 | label = pd.read_csv(dataPath_prefix + '/MMI/Select_Frames/Emotions_hyuan-cvpr18/' + list_patition_label[i] + '.txt', header=None, delim_whitespace=True) 345 | label = np.array(label)[0,0] 346 | data_labels.append(Label[label]) 347 | data_landmarks.append(landmark) 348 | 349 | elif flag == 'test': 350 | 351 | list_patition_label = pd.read_csv(Test_List, header=None, delim_whitespace=True) 352 | list_patition_label = np.array(list_patition_label) 353 | list_patition_label = list_patition_label.reshape(list_patition_label.shape[0],) 354 | 355 | for i in range(list_patition_label.shape[0]): 356 | if not os.path.exists(LandmarkPath + list_patition_label[i] + '.txt'): 357 | continue 358 | 
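# Added descriptive comment: load the 68-point FAN landmark annotation for this frame; samples whose landmark file is malformed (not 2-D) are skipped below.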
landmark = np.loadtxt(LandmarkPath + list_patition_label[i] + '.txt') 359 | if landmark.ndim<2: 360 | continue 361 | 362 | data_imgs.append(ImagePath + list_patition_label[i] + '.jpg') 363 | label = pd.read_csv(dataPath_prefix + '/MMI/Select_Frames/Emotions_hyuan-cvpr18/' + list_patition_label[i] + '.txt', header=None, delim_whitespace=True) 364 | label = np.array(label)[0,0] 365 | data_labels.append(Label[label]) 366 | data_landmarks.append(landmark) 367 | 368 | # Dataset Distribute 369 | if flag == 'train': 370 | print('The train dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 371 | elif flag == 'test': 372 | print('The test dataset distribute: %d, %d, %d' % (len(data_imgs), len(data_labels), len(data_landmarks))) 373 | 374 | # Dataset 375 | needAU = False if args.Experiment == 'EM' else True 376 | dataset = MyDataset_EM_Artificial_noBbox(data_imgs, data_labels, data_landmarks, flag, needAU, args.Model, args.Experiment, trans, target_trans) 377 | 378 | # DataLoader 379 | if flag == 'train': 380 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Train_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=True) 381 | elif flag == 'test': 382 | data_loader = data.DataLoader(dataset=dataset, batch_size=args.Test_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=False) 383 | 384 | return data_loader 385 | 386 | elif args.Dataset=='ExpW': 387 | 388 | # Basic Notes: {1: Surprise, 2: Fear, 3: Disgust, 4: Happiness, 5: Sadness, 6: Anger, 7: Neutral} 389 | Label = { 5:0, 2:1, 1:2, 3:3, 4:4, 0:5, 6:6 } 390 | 391 | list_patition_label = pd.read_csv(dataPath_prefix + '/ExpW/landmarks/ExpW_landmarks.txt', header=None, delim_whitespace=True) 392 | list_patition_label = np.array(list_patition_label) 393 | 394 | for i in range(list_patition_label.shape[0]): 395 | 396 | landmark = list_patition_label[i,8:].reshape(68,2).astype(np.int) 397 | 398 | bbox = list_patition_label[i,2:6].astype(np.int) 399 | bbox[0], bbox[1], bbox[2], bbox[3] = bbox[1], bbox[0], bbox[2], bbox[3] 400 | 401 | landmark[:,0]+=bbox[0] 402 | landmark[:,1]+=bbox[1] 403 | 404 | data_imgs.append(ImagePath+list_patition_label[i,0]) 405 | label = list_patition_label[i,7] 406 | data_labels.append(Label[label]) 407 | data_landmarks.append(landmark) 408 | data_bboxs.append(bbox) 409 | 410 | # Dataset Distribute 411 | print('The dataset distribute: %d, %d, %d, %d' % ( len(data_imgs), len(data_labels), len(data_landmarks), len(data_bboxs) ) ) 412 | 413 | # Dataset 414 | needAU = False if args.Experiment == 'EM' else True 415 | dataset = MyDataset_EM_Artificial(data_imgs, data_labels, data_landmarks, data_bboxs, flag, needAU, args.Model, args.Experiment, trans, target_trans) 416 | 417 | torch.manual_seed(1) # Set CPU Seed 418 | if torch.cuda.is_available(): 419 | torch.cuda.manual_seed(1) # Set Current GPU Seed 420 | torch.cuda.manual_seed_all(1) # Set All GPU Seed 421 | 422 | trainSet_size = int(0.9 * len(data_imgs)) 423 | testSet_size = len(data_imgs) - trainSet_size 424 | train_set, test_set = data.random_split( dataset, [trainSet_size, testSet_size] ) 425 | 426 | print('The num of TrainSet and TestSet: %d , %d ' % (trainSet_size, testSet_size) ) 427 | 428 | # DataLoader 429 | train_loader = data.DataLoader(dataset=train_set, batch_size=args.Train_Batch_Size, shuffle=True, num_workers=args.Num_Workers, drop_last=True) 430 | test_loader = data.DataLoader(dataset=test_set, batch_size=args.Test_Batch_Size, shuffle=True, num_workers=args.Num_Workers,
drop_last=False) 431 | 432 | return train_loader, test_loader 433 | 434 | 435 | def Bulid_Model(args): 436 | '''Bulid Model''' 437 | 438 | if args.Distribute == 'Basic': 439 | model = ResNet101(args.Dim) 440 | 441 | elif args.Distribute == 'Compound': 442 | model = ResNet101_Compound(args.Dim) 443 | 444 | 445 | if args.Resume_Model != 'None': 446 | print('Resume Model: {}'.format(args.Resume_Model)) 447 | checkpoint = torch.load(args.Resume_Model, map_location='cpu') 448 | model.load_state_dict(checkpoint, strict=True) 449 | 450 | # Save GPU Memory 451 | del checkpoint 452 | torch.cuda.empty_cache() 453 | else: 454 | print('No Resume Model') 455 | 456 | if args.DataParallel: 457 | model = nn.DataParallel(model) 458 | 459 | if torch.cuda.is_available(): 460 | model = model.cuda() 461 | 462 | return model 463 | 464 | 465 | def Set_Param_Optim(args, model): 466 | '''Set parameters optimizer''' 467 | 468 | # Expression Recognition Experiment 469 | if args.Experiment == 'EM': 470 | for param in model.parameters(): 471 | param.requires_grad = False 472 | 473 | for param in model.backbone.parameters(): 474 | param.requires_grad = True 475 | for param in model.LRN_em.parameters(): 476 | param.requires_grad = True 477 | for param in model.reduce_dim_em.parameters(): 478 | param.requires_grad = True 479 | for param in model.pred_em.parameters(): 480 | param.requires_grad = True 481 | 482 | param_optim = filter(lambda p:p.requires_grad, model.parameters()) 483 | 484 | # AU Recognition Experiment 485 | elif args.Experiment == 'AU': 486 | for param in model.parameters(): 487 | param.requires_grad = False 488 | 489 | for param in model.deconv_layer1.parameters(): 490 | param.requires_grad = True 491 | for param in model.deconv_layer2.parameters(): 492 | param.requires_grad = True 493 | for param in model.deconv_layer3.parameters(): 494 | param.requires_grad = True 495 | 496 | for param in model.reduce_dim_1_au.parameters(): 497 | param.requires_grad = True 498 | for param in model.reduce_dim_2_au.parameters(): 499 | param.requires_grad = True 500 | for param in model.reduce_dim_3_au.parameters(): 501 | param.requires_grad = True 502 | 503 | for param in model.LRN_au.parameters(): 504 | param.requires_grad = True 505 | for param in model.Crop_Net_1.parameters(): 506 | param.requires_grad = True 507 | for param in model.Crop_Net_2.parameters(): 508 | param.requires_grad = True 509 | for param in model.pred_au.parameters(): 510 | param.requires_grad = True 511 | 512 | model.PriorKnowledgeTable.requires_grad = True 513 | 514 | param_optim = filter(lambda p:p.requires_grad, model.parameters()) 515 | 516 | # Feature Fuse Experiment 517 | elif args.Experiment == 'Fuse': 518 | for param in model.parameters(): 519 | param.requires_grad = False 520 | 521 | for param in model.fc_em_fuse_3.parameters(): 522 | param.requires_grad = True 523 | for param in model.fc_au_fuse_3.parameters(): 524 | param.requires_grad = True 525 | 526 | for param in model.fc_attention_fuse_3.parameters(): 527 | param.requires_grad = True 528 | 529 | for param in model.pred_em_fuse_3.parameters(): 530 | param.requires_grad = True 531 | for param in model.pred_em.parameters(): 532 | param.requires_grad = True 533 | 534 | param_optim = filter(lambda p:p.requires_grad, model.parameters()) 535 | 536 | return param_optim 537 | 538 | 539 | def Set_Criterion_Optimizer(args, param_optim): 540 | '''Set Criterion and Optimizer''' 541 | 542 | optimizer = optim.SGD(param_optim, lr=args.LearnRate, momentum=0.9) 543 | 544 | return optimizer 545 | 546 | 
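# --- Illustrative sketch (added for clarity; not part of the original repository) ---
# The three helpers above are chained once per training stage: Bulid_Model builds the
# network and optionally resumes a checkpoint, Set_Param_Optim freezes everything except
# the sub-modules of the current stage, and Set_Criterion_Optimizer wraps the remaining
# trainable parameters in an SGD optimizer. The hypothetical helper below only bundles
# these calls and reports how many parameters stay trainable; `args` is assumed to be the
# argparse namespace built in main.py.
def Setup_Stage(args):
    '''Build the model for one stage and return (model, optimizer); illustrative only.'''
    model = Bulid_Model(args)
    param_optim = Set_Param_Optim(args, model)
    optimizer = Set_Criterion_Optimizer(args, param_optim)
    num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Trainable parameters for stage %s: %d' % (args.Experiment, num_trainable))
    return model, optimizer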
547 | def Adjust_Learning_Rate(optimizer, epoch, LR): 548 | '''Adjust Learning Rate''' 549 | # lr = 0.001 550 | 551 | if epoch<=15: 552 | lr = LR 553 | elif epoch<=30: 554 | lr = 0.1 * LR 555 | else: 556 | lr = 0.01 * LR 557 | 558 | for param_group in optimizer.param_groups: 559 | param_group['lr'] = lr 560 | 561 | return optimizer, lr 562 | 563 | 564 | def Compute_Accuracy_Expression(args, pred, target, loss_, acc_1, acc_2, prec, recall, loss): 565 | '''Compute the accuracy of all samples, the accuracy of positive samples, the recall of positive samples and the loss''' 566 | 567 | pred = pred.cpu().data.numpy() 568 | pred = np.argmax(pred, axis=1) 569 | target = target.cpu().data.numpy() 570 | 571 | pred = pred.astype(np.int32).reshape(pred.shape[0],) 572 | target = target.astype(np.int32).reshape(target.shape[0],) 573 | 574 | if args.Distribute == 'Basic': 575 | numOfLabel = 7 576 | elif args.Distribute == 'Compound': 577 | numOfLabel = 11 578 | 579 | for index in range(numOfLabel): 580 | TP = np.sum((pred == index) * (target == index)) 581 | TN = np.sum((pred != index) * (target != index)) 582 | 583 | # Compute Accuracy of All --> TP+TN / All 584 | acc_1[index].update(np.sum(pred == target), pred.shape[0]) 585 | acc_2[index].update(TP, np.sum(target == index)) 586 | 587 | # Compute Precision of Positive --> TP/(TP+FP) 588 | prec[index].update(TP, np.sum(pred == index)) 589 | 590 | # Compute Recall of Positive --> TP/(TP+FN) 591 | recall[index].update(TP, np.sum(target == index)) 592 | 593 | # Compute Loss 594 | loss.update(float(loss_.cpu().data.numpy())) 595 | 596 | 597 | def Compute_Accuracy_AU(args, pred, target, loss_, acc_1, acc_2, prec, recall, loss): 598 | '''Compute the accuracy of all samples, the accuracy of positive samples, the recall of positive samples and the loss''' 599 | 600 | pred = pred.cpu().data.numpy() 601 | pred[pred < 0.5] = 0 602 | pred[pred > 0] = 1 603 | 604 | target = target.cpu().data.numpy() 605 | target[target < 0.5] = 0 606 | target[target > 0] = 1 607 | 608 | pred = pred.astype(np.int32).reshape(pred.shape[0], pred.shape[1]) 609 | target = target.astype(np.int32).reshape(target.shape[0], target.shape[1]) 610 | 611 | for index in range(numOfAU): 612 | TP = np.sum((pred[:, index] == 1) * (target[:, index] == 1)) 613 | TN = np.sum((pred[:, index] == 0) * (target[:, index] == 0)) 614 | 615 | # Compute Accuracy of All --> TP+TN / All 616 | acc_1[index].update(TP + TN, pred.shape[0]) 617 | acc_2[index].update(TP + TN, pred.shape[0]) 618 | 619 | # Compute Precision of Positive --> TP/(TP+FP) 620 | prec[index].update(TP, np.sum(pred[:, index] == 1)) 621 | 622 | # Compute Recall of Positive --> TP/(TP+FN) 623 | recall[index].update(TP, np.sum(target[:, index]==1)) 624 | 625 | # Compute Loss 626 | loss.update(float(loss_.cpu().data.numpy())) 627 | 628 | 629 | def Show_Accuracy(acc_1, acc_2, prec, recall, numOfClass=7): 630 | """Compute average of accuaracy/precision/recall/f1""" 631 | 632 | # compute F1 value 633 | f1 = [AverageMeter() for i in range(numOfClass)] 634 | for i in range(numOfClass): 635 | if prec[i].avg == 0 or recall[i].avg == 0: 636 | f1[i].avg = 0 637 | continue 638 | f1[i].avg = 2 * prec[i].avg * recall[i].avg / (prec[i].avg + recall[i].avg) 639 | 640 | acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg = 0, 0, 0, 0, 0 641 | for i in range(numOfClass): 642 | acc_1_avg += acc_1[i].avg 643 | acc_2_avg += acc_2[i].avg 644 | prec_avg += prec[i].avg 645 | recall_avg += recall[i].avg 646 | f1_avg += f1[i].avg 647 | acc_1_avg, acc_2_avg, prec_avg, 
recall_avg, f1_avg = acc_1_avg/numOfClass, acc_2_avg/numOfClass, prec_avg/numOfClass, recall_avg/numOfClass, f1_avg/numOfClass 648 | 649 | # Log Accuracy Infomation 650 | Accuracy_Info = '' 651 | 652 | Accuracy_Info+=' Accuracy(1)' 653 | for i in range(numOfClass): 654 | Accuracy_Info+=' {:.4f}'.format(acc_1[i].avg) 655 | Accuracy_Info+='\n' 656 | 657 | Accuracy_Info+=' Accuracy(2)' 658 | for i in range(numOfClass): 659 | Accuracy_Info+=' {:.4f}'.format(acc_2[i].avg) 660 | Accuracy_Info+='\n' 661 | 662 | Accuracy_Info+=' Precision' 663 | for i in range(numOfClass): 664 | Accuracy_Info+=' {:.4f}'.format(prec[i].avg) 665 | Accuracy_Info+='\n' 666 | 667 | Accuracy_Info+=' Recall' 668 | for i in range(numOfClass): 669 | Accuracy_Info+=' {:.4f}'.format(recall[i].avg) 670 | Accuracy_Info+='\n' 671 | 672 | Accuracy_Info+=' F1' 673 | for i in range(numOfClass): 674 | Accuracy_Info+=' {:.4f}'.format(f1[i].avg) 675 | Accuracy_Info+='\n' 676 | 677 | return Accuracy_Info, acc_1_avg, acc_2_avg, prec_avg, recall_avg, f1_avg -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import random 4 | import numpy as np 5 | from PIL import Image, ImageDraw 6 | 7 | import torch 8 | import torch.utils.data as data 9 | 10 | def default_loader(path): 11 | return Image.open(path).convert('RGB') 12 | 13 | def get_au_loc(array_, ori_size, new_size): 14 | ''' 15 | input: 16 | np.ndarray shape=(68 , 2) 17 | return: 18 | np.ndarray shape=(12 , 4) 19 | ''' 20 | 21 | array = copy.deepcopy(array_) 22 | 23 | arr2d = array.transpose() 24 | arr2d[0,:]=arr2d[0,:]/ori_size*new_size 25 | arr2d[1,:]=arr2d[1,:]/ori_size*new_size 26 | 27 | region_bbox=[] 28 | if arr2d.shape[1] == 68: 29 | region_bbox+=[[arr2d[0,21],arr2d[1,21],arr2d[0,22],arr2d[1,22]]] # au 1 30 | region_bbox+=[[arr2d[0,18],arr2d[1,18],arr2d[0,25],arr2d[1,25]]] # au 2 31 | region_bbox+=[[(arr2d[0,21]+arr2d[0,22])/2,(arr2d[1,21]+arr2d[1,22])/2,(arr2d[0,19]+arr2d[0,24])/2,(arr2d[1,19]+arr2d[1,24])/2]] # au 4 32 | region_bbox+=[[(arr2d[0,37]+arr2d[0,38])/2,(arr2d[1,37]+arr2d[1,38])/2,(arr2d[0,43]+arr2d[0,44])/2,(arr2d[1,43]+arr2d[1,44])/2]] # au 5 * 33 | region_bbox+=[[arr2d[0,41],arr2d[1,41],arr2d[0,46],arr2d[1,46]]] # au 6 34 | region_bbox+=[[arr2d[0,38],arr2d[1,38],arr2d[0,43],arr2d[1,43]]] # au 7 35 | region_bbox+=[[arr2d[0,31],arr2d[1,31],arr2d[0,35],arr2d[1,35]]] # au 9 * 36 | region_bbox+=[[arr2d[0,50],arr2d[1,50],arr2d[0,52],arr2d[1,52]]] # au 10 37 | region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 12 38 | # region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 14 39 | region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 15 40 | region_bbox+=[[arr2d[0,58],arr2d[1,58],arr2d[0,56],arr2d[1,56]]] # au 16 * 41 | region_bbox+=[[arr2d[0,51],arr2d[1,51],arr2d[0,57],arr2d[1,57]]] # au 17 42 | region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 20 * 43 | region_bbox+=[[arr2d[0,60],arr2d[1,60],arr2d[0,62],arr2d[1,62]]] # au 23 44 | region_bbox+=[[arr2d[0,61],arr2d[1,61],arr2d[0,64],arr2d[1,64]]] # au 24 45 | region_bbox+=[[arr2d[0,62],arr2d[1,62],arr2d[0,66],arr2d[1,66]]] # au 25 * 46 | region_bbox+=[[arr2d[0,51],arr2d[1,51],arr2d[0,57],arr2d[1,57]]] # au 26 * 47 | elif arr2d.shape[1] == 66: 48 | region_bbox+=[[arr2d[0,21],arr2d[1,21],arr2d[0,22],arr2d[1,22]]] # au 1 49 | region_bbox+=[[arr2d[0,18],arr2d[1,18],arr2d[0,25],arr2d[1,25]]] # au 2 50 | 
region_bbox+=[[(arr2d[0,21]+arr2d[0,22])/2,(arr2d[1,21]+arr2d[1,22])/2,(arr2d[0,19]+arr2d[0,24])/2,(arr2d[1,19]+arr2d[1,24])/2]] # au 4 51 | # region_bbox+=[[(arr2d[0,37]+arr2d[0,38])/2,(arr2d[1,37]+arr2d[1,38])/2,(arr2d[0,43]+arr2d[0,44])/2,(arr2d[1,43]+arr2d[1,44])/2]] # au 5 * 52 | region_bbox+=[[arr2d[0,41],arr2d[1,41],arr2d[0,46],arr2d[1,46]]] # au 6 53 | region_bbox+=[[arr2d[0,38],arr2d[1,38],arr2d[0,43],arr2d[1,43]]] # au 7 54 | # region_bbox+=[[arr2d[0,31],arr2d[1,31],arr2d[0,35],arr2d[1,35]]] # au 9 * 55 | region_bbox+=[[arr2d[0,50],arr2d[1,50],arr2d[0,52],arr2d[1,52]]] # au 10 56 | region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 12 57 | # region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 14 58 | region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 15 59 | # region_bbox+=[[arr2d[0,58],arr2d[1,58],arr2d[0,56],arr2d[1,56]]] # au 16 * 60 | region_bbox+=[[arr2d[0,51],arr2d[1,51],arr2d[0,57],arr2d[1,57]]] # au 17 61 | # region_bbox+=[[arr2d[0,48],arr2d[1,48],arr2d[0,54],arr2d[1,54]]] # au 20 * 62 | region_bbox+=[[arr2d[0,60],arr2d[1,60],arr2d[0,62],arr2d[1,62]]] # au 23 63 | region_bbox+=[[arr2d[0,61],arr2d[1,61],arr2d[0,64],arr2d[1,64]]] # au 24 64 | # region_bbox+=[[arr2d[0,62],arr2d[1,62],arr2d[0,66],arr2d[1,66]]] # au 25 * 65 | # region_bbox+=[[arr2d[0,51],arr2d[1,51],arr2d[0,57],arr2d[1,57]]] # au 26 * 66 | else: 67 | print("Invalid Landmark Annotations") 68 | 69 | region_array = np.round(np.array(region_bbox)) 70 | 71 | return region_array 72 | 73 | class MyDataset_EM(data.Dataset): 74 | def __init__(self, imgs, labels, landmarks, bboxs, flag, needAU, Model, transform=None, target_transform=None, loader=default_loader): 75 | self.imgs = imgs 76 | self.labels = labels 77 | self.landmarks = landmarks 78 | self.bboxs = bboxs 79 | self.transform = transform 80 | self.target_transform = target_transform 81 | self.loader = loader 82 | self.flag = flag 83 | self.needAU = needAU 84 | self.Model = Model 85 | 86 | def __getitem__(self, index): 87 | img, label, landmark, bbox = self.loader(self.imgs[index]), copy.deepcopy(self.labels[index]), copy.deepcopy(self.landmarks[index]), copy.deepcopy(self.bboxs[index]) 88 | ori_img_w, ori_img_h = img.size 89 | 90 | # BoundingBox 91 | left = bbox[0] 92 | upper = bbox[1] 93 | right = bbox[2] 94 | lower = bbox[3] 95 | 96 | # Visualization 97 | # draw = ImageDraw.Draw(img) 98 | # for idx in range(len(landmark[:,0])): 99 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 100 | # img.save('./vis_/{}.jpg'.format(index)) 101 | 102 | enlarge_bbox = True 103 | 104 | if self.flag=='train': 105 | random_crop = True 106 | random_flip = True 107 | elif self.flag=='test': 108 | random_crop = False 109 | random_flip = False 110 | 111 | padding = max(0, int((right - left)*0.15)) # enlarge bbox 112 | half_padding = int(padding*0.5) 113 | 114 | if enlarge_bbox: 115 | left = max(left - half_padding, 0) 116 | right = min(right + half_padding, ori_img_w) 117 | upper = max(upper - half_padding, 0) 118 | lower = min(lower + half_padding, ori_img_h) 119 | 120 | if random_crop: 121 | offset_range = half_padding 122 | 123 | x_offset = random.randint(-offset_range, offset_range) 124 | y_offset = random.randint(-offset_range, offset_range) 125 | 126 | left = max(left + x_offset, 0) 127 | right = min(right + x_offset, ori_img_w) 128 | upper = max(upper + y_offset, 0) 129 | lower = min(lower + y_offset, ori_img_h) 130 | 131 | img = img.crop((left,upper,right,lower)) 132 | crop_img_w, crop_img_h = img.size 133 | 134 | 
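# Added descriptive comment: shift landmark coordinates into the cropped image's frame (the crop's top-left corner becomes the new origin).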
landmark[:,0]-=left 135 | landmark[:,1]-=upper 136 | 137 | # Visualization 138 | # draw = ImageDraw.Draw(img) 139 | # for idx in range(len(landmark[:,0])): 140 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 141 | # img.save('./vis_/{}.jpg'.format(index)) 142 | 143 | if random_flip and random.random() > 0.5: 144 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 145 | landmark[:,0] = (right - left) - landmark[:,0] 146 | 147 | # Transform Image 148 | trans_img = self.transform(img) 149 | _, trans_img_w, trans_img_h = trans_img.size() 150 | 151 | if self.target_transform is not None: 152 | label = self.transform(label) 153 | 154 | # Don't need AU 155 | if not self.needAU: 156 | return (trans_img, self.imgs[index]), 0, label 157 | 158 | # get au location 159 | landmark[:, 0] = landmark[:, 0] * trans_img_w / crop_img_w 160 | landmark[:, 1] = landmark[:, 1] * trans_img_h / crop_img_h 161 | 162 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 56) 163 | for i in range(au_location.shape[0]): 164 | for j in range(4): 165 | if au_location[i,j]<=11: 166 | au_location[i,j] = 12 167 | if au_location[i,j]>=45: 168 | au_location[i,j] = 44 169 | 170 | # Visualization 171 | # img_transform = img.resize((trans_img_w ,trans_img_h)) 172 | # draw = ImageDraw.Draw(img_transform) 173 | # for idx in range(len(landmark[:,0])): 174 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 175 | # img_transform.save('./vis/{}.jpg'.format(index)) 176 | 177 | au_location = torch.LongTensor(au_location) 178 | 179 | return (trans_img, self.imgs[index]), au_location, label 180 | 181 | def __len__(self): 182 | return len(self.imgs) 183 | 184 | class MyDataset_AU(data.Dataset): 185 | def __init__(self, imgs, labels, landmarks, flag, Model, transform=None, target_transform=None, loader=default_loader): 186 | self.imgs = imgs 187 | self.labels = labels 188 | self.landmarks = landmarks 189 | self.transform = transform 190 | self.target_transform = target_transform 191 | self.loader = loader 192 | self.flag = flag 193 | self.Model = Model 194 | 195 | def __getitem__(self, index): 196 | img, label, landmark = self.loader(self.imgs[index]), copy.deepcopy(self.labels[index]), copy.deepcopy(self.landmarks[index]) 197 | ori_img_w, ori_img_h = img.size 198 | 199 | left = np.min(landmark[:,0]) 200 | right = np.max(landmark[:,0]) 201 | upper = np.min(landmark[:,1]) 202 | lower = np.max(landmark[:,1]) 203 | 204 | # Added by Xy 205 | if self.flag=='train': 206 | enlarge_bbox = True 207 | random_crop = True 208 | random_flip = True 209 | elif self.flag=='test': 210 | enlarge_bbox = True 211 | random_crop = False 212 | random_flip = False 213 | 214 | padding = max(0, int((right - left)*0.15)) # enlarge bbox 215 | half_padding = int(padding*0.5) 216 | 217 | if enlarge_bbox: 218 | left = max(left - half_padding, 0) 219 | right = min(right + half_padding, ori_img_w) 220 | upper = max(upper - half_padding, 0) 221 | lower = min(lower + half_padding, ori_img_h) 222 | 223 | if random_crop: 224 | offset_range = half_padding 225 | 226 | x_offset = random.randint(-offset_range, offset_range) 227 | y_offset = random.randint(-offset_range, offset_range) 228 | 229 | left = max(left + x_offset, 0) 230 | right = min(right + x_offset, ori_img_w) 231 | upper = max(upper + y_offset, 0) 232 | lower = min(lower + y_offset, ori_img_h) 233 | 234 | img = img.crop((left,upper,right,lower)) 235 | crop_img_w, crop_img_h = img.size 236 | 237 | landmark[:,0]-=left 238 | landmark[:,1]-=upper 239 | 240 | if random_flip and random.random() > 0.5: 241 | img = 
img.transpose(Image.FLIP_LEFT_RIGHT) 242 | landmark[:,0] = (right - left) - landmark[:,0] 243 | 244 | # Transform Image 245 | trans_img = self.transform(img) 246 | _, trans_img_w, trans_img_h = trans_img.size() 247 | 248 | # get au location 249 | landmark[:, 0] = landmark[:, 0] * trans_img_w / crop_img_w 250 | landmark[:, 1] = landmark[:, 1] * trans_img_h / crop_img_h 251 | 252 | if self.Model in ['ResNet-101', 'ResNet-50']: 253 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 56) 254 | for i in range(au_location.shape[0]): 255 | for j in range(4): 256 | if au_location[i,j]<= 11: 257 | au_location[i,j] = 12 258 | if au_location[i,j]>=45: 259 | au_location[i,j] = 44 260 | elif self.Model == 'ResNet-18': 261 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 112) 262 | for i in range(au_location.shape[0]): 263 | for j in range(4): 264 | if au_location[i,j]<= 23: 265 | au_location[i,j] = 24 266 | if au_location[i,j]>=89: 267 | au_location[i,j] = 88 268 | 269 | # Visualization 270 | # img_transform = img.resize((trans_img_w ,trans_img_h)) 271 | # draw = ImageDraw.Draw(img_transform) 272 | # for idx in range(len(landmark[:,0])): 273 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 274 | # img_transform.save('./vis/{}.jpg'.format(index)) 275 | 276 | if self.target_transform is not None: 277 | label = self.transform(label) 278 | 279 | au_location = torch.LongTensor(au_location) 280 | 281 | return trans_img, au_location, label 282 | 283 | def __len__(self): 284 | return len(self.imgs) 285 | 286 | class MyDataset_EM_Artificial(data.Dataset): 287 | def __init__(self, imgs, labels, landmarks, bboxs, flag, needAU, Model, Experiment, transform=None, target_transform=None, loader=default_loader): 288 | self.imgs = imgs 289 | self.labels = labels 290 | self.landmarks = landmarks 291 | self.bboxs = bboxs 292 | self.transform = transform 293 | self.target_transform = target_transform 294 | self.loader = loader 295 | self.flag = flag 296 | self.needAU = needAU 297 | self.Model = Model 298 | self.Experiment = Experiment 299 | self.PriorKnowledgeTable_Train = np.array([ 300 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 301 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.0], 302 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0], 303 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], 304 | [1.0, 0.0, 0.5, 0.0, 0.5, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0], 305 | [0.0, 0.0, 1.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 1.0, 1.0, 0.0, 0.0], 306 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],dtype=np.float) 307 | self.PriorKnowledgeTable_Test = np.array([ 308 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 309 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0], 310 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 311 | [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 312 | [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 313 | [0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0], 314 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],dtype=np.float) 315 | 316 | def __getitem__(self, index): 317 | img, 
label, landmark, bbox = self.loader(self.imgs[index]), copy.deepcopy(self.labels[index]), copy.deepcopy(self.landmarks[index]), copy.deepcopy(self.bboxs[index]) 318 | ori_img_w, ori_img_h = img.size 319 | 320 | # Need AU Label 321 | if self.Experiment=='AU': 322 | if self.flag=='train': 323 | label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 324 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 325 | if label[i] < 0.5: 326 | label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 327 | elif label[i] < 1.0: 328 | label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 329 | else: 330 | label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 331 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 332 | elif self.flag=='test': 333 | label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 334 | 335 | # Face Rotation 336 | # left = np.min(landmark[:,0]) 337 | # right = np.max(landmark[:,0]) 338 | # upper = np.min(landmark[:,1]) 339 | # lower = np.max(landmark[:,1]) 340 | 341 | # BoundingBox 342 | left = bbox[0] 343 | upper = bbox[1] 344 | right = bbox[2] 345 | lower = bbox[3] 346 | 347 | # Visualization 348 | # draw = ImageDraw.Draw(img) 349 | # for idx in range(len(landmark[:,0])): 350 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 351 | # img.save('./vis_/{}.jpg'.format(index)) 352 | 353 | enlarge_bbox = True 354 | 355 | if self.flag=='train': 356 | random_crop = True 357 | random_flip = True 358 | elif self.flag=='test': 359 | random_crop = False 360 | random_flip = False 361 | 362 | if self.Model==7: 363 | left = np.min(landmark[:,0]) 364 | right = np.max(landmark[:,0]) 365 | upper = np.min(landmark[:,1]) 366 | lower = np.max(landmark[:,1]) 367 | 368 | enlarge_bbox = False 369 | random_crop = False 370 | random_flip = False 371 | 372 | padding = max(0, int((right - left)*0.15)) # enlarge bbox 373 | half_padding = int(padding*0.5) 374 | 375 | if enlarge_bbox: 376 | left = max(left - half_padding, 0) 377 | right = min(right + half_padding, ori_img_w) 378 | upper = max(upper - half_padding, 0) 379 | lower = min(lower + half_padding, ori_img_h) 380 | 381 | if random_crop: 382 | offset_range = half_padding 383 | 384 | x_offset = random.randint(-offset_range, offset_range) 385 | y_offset = random.randint(-offset_range, offset_range) 386 | 387 | left = max(left + x_offset, 0) 388 | right = min(right + x_offset, ori_img_w) 389 | upper = max(upper + y_offset, 0) 390 | lower = min(lower + y_offset, ori_img_h) 391 | 392 | img = img.crop((left,upper,right,lower)) 393 | crop_img_w, crop_img_h = img.size 394 | 395 | landmark[:,0]-=left 396 | landmark[:,1]-=upper 397 | 398 | if random_flip and random.random() > 0.5: 399 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 400 | landmark[:,0] = (right - left) - landmark[:,0] 401 | 402 | # Transform Image 403 | trans_img = self.transform(img) 404 | _, trans_img_w, trans_img_h = trans_img.size() 405 | 406 | # Visualization 407 | # img_transform = img.resize((crop_img_w ,crop_img_h)) 408 | # draw = ImageDraw.Draw(img_transform) 409 | # for idx in range(len(landmark[:,0])): 410 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 411 | # img_transform.save('./vis/{}.jpg'.format(index)) 412 | 413 | if self.target_transform is not None: 414 | label = self.transform(label) 415 | 416 | # Don't need AU 417 | if not 
self.needAU: 418 | return (trans_img, self.imgs[index]), 0, label 419 | 420 | # get au location 421 | landmark[:, 0] = landmark[:, 0] * trans_img_w / crop_img_w 422 | landmark[:, 1] = landmark[:, 1] * trans_img_h / crop_img_h 423 | 424 | if self.Model in ['ResNet-101', 'ResNet-50']: 425 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 56) 426 | for i in range(au_location.shape[0]): 427 | for j in range(4): 428 | if au_location[i,j]<= 11: 429 | au_location[i,j] = 12 430 | if au_location[i,j]>=45: 431 | au_location[i,j] = 44 432 | elif self.Model == 'ResNet-18': 433 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 112) 434 | for i in range(au_location.shape[0]): 435 | for j in range(4): 436 | if au_location[i,j]<= 23: 437 | au_location[i,j] = 24 438 | if au_location[i,j]>=89: 439 | au_location[i,j] = 88 440 | 441 | # Visualization 442 | # img_transform = img.resize((trans_img_w ,trans_img_h)) 443 | # draw = ImageDraw.Draw(img_transform) 444 | # for idx in range(len(landmark[:,0])): 445 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 446 | # img_transform.save('./vis/{}.jpg'.format(index)) 447 | 448 | au_location = torch.LongTensor(au_location) 449 | 450 | # Need AU Label 451 | if self.Experiment=='Fuse': 452 | if self.flag=='train': 453 | AU_label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 454 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 455 | if AU_label[i] < 0.5: 456 | AU_label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 457 | elif AU_label[i] < 1.0: 458 | AU_label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 459 | else: 460 | AU_label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 461 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 462 | elif self.flag=='test': 463 | AU_label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 464 | 465 | return (trans_img, self.imgs[index]), au_location, (label, AU_label) 466 | 467 | return (trans_img, self.imgs[index]), au_location, label 468 | 469 | def __len__(self): 470 | return len(self.imgs) 471 | 472 | class MyDataset_EM_Artificial_Compound(data.Dataset): 473 | def __init__(self, imgs, labels, landmarks, bboxs, flag, needAU, Model, Experiment, transform=None, target_transform=None, loader=default_loader): 474 | self.imgs = imgs 475 | self.labels = labels 476 | self.landmarks = landmarks 477 | self.bboxs = bboxs 478 | self.transform = transform 479 | self.target_transform = target_transform 480 | self.loader = loader 481 | self.flag = flag 482 | self.needAU = needAU 483 | self.Model = Model 484 | self.Experiment = Experiment 485 | self.PriorKnowledgeTable_Train = np.array([ 486 | [1.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.5], 487 | [0.0, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 488 | [1.0, 0.5, 1.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 489 | [0.0, 0.0, 1.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0], 490 | [1.0, 0.5, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 491 | [0.5, 0.0, 1.0, 0.0, 0.5, 0.0, 0.5, 1.0, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0], 492 | [0.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 493 | [1.0, 1.0, 0.5, 1.0, 
0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.5], 494 | [0.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 495 | [0.0, 0.0, 1.0, 0.0, 0.0, 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.5, 0.0, 0.0], 496 | [1.0, 1.0, 0.5, 1.0, 0.0, 0.0, 0.5, 1.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0],],dtype=np.float) 497 | self.PriorKnowledgeTable_Test = np.array([ 498 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 499 | [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 500 | [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 501 | [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 502 | [1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 503 | [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 504 | [0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 505 | [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], 506 | [0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 507 | [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], 508 | [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],],dtype=np.float) 509 | 510 | def __getitem__(self, index): 511 | img, label, landmark, bbox = self.loader(self.imgs[index]), copy.deepcopy(self.labels[index]), copy.deepcopy(self.landmarks[index]), copy.deepcopy(self.bboxs[index]) 512 | ori_img_w, ori_img_h = img.size 513 | 514 | # Need AU Label 515 | if self.Experiment=='AU': 516 | if self.flag=='train': 517 | label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 518 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 519 | if label[i] < 0.5: 520 | label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 521 | elif label[i] < 1.0: 522 | label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 523 | else: 524 | label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 525 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 526 | elif self.flag=='test': 527 | label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 528 | 529 | # Face Rotation 530 | # left = np.min(landmark[:,0]) 531 | # right = np.max(landmark[:,0]) 532 | # upper = np.min(landmark[:,1]) 533 | # lower = np.max(landmark[:,1]) 534 | 535 | # BoundingBox 536 | left = bbox[0] 537 | upper = bbox[1] 538 | right = bbox[2] 539 | lower = bbox[3] 540 | 541 | # Visualization 542 | # draw = ImageDraw.Draw(img) 543 | # for idx in range(len(landmark[:,0])): 544 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 545 | # img.save('./vis_/{}.jpg'.format(index)) 546 | 547 | enlarge_bbox = True 548 | 549 | if self.flag=='train': 550 | random_crop = True 551 | random_flip = True 552 | elif self.flag=='test': 553 | random_crop = False 554 | random_flip = False 555 | 556 | padding = max(0, int((right - left)*0.15)) # enlarge bbox 557 | half_padding = int(padding*0.5) 558 | 559 | if enlarge_bbox: 560 | left = max(left - half_padding, 0) 561 | right = min(right + half_padding, ori_img_w) 562 | upper = max(upper - half_padding, 0) 563 | lower = 
min(lower + half_padding, ori_img_h) 564 | 565 | if random_crop: 566 | offset_range = half_padding 567 | 568 | x_offset = random.randint(-offset_range, offset_range) 569 | y_offset = random.randint(-offset_range, offset_range) 570 | 571 | left = max(left + x_offset, 0) 572 | right = min(right + x_offset, ori_img_w) 573 | upper = max(upper + y_offset, 0) 574 | lower = min(lower + y_offset, ori_img_h) 575 | 576 | img = img.crop((left,upper,right,lower)) 577 | crop_img_w, crop_img_h = img.size 578 | 579 | landmark[:,0]-=left 580 | landmark[:,1]-=upper 581 | 582 | if random_flip and random.random() > 0.5: 583 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 584 | landmark[:,0] = (right - left) - landmark[:,0] 585 | 586 | # Transform Image 587 | trans_img = self.transform(img) 588 | _, trans_img_w, trans_img_h = trans_img.size() 589 | 590 | # Visualization 591 | # img_transform = img.resize((crop_img_w ,crop_img_h)) 592 | # draw = ImageDraw.Draw(img_transform) 593 | # for idx in range(len(landmark[:,0])): 594 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 595 | # img_transform.save('./vis/{}.jpg'.format(index)) 596 | 597 | if self.target_transform is not None: 598 | label = self.transform(label) 599 | 600 | # Don't need AU 601 | if not self.needAU: 602 | return (trans_img, self.imgs[index]), 0, label 603 | 604 | # get au location 605 | landmark[:, 0] = landmark[:, 0] * trans_img_w / crop_img_w 606 | landmark[:, 1] = landmark[:, 1] * trans_img_h / crop_img_h 607 | 608 | if self.Model in ['ResNet-101', 'ResNet-50']: 609 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 56) 610 | for i in range(au_location.shape[0]): 611 | for j in range(4): 612 | if au_location[i,j]<= 11: 613 | au_location[i,j] = 12 614 | if au_location[i,j]>=45: 615 | au_location[i,j] = 44 616 | elif self.Model == 'ResNet-18': 617 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 112) 618 | for i in range(au_location.shape[0]): 619 | for j in range(4): 620 | if au_location[i,j]<= 23: 621 | au_location[i,j] = 24 622 | if au_location[i,j]>=89: 623 | au_location[i,j] = 88 624 | 625 | # Visualization 626 | # img_transform = img.resize((trans_img_w ,trans_img_h)) 627 | # draw = ImageDraw.Draw(img_transform) 628 | # for idx in range(len(landmark[:,0])): 629 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 630 | # img_transform.save('./vis/{}.jpg'.format(index)) 631 | 632 | au_location = torch.LongTensor(au_location) 633 | 634 | if self.Experiment=='Fuse': 635 | if self.flag=='train': 636 | AU_label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 637 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 638 | if AU_label[i] < 0.5: 639 | AU_label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 640 | elif AU_label[i] < 1.0: 641 | AU_label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 642 | else: 643 | AU_label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 644 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 645 | elif self.flag=='test': 646 | AU_label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 647 | 648 | return (trans_img, self.imgs[index]), au_location, (label, AU_label) 649 | 650 | return (trans_img, self.imgs[index]), au_location, label 651 | 652 | def __len__(self): 653 | return len(self.imgs) 654 | 655 | class 
MyDataset_EM_Artificial_noBbox(data.Dataset): 656 | def __init__(self, imgs, labels, landmarks, flag, needAU, Model, Experiment, transform=None, target_transform=None, loader=default_loader): 657 | self.imgs = imgs 658 | self.labels = labels 659 | self.landmarks = landmarks 660 | self.transform = transform 661 | self.target_transform = target_transform 662 | self.loader = loader 663 | self.flag = flag 664 | self.needAU = needAU 665 | self.Model = Model 666 | self.Experiment = Experiment 667 | self.PriorKnowledgeTable_Train = np.array([ 668 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 669 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.0], 670 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0], 671 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], 672 | [1.0, 0.0, 0.5, 0.0, 0.5, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0], 673 | [0.0, 0.0, 1.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 1.0, 1.0, 0.0, 0.0], 674 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],dtype=np.float) 675 | self.PriorKnowledgeTable_Test = np.array([ 676 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 677 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0], 678 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 679 | [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 680 | [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 681 | [0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0], 682 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],dtype=np.float) 683 | 684 | def __getitem__(self, index): 685 | img, label, landmark = self.loader(self.imgs[index]), copy.deepcopy(self.labels[index]), copy.deepcopy(self.landmarks[index]) 686 | ori_img_w, ori_img_h = img.size 687 | 688 | # Need AU Label 689 | if self.Experiment=='AU': 690 | if self.flag=='train': 691 | label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 692 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 693 | if label[i] < 0.5: 694 | label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 695 | elif label[i] < 1.0: 696 | label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 697 | else: 698 | label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 699 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 700 | elif self.flag=='test': 701 | label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 702 | 703 | # Face Rotation 704 | left = np.min(landmark[:,0]) 705 | right = np.max(landmark[:,0]) 706 | upper = np.min(landmark[:,1]) 707 | lower = np.max(landmark[:,1]) 708 | 709 | # Visualization 710 | # draw = ImageDraw.Draw(img) 711 | # for idx in range(len(landmark[:,0])): 712 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 713 | # img.save('./vis_/{}.jpg'.format(index)) 714 | 715 | enlarge_bbox = True 716 | 717 | if self.flag=='train': 718 | random_crop = True 719 | random_flip = True 720 | elif self.flag=='test': 721 | random_crop = False 722 | 
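# Minimal sketch of the soft AU pseudo-label scheme applied to `label` above
# (standalone version; `table` stands for a 7 x 17 prior matrix such as
# PriorKnowledgeTable_Train, and the function name is illustrative):
#   >>> import numpy as np
#   >>> def soft_au_label(table, em_label):
#   ...     au = table[em_label].copy()
#   ...     for i in range(au.shape[0]):
#   ...         if au[i] < 0.5:              # AU unrelated to this expression
#   ...             au[i] = np.random.uniform(0.00, 0.25)
#   ...         elif au[i] < 1.0:            # weakly related AU (prior 0.5)
#   ...             au[i] = np.random.uniform(0.50, 0.75)
#   ...         else:                        # strongly related AU (prior 1.0)
#   ...             au[i] = np.random.uniform(0.75, 1.00)
#   ...     return au
# At test time the binary PriorKnowledgeTable_Test row is used directly as the AU target.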
random_flip = False 723 | 724 | padding = max(0, int((right - left)*0.2)) # enlarge bbox, Defaults: 0.15 725 | half_padding = int(padding*0.5) 726 | 727 | if enlarge_bbox: 728 | left = max(left - half_padding, 0) 729 | right = min(right + half_padding, ori_img_w) 730 | upper = max(upper - half_padding, 0) 731 | lower = min(lower + half_padding, ori_img_h) 732 | 733 | if random_crop: 734 | offset_range = half_padding 735 | 736 | x_offset = random.randint(-offset_range, offset_range) 737 | y_offset = random.randint(-offset_range, offset_range) 738 | 739 | left = max(left + x_offset, 0) 740 | right = min(right + x_offset, ori_img_w) 741 | upper = max(upper + y_offset, 0) 742 | lower = min(lower + y_offset, ori_img_h) 743 | 744 | img = img.crop((left,upper,right,lower)) 745 | crop_img_w, crop_img_h = img.size 746 | 747 | landmark[:,0]-=left 748 | landmark[:,1]-=upper 749 | 750 | if random_flip and random.random() > 0.5: 751 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 752 | landmark[:,0] = (right - left) - landmark[:,0] 753 | 754 | # Transform Image 755 | trans_img = self.transform(img) 756 | _, trans_img_w, trans_img_h = trans_img.size() 757 | 758 | # Visualization 759 | # img_transform = img.resize((crop_img_w ,crop_img_h)) 760 | # draw = ImageDraw.Draw(img_transform) 761 | # for idx in range(len(landmark[:,0])): 762 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 763 | # img_transform.save('./vis/{}.jpg'.format(index)) 764 | 765 | if self.target_transform is not None: 766 | label = self.transform(label) 767 | 768 | # Don't need AU 769 | if not self.needAU: 770 | return (trans_img, self.imgs[index]), 0, label 771 | 772 | # get au location 773 | landmark[:, 0] = landmark[:, 0] * trans_img_w / crop_img_w 774 | landmark[:, 1] = landmark[:, 1] * trans_img_h / crop_img_h 775 | 776 | if self.Model in ['ResNet-101', 'ResNet-50']: 777 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 56) 778 | for i in range(au_location.shape[0]): 779 | for j in range(4): 780 | if au_location[i,j]<= 11: 781 | au_location[i,j] = 12 782 | if au_location[i,j]>=45: 783 | au_location[i,j] = 44 784 | elif self.Model == 'ResNet-18': 785 | au_location = get_au_loc(copy.deepcopy(landmark),trans_img_w, 112) 786 | for i in range(au_location.shape[0]): 787 | for j in range(4): 788 | if au_location[i,j]<= 23: 789 | au_location[i,j] = 24 790 | if au_location[i,j]>=89: 791 | au_location[i,j] = 88 792 | 793 | # Visualization 794 | # img_transform = img.resize((trans_img_w ,trans_img_h)) 795 | # draw = ImageDraw.Draw(img_transform) 796 | # for idx in range(len(landmark[:,0])): 797 | # draw.point((landmark[idx, 0], landmark[idx, 1])) 798 | # img_transform.save('./vis/{}.jpg'.format(index)) 799 | 800 | au_location = torch.LongTensor(au_location) 801 | 802 | if self.Experiment=='Fuse': 803 | if self.flag=='train': 804 | AU_label = self.PriorKnowledgeTable_Train[label].reshape(self.PriorKnowledgeTable_Train.shape[1],) 805 | for i in range(self.PriorKnowledgeTable_Train.shape[1]): 806 | if AU_label[i] < 0.5: 807 | AU_label[i] = np.random.uniform(0.0,0.25) # plan1: (0.0,0.4), plan2: (0.0,0.2), plan3: (0.0,0.25) 808 | elif AU_label[i] < 1.0: 809 | AU_label[i] = np.random.uniform(0.5,0.75) # plan1: (0.4,0.7), plan2: (0.6,0.8), plan3: (0.5,0.75) 810 | else: 811 | AU_label[i] = np.random.uniform(0.75,1.0) # plan1: (0.7,1.0), plan2: (0.8,1.0), plan3: (0.75,1.0) 812 | # label = self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 813 | elif self.flag=='test': 814 | AU_label = 
self.PriorKnowledgeTable_Test[label].reshape(self.PriorKnowledgeTable_Test.shape[1],) 815 | 816 | return (trans_img, self.imgs[index]), au_location, (label, AU_label) 817 | 818 | return (trans_img, self.imgs[index]), au_location, label 819 | 820 | def __len__(self): 821 | return len(self.imgs) 822 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | 5 | numOfAU = 17 6 | 7 | def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1): 8 | r"""Applies local response normalization over an input signal composed of 9 | several input planes, where channels occupy the second dimension. 10 | Applies normalization across channels. 11 | 12 | See :class:`~torch.nn.LocalResponseNorm` for details. 13 | """ 14 | dim = input.dim() 15 | if dim < 3: 16 | raise ValueError('Expected 3D or higher dimensionality \ 17 | input (got {} dimensions)'.format(dim)) 18 | div = input.mul(input).unsqueeze(1) 19 | if dim == 3: 20 | div = pad(div, (0, 0, size // 2, (size - 1) // 2)) 21 | div = avg_pool2d(div, (size, 1), stride=1).squeeze(1) 22 | else: 23 | sizes = input.size() 24 | div = div.view(sizes[0], 1, sizes[1], sizes[2], -1) 25 | div = pad(div, (0, 0, 0, 0, size // 2, (size - 1) // 2)) 26 | div = avg_pool3d(div, (size, 1, 1), stride=1).squeeze(1) 27 | div = div.mul(size) 28 | div = div.view(sizes) 29 | div = div.mul(alpha).add(k).pow(beta) 30 | return input / div 31 | 32 | 33 | class LocalResponseNorm(nn.Module): 34 | r"""Applies local response normalization over an input signal composed 35 | of several input planes, where channels occupy the second dimension. 36 | Applies normalization across channels. 37 | 38 | .. math:: 39 | b_{c} = a_{c}\left(k + \frac{\alpha}{n} 40 | \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta} 41 | 42 | Args: 43 | size: amount of neighbouring channels used for normalization 44 | alpha: multiplicative factor. Default: 0.0001 45 | beta: exponent. Default: 0.75 46 | k: additive factor. 
Default: 1 47 | 48 | Shape: 49 | - Input: :math:`(N, C, ...)` 50 | - Output: :math:`(N, C, ...)` (same shape as input) 51 | 52 | Examples:: 53 | 54 | >>> lrn = nn.LocalResponseNorm(2) 55 | >>> signal_2d = torch.randn(32, 5, 24, 24) 56 | >>> signal_4d = torch.randn(16, 5, 7, 7, 7, 7) 57 | >>> output_2d = lrn(signal_2d) 58 | >>> output_4d = lrn(signal_4d) 59 | 60 | """ 61 | 62 | def __init__(self, size, alpha=1e-4, beta=0.75, k=1): 63 | super(LocalResponseNorm, self).__init__() 64 | self.size = size 65 | self.alpha = alpha 66 | self.beta = beta 67 | self.k = k 68 | 69 | def forward(self, input): 70 | return local_response_norm(input, self.size, self.alpha, self.beta, self.k) 71 | 72 | def extra_repr(self): 73 | return '{size}, alpha={alpha}, beta={beta}, k={k}'.format(**self.__dict__) 74 | 75 | 76 | class ResNet101(nn.Module): 77 | def __init__(self, Dim): 78 | super(ResNet101, self).__init__() 79 | 80 | self.Dim = Dim 81 | 82 | # Original Model 83 | self.backbone = models.resnet101(pretrained=False) 84 | self.backbone.fc = nn.Linear(in_features=2048, out_features=7, bias=True) 85 | 86 | # Universal Module 87 | self.maxpool1 = nn.MaxPool2d(kernel_size=8, stride=8) # W * H --> W/8 * H/8 88 | self.maxpool2 = nn.MaxPool2d(kernel_size=4, stride=4) # W * H --> W/4 * H/4 89 | self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2) # W * H --> W/2 * H/2 90 | self.GAP = nn.AdaptiveAvgPool2d((1,1)) # B * C * W * H --> B * C * 1 * 1 91 | 92 | # Expression Recognition 93 | self.LRN_em = LocalResponseNorm(2) 94 | self.reduce_dim_em = nn.Sequential( 95 | nn.Conv2d(in_channels=3840,out_channels=1024,kernel_size=3,stride=1,padding=1,bias=False), 96 | nn.BatchNorm2d(1024), 97 | nn.ReLU(inplace=True), 98 | ) 99 | self.pred_em = nn.Linear(in_features=1024, out_features=7, bias=True) 100 | 101 | # AU Recognition 102 | self.LRN_au = LocalResponseNorm(2) 103 | self.pred_au = nn.Sequential(nn.Linear(256 * numOfAU * 2, numOfAU), nn.Sigmoid()) 104 | 105 | # Deconvolution Layer 106 | # ConvTranspose2d: output = (input - 1) * stride + output_padding - 2 * padding + kernel_size 107 | self.deconv_layer1 = nn.Sequential( 108 | nn.ConvTranspose2d( 109 | in_channels=2048, 110 | out_channels=1024, 111 | kernel_size=4, 112 | stride=2, 113 | padding=1, 114 | output_padding=0, 115 | bias=False), 116 | nn.BatchNorm2d(1024), 117 | nn.ReLU(inplace=True), 118 | ) 119 | self.deconv_layer2 = nn.Sequential( 120 | nn.ConvTranspose2d( 121 | in_channels=1024, 122 | out_channels=512, 123 | kernel_size=4, 124 | stride=2, 125 | padding=1, 126 | output_padding=0, 127 | bias=False), 128 | nn.BatchNorm2d(512), 129 | nn.ReLU(inplace=True), 130 | ) 131 | self.deconv_layer3 = nn.Sequential( 132 | nn.ConvTranspose2d( 133 | in_channels=512, 134 | out_channels=256, 135 | kernel_size=4, 136 | stride=2, 137 | padding=1, 138 | output_padding=0, 139 | bias=False), 140 | nn.BatchNorm2d(256), 141 | nn.ReLU(inplace=True), 142 | ) 143 | 144 | # Reduce Dimension 145 | self.reduce_dim_1_au = nn.Sequential( 146 | nn.Conv2d(in_channels=2048,out_channels=1024,kernel_size=3,stride=1,padding=1,bias=False), 147 | nn.BatchNorm2d(1024), 148 | nn.ReLU(inplace=True), 149 | ) 150 | self.reduce_dim_2_au = nn.Sequential( 151 | nn.Conv2d(in_channels=1024,out_channels=512,kernel_size=3,stride=1,padding=1,bias=False), 152 | nn.BatchNorm2d(512), 153 | nn.ReLU(inplace=True), 154 | ) 155 | self.reduce_dim_3_au = nn.Sequential( 156 | nn.Conv2d(in_channels=512,out_channels=64,kernel_size=3,stride=1,padding=1,bias=False), 157 | nn.BatchNorm2d(64), 158 | nn.ReLU(inplace=True), 159 
| ) 160 | # Crop Net 161 | self.Crop_Net_1 = nn.ModuleList([ nn.Sequential( nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), nn.ReLU()) for i in range(numOfAU * 2) ]) 162 | self.Crop_Net_2 = nn.ModuleList([ nn.Sequential( nn.Linear(in_features=64*24*24, out_features=256), nn.ReLU(), nn.Dropout(p=0.5) ) for i in range(numOfAU * 2) ]) 163 | 164 | # Bilinear + Attention 165 | self.fc_em_fuse_3 = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(1024, self.Dim), nn.BatchNorm1d(self.Dim), nn.Tanh()) 166 | self.fc_au_fuse_3 = nn.ModuleList([ nn.Sequential(nn.Dropout(p=0.5), nn.Linear(512, self.Dim), nn.BatchNorm1d(self.Dim), nn.Tanh()) for i in range(numOfAU) ]) 167 | 168 | # self.fc_attention_fuse_3 = nn.Sequential(nn.Linear(self.Dim,1),) # Share Weight, no ReLU 169 | self.fc_attention_fuse_3 = nn.Sequential(nn.Linear(self.Dim, 1), nn.ReLU(inplace=True)) # Share Weight, ReLU 170 | 171 | self.pred_em_fuse_3 = nn.Linear(1024 + 512, 7) 172 | 173 | # Generate AU Label 174 | self.InitTable = torch.Tensor([ 175 | [1.0, 1.0, 0.1, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0], 176 | [1.0, 1.0, 1.0, 1.0, 0.1, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.5, 0.1, 0.5, 0.1], 177 | [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.1, 0.1, 0.1, 0.5, 0.1, 0.1, 0.1, 0.5, 0.1], 178 | [0.1, 0.1, 0.1, 0.1, 1.0, 0.5, 0.1, 0.1, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.1], 179 | [1.0, 0.1, 0.5, 0.1, 0.5, 1.0, 0.1, 0.1, 0.1, 1.0, 0.1, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1], 180 | [0.1, 0.1, 1.0, 1.0, 0.1, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.1, 1.0, 1.0, 0.1, 0.1], 181 | [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]).float() 182 | self.PriorKnowledgeTable = nn.Parameter(self.InitTable) 183 | 184 | # Generate AU Mask 185 | self.Mask_A = torch.Tensor([ 186 | [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 187 | [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], 188 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 189 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 190 | [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 191 | [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], 192 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).float().cuda() 193 | self.Mask_B = torch.Tensor([ 194 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 195 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0], 196 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 197 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 198 | [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 199 | [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 200 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).float().cuda() 201 | self.Mask_C = torch.Tensor([ 202 | [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0], 203 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0], 204 | [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0], 205 | [1.0, 1.0, 1.0, 1.0, 0.0, 
0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], 206 | [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], 207 | [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], 208 | [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]).float().cuda() 209 | 210 | def forward(self, input, args): # input: 3 * 224 * 224 211 | 212 | if args.Experiment == 'EM': 213 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) # 64 * 56 * 56 214 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 215 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 216 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 217 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 218 | 219 | featureMap1 = self.maxpool1(featureMap1) # 256 * 56 * 56 --> 256 * 7 * 7 220 | featureMap2 = self.maxpool2(featureMap2) # 512 * 28 * 28 --> 512 * 7 * 7 221 | featureMap3 = self.maxpool3(featureMap3) # 1024 * 14 * 14 --> 1024 * 7 * 7 222 | featureMap = torch.cat((torch.cat((featureMap1, featureMap2), dim=1), torch.cat((featureMap3, featureMap4), dim=1)), dim=1) # 3840 * 7 * 7 223 | 224 | # Save GPU Memory 225 | del featureMap0, featureMap1, featureMap2, featureMap3, featureMap4 226 | torch.cuda.empty_cache() 227 | 228 | featureMap = self.LRN_em(featureMap) # 3840 * 7 * 7 --> 3840 * 7 * 7 229 | featureMap = self.reduce_dim_em(featureMap) # 3840 * 7 * 7 --> 1024 * 7 * 7 230 | 231 | feature = self.GAP(featureMap) # 1024 * 7 * 7 --> 1024 * 1 * 1 232 | 233 | # Save GPU Memory 234 | del featureMap 235 | torch.cuda.empty_cache() 236 | 237 | feature = feature.view(feature.size(0),feature.size(1)) # 1024 * 1 * 1 --> 1024 238 | pred = self.pred_em(feature) # 1024 --> 7 239 | 240 | return pred 241 | 242 | elif args.Experiment == 'AU': 243 | 244 | input, au_loc = input # input = (input, au_loc) 245 | 246 | with torch.no_grad(): 247 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) # 64 * 56 * 56 248 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 249 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 250 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 251 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 252 | 253 | deconv_featureMap3 = self.deconv_layer1(featureMap4) # 2048 * 7 * 7 --> 1024 * 14 * 14 254 | deconv_featureMap3 = torch.cat((featureMap3, deconv_featureMap3), dim=1) # cat(1024 * 14 * 14, 1024 * 14 * 14) = 2048 * 14 * 14 255 | deconv_featureMap3 = self.reduce_dim_1_au(deconv_featureMap3) # 2048 * 14 * 14 --> 1024 * 14 * 14 256 | 257 | deconv_featureMap2 = self.deconv_layer2(deconv_featureMap3) # 1024 * 14 * 14 --> 512 * 28 * 28 258 | deconv_featureMap2 = torch.cat((featureMap2, deconv_featureMap2), dim=1) # cat(512 * 28 * 28, 512 * 28 * 28) = 1024 * 28 * 28 259 | deconv_featureMap2 = self.reduce_dim_2_au(deconv_featureMap2) # 1024 * 28 * 28 --> 512 * 28 * 28 260 | 261 | deconv_featureMap1 = self.deconv_layer3(deconv_featureMap2) # 512 * 28 * 28 --> 256 * 56 * 56 262 | deconv_featureMap1 = torch.cat((featureMap1, deconv_featureMap1), dim=1) # cat(256 * 56 * 56, 256 * 56 * 56) = 512 * 56 * 56 263 | deconv_featureMap1 = self.reduce_dim_3_au(deconv_featureMap1) # 512 * 56 * 56 --> 64 * 56 * 56 264 | 265 | deconv_featureMap = self.LRN_au(deconv_featureMap1) # 64 * 56 * 56 --> 64 * 56 * 56 266 | 267 | au_featureMap 
= self.crop_au_featureMap(deconv_featureMap, au_loc) # crop au feature: (2 * numOfAU) * 256 268 | au_featureMap = au_featureMap.view(au_featureMap.size(0), -1) # (2 * numOfAU) * 256 --> (2 * numOfAU) * 256 269 | pred = self.pred_au(au_featureMap) # (2 * numOfAU) * 256 --> (2 * numOfAU) 270 | 271 | return pred 272 | 273 | elif args.Experiment == 'Fuse': 274 | 275 | input, au_loc = input # input = (input, au_loc) 276 | 277 | with torch.no_grad(): 278 | # Feature Map 279 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) # 64 * 56 * 56 280 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 281 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 282 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 283 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 284 | 285 | # AU 286 | deconv_featureMap3 = self.deconv_layer1(featureMap4) # 2048 * 7 * 7 --> 1024 * 14 * 14 287 | deconv_featureMap3 = torch.cat((featureMap3,deconv_featureMap3),dim=1) # cat(1024 * 14 * 14, 1024 * 14 * 14) = 2048 * 14 * 14 288 | deconv_featureMap3 = self.reduce_dim_1_au(deconv_featureMap3) # 2048 * 14 * 14 --> 1024 * 14 * 14 289 | 290 | deconv_featureMap2 = self.deconv_layer2(deconv_featureMap3) # 1024 * 14 * 14 --> 512 * 28 * 28 291 | deconv_featureMap2 = torch.cat((featureMap2,deconv_featureMap2),dim=1) # cat(512 * 28 * 28, 512 * 28 * 28) = 1024 * 28 * 28 292 | deconv_featureMap2 = self.reduce_dim_2_au(deconv_featureMap2) # 1024 * 28 * 28 --> 512 * 28 * 28 293 | 294 | deconv_featureMap1 = self.deconv_layer3(deconv_featureMap2) # 512 * 28 * 28 --> 256 * 56 * 56 295 | deconv_featureMap1 = torch.cat((featureMap1,deconv_featureMap1),dim=1) # cat(256 * 56 * 56, 256 * 56 * 56) = 512 * 56 * 56 296 | deconv_featureMap1 = self.reduce_dim_3_au(deconv_featureMap1) # 512 * 56 * 56 --> 64 * 56 * 56 297 | 298 | deconv_featureMap = self.LRN_au(deconv_featureMap1) # 64 * 56 * 56 --> 64 * 56 * 56 299 | 300 | au_featureMap = self.crop_au_featureMap(deconv_featureMap,au_loc) # crop au feature: (2*numOfAU) * 256 301 | 302 | # EM 303 | featureMap1 = self.maxpool1(featureMap1) # 256 * 56 * 56 --> 256 * 7 * 7 304 | featureMap2 = self.maxpool2(featureMap2) # 512 * 28 * 28 --> 512 * 7 * 7 305 | featureMap3 = self.maxpool3(featureMap3) # 1024 * 14 * 14 --> 1024 * 7 * 7 306 | featureMap = torch.cat((torch.cat((featureMap1,featureMap2),dim=1),torch.cat((featureMap3,featureMap4),dim=1)),dim=1) # 3840 * 7 * 7 307 | 308 | featureMap = self.LRN_em(featureMap) # 3840 * 7 * 7 --> 3840 * 7 * 7 309 | featureMap = self.reduce_dim_em(featureMap) # 3840 * 7 * 7 --> 1024 * 7 * 7 310 | feature = self.GAP(featureMap) # 1024 * 7 * 7 --> 1024 * 1 * 1 311 | feature = feature.view(feature.size(0),feature.size(1)) # 1024 * 1 * 1 --> 1024 312 | 313 | # MultiScale 314 | pred1 = self.pred_em(feature) 315 | 316 | # Bilinear 317 | # AU_feature = torch.cat((au_featureMap[:,0:12,:],au_featureMap[:,12:,:]),dim=2) 318 | AU_feature = au_featureMap.view(au_featureMap.size(0), 2, numOfAU, au_featureMap.size(2)).transpose(1, 2).contiguous().view(au_featureMap.size(0), numOfAU, -1) # numOfAU * 512 319 | AU_Dim_feature = torch.zeros((AU_feature.size(0), numOfAU, self.Dim)).cuda() # numOfAU * Dim 320 | for i in range(numOfAU): 321 | AU_Dim_feature[:, i, :] = self.fc_au_fuse_3[i](AU_feature[:, i, :]) 322 | 323 | EM_Dim_feature = self.fc_em_fuse_3(feature) # Dim 324 | EM_Dim_feature = EM_Dim_feature.view(EM_Dim_feature.size(0), 1, self.Dim).repeat(1, numOfAU, 1) # numOfAU * 
Dim 325 | 326 | Attention = EM_Dim_feature * AU_Dim_feature # numOfAU * Dim 327 | 328 | Attention_Result = torch.zeros((Attention.size(0), numOfAU, 1)).cuda() # numOfAU * 1 329 | for i in range(numOfAU): 330 | Attention_Result[:, i, :] = self.fc_attention_fuse_3(Attention[:, i, :]) # Share Weight FC 331 | 332 | # Attention_Result = nn.Sigmoid()(Attention_Result) # Sigmoid 333 | Attention_Result = nn.Softmax(dim=1)(Attention_Result) # Softmax 334 | 335 | au_prob = Attention_Result.view(Attention_Result.size(0), numOfAU) 336 | 337 | Attention_Result = Attention_Result.repeat(1, 1, 512) # numOfAU * 512 338 | 339 | Result = Attention_Result * AU_feature # numOfAU * 512 340 | 341 | Result = Result.sum(dim=1) # 512 342 | 343 | Result = torch.cat((feature, Result),dim=1) 344 | 345 | # Bilinear Pooling 346 | pred2 = self.pred_em_fuse_3(Result) 347 | 348 | return pred1, pred2, au_prob 349 | 350 | def crop_au_featureMap(self, deconv_featureMap, au_location): 351 | au = [] 352 | for i in range(numOfAU): 353 | au.append(au_location[:,i,:]) 354 | 355 | batch_size = deconv_featureMap.size(0) 356 | map_ch = deconv_featureMap.size(1) 357 | map_len = deconv_featureMap.size(2) 358 | 359 | grid_ch = map_ch 360 | grid_len = int(map_len * 24 / 56) 361 | 362 | feature_list = [] 363 | for i in range(numOfAU): 364 | grid1_list = [] 365 | grid2_list = [] 366 | for j in range(batch_size): 367 | h_min_1 = au[i][j,1]-int(grid_len/2) 368 | h_max_1 = au[i][j,1]+int(grid_len/2) 369 | w_min_1 = au[i][j,0]-int(grid_len/2) 370 | w_max_1 = au[i][j,0]+int(grid_len/2) 371 | 372 | h_min_2 = au[i][j,3]-int(grid_len/2) 373 | h_max_2 = au[i][j,3]+int(grid_len/2) 374 | w_min_2 = au[i][j,2]-int(grid_len/2) 375 | w_max_2 = au[i][j,2]+int(grid_len/2) 376 | # grid_1 = deconv_featureMap[j, :, h_min_1:h_max_1, w_min_1:w_max_1] 377 | # grid_2 = deconv_featureMap[j, :, h_min_2:h_max_2, w_min_2:w_max_2] 378 | 379 | map_h_min_1 = max(0, h_min_1) 380 | map_h_max_1 = min(map_len, h_max_1) 381 | map_w_min_1 = max(0, w_min_1) 382 | map_w_max_1 = min(map_len, w_max_1) 383 | 384 | map_h_min_2 = max(0, h_min_2) 385 | map_h_max_2 = min(map_len, h_max_2) 386 | map_w_min_2 = max(0, w_min_2) 387 | map_w_max_2 = min(map_len, w_max_2) 388 | 389 | grid_h_min_1 = max(0, 0-h_min_1) 390 | grid_h_max_1 = grid_len + min(0, map_len-h_max_1) 391 | grid_w_min_1 = max(0, 0-w_min_1) 392 | grid_w_max_1 = grid_len + min(0, map_len-w_max_1) 393 | 394 | grid_h_min_2 = max(0, 0-h_min_2) 395 | grid_h_max_2 = grid_len + min(0, map_len-h_max_2) 396 | grid_w_min_2 = max(0, 0-w_min_2) 397 | grid_w_max_2 = grid_len + min(0, map_len-w_max_2) 398 | 399 | grid_1 = torch.zeros(grid_ch, grid_len, grid_len) 400 | grid_2 = torch.zeros(grid_ch, grid_len, grid_len) 401 | grid_1 = grid_1.cuda() 402 | grid_2 = grid_2.cuda() 403 | 404 | grid_1[:, grid_h_min_1:grid_h_max_1, grid_w_min_1:grid_w_max_1] = deconv_featureMap[j, :, map_h_min_1:map_h_max_1, map_w_min_1:map_w_max_1] 405 | grid_2[:, grid_h_min_2:grid_h_max_2, grid_w_min_2:grid_w_max_2] = deconv_featureMap[j, :, map_h_min_2:map_h_max_2, map_w_min_2:map_w_max_2] 406 | 407 | grid1_list.append(grid_1) 408 | grid2_list.append(grid_2) 409 | 410 | input1 = torch.stack(grid1_list, dim=0) 411 | input2 = torch.stack(grid2_list, dim=0) 412 | feature_list.append(input1) 413 | feature_list.append(input2) 414 | 415 | # feature list: (numOfAU * 2) * batch * 1024 * 3 * 3 416 | output_list = [] 417 | # Feed into crop net individually 418 | for i in range(numOfAU * 2): 419 | output = self.Crop_Net_1[i](feature_list[i]) 420 | # output = 
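# Worked example of the boundary-safe crop above, with hypothetical numbers: for a
# 56 x 56 feature map (map_len = 56), grid_len = int(56 * 24 / 56) = 24 and the half
# width is int(24 / 2) = 12. An AU centre at x = 5 gives w_min_1 = -7 and
# w_max_1 = 17; the source slice is clamped to columns [0, 17) through
# map_w_min_1/map_w_max_1, while grid_w_min_1 = 7 and grid_w_max_1 = 24 place those
# 17 columns at the right of the zero-initialised 24-wide grid, so every cropped
# patch keeps the fixed 64 x 24 x 24 shape that Crop_Net_2 expects.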
self.GAP(output) 421 | output = output.view(batch_size, -1) 422 | output = self.Crop_Net_2[i](output) 423 | output_list.append(output) 424 | 425 | au_feature = torch.stack(output_list, dim=1) # batch * (2 * numOfAU) * 256 426 | # au_feature = torch.cat(output_list, 1) # batch * (12 * 2 * 150) 427 | # au_feature = au_feature.view(batch_size, -1, self.feature_dim_au) 428 | 429 | return au_feature 430 | 431 | def get_au_target(self, EM_Target): 432 | 433 | # get one-hot label 434 | EM_Target_One_Hot = torch.zeros(EM_Target.size(0), 7).float() 435 | EM_Target_One_Hot.scatter_(1, EM_Target.view(-1, 1).long(), 1.) 436 | EM_Target_One_Hot = EM_Target_One_Hot.cuda() 437 | 438 | # clamp 439 | Clamp_Matrix_A = torch.clamp(self.PriorKnowledgeTable, min=0.75, max=1.00) 440 | Clamp_Matrix_B = torch.clamp(self.PriorKnowledgeTable, min=0.50, max=0.75) 441 | Clamp_Matrix_C = torch.clamp(self.PriorKnowledgeTable, min=0.00, max=0.25) 442 | 443 | Clamp_Matrix_A = Clamp_Matrix_A * self.Mask_A 444 | Clamp_Matrix_B = Clamp_Matrix_B * self.Mask_B 445 | Clamp_Matrix_C = Clamp_Matrix_C * self.Mask_C 446 | 447 | Clamp_Matrix = Clamp_Matrix_A + Clamp_Matrix_B + Clamp_Matrix_C 448 | 449 | # get au target 450 | AU_Target = EM_Target_One_Hot.mm(Clamp_Matrix) 451 | # AU_Target = EM_Target_One_Hot.mm(self.PriorKnowledgeTable) 452 | 453 | return AU_Target 454 | 455 | 456 | class ResNet101_Compound(nn.Module): 457 | def __init__(self,Dim): 458 | super(ResNet101_Compound, self).__init__() 459 | 460 | self.Dim = Dim 461 | 462 | # Original Model 463 | self.backbone = models.resnet101(pretrained=True) 464 | self.backbone.fc = nn.Linear(in_features=2048, out_features=11, bias=True) 465 | 466 | # Universal Module 467 | self.maxpool1 = nn.MaxPool2d(kernel_size=8, stride=8) # W * H --> W/8 * H/8 468 | self.maxpool2 = nn.MaxPool2d(kernel_size=4, stride=4) # W * H --> W/4 * H/4 469 | self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2) # W * H --> W/2 * H/2 470 | self.GAP = nn.AdaptiveAvgPool2d((1,1)) # B * C * W * H --> B * C * 1 * 1 471 | 472 | # Expression Recognition 473 | self.LRN_em = LocalResponseNorm(2) 474 | self.reduce_dim_em = nn.Sequential( 475 | nn.Conv2d(in_channels=3840, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False), 476 | nn.BatchNorm2d(1024), 477 | nn.ReLU(inplace=True), 478 | ) 479 | self.pred_em = nn.Linear(in_features=1024, out_features=11, bias=True) 480 | 481 | # AU Recognition 482 | self.LRN_au = LocalResponseNorm(2) 483 | self.pred_au = nn.Sequential(nn.Linear(256 * numOfAU * 2, numOfAU), nn.Sigmoid()) 484 | self.pred_au_noCropNet2 = nn.Sequential(nn.Linear(64 * numOfAU * 2, numOfAU), nn.Sigmoid()) 485 | # ConvTranspose2d: output = (input - 1) * stride + output_padding - 2 * padding + kernel_size 486 | self.deconv_layer1 = nn.Sequential( 487 | nn.ConvTranspose2d( 488 | in_channels=2048, 489 | out_channels=1024, 490 | kernel_size=4, 491 | stride=2, 492 | padding=1, 493 | output_padding=0, 494 | bias=False), 495 | nn.BatchNorm2d(1024), 496 | nn.ReLU(inplace=True), 497 | ) 498 | self.deconv_layer2 = nn.Sequential( 499 | nn.ConvTranspose2d( 500 | in_channels=1024, 501 | out_channels=512, 502 | kernel_size=4, 503 | stride=2, 504 | padding=1, 505 | output_padding=0, 506 | bias=False), 507 | nn.BatchNorm2d(512), 508 | nn.ReLU(inplace=True), 509 | ) 510 | self.deconv_layer3 = nn.Sequential( 511 | nn.ConvTranspose2d( 512 | in_channels=512, 513 | out_channels=256, 514 | kernel_size=4, 515 | stride=2, 516 | padding=1, 517 | output_padding=0, 518 | bias=False), 519 | nn.BatchNorm2d(256), 520 | 
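# Worked instance of the ConvTranspose2d size formula quoted above, for the settings
# shared by all three deconv layers (kernel_size=4, stride=2, padding=1,
# output_padding=0): output = (input - 1) * 2 + 0 - 2 * 1 + 4 = 2 * input, so the
# spatial resolution doubles at every step, 7x7 -> 14x14 -> 28x28 -> 56x56, matching
# the skip features taken from layer3, layer2 and layer1 of the backbone.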
nn.ReLU(inplace=True), 521 | ) 522 | # Reduce Dimension 523 | self.reduce_dim_1_au = nn.Sequential( 524 | nn.Conv2d(in_channels=2048, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False), 525 | nn.BatchNorm2d(1024), 526 | nn.ReLU(inplace=True), 527 | ) 528 | self.reduce_dim_2_au = nn.Sequential( 529 | nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False), 530 | nn.BatchNorm2d(512), 531 | nn.ReLU(inplace=True), 532 | ) 533 | self.reduce_dim_3_au = nn.Sequential( 534 | nn.Conv2d(in_channels=512, out_channels=64, kernel_size=3, stride=1, padding=1 ,bias=False), 535 | nn.BatchNorm2d(64), 536 | nn.ReLU(inplace=True), 537 | ) 538 | # Crop Net 539 | self.Crop_Net_1 = nn.ModuleList([ nn.Sequential( nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), nn.ReLU() ) for i in range(numOfAU * 2) ]) 540 | self.Crop_Net_2 = nn.ModuleList([ nn.Sequential( nn.Linear(in_features=64*24*24, out_features=256), nn.ReLU(), nn.Dropout(p=0.5) ) for i in range(numOfAU * 2) ]) 541 | 542 | # Feature Fuse 543 | 544 | # Plan 3 : Attention 545 | # Dropout = 0.5 546 | self.fc_em_fuse_3 = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(1024, self.Dim), nn.BatchNorm1d(self.Dim), nn.Tanh()) 547 | self.fc_au_fuse_3 = nn.ModuleList([ nn.Sequential(nn.Dropout(p=0.5), nn.Linear(512,self.Dim), nn.BatchNorm1d(self.Dim), nn.Tanh()) for i in range(numOfAU) ]) # CropNet2 548 | # self.fc_au_fuse_3 = nn.ModuleList([ nn.Sequential(nn.Dropout(p=0.5),nn.Linear(128,self.Dim),nn.BatchNorm1d(self.Dim),nn.Tanh(),) for i in range(numOfAU) ]) # no CropNet2 549 | 550 | # self.fc_attention_fuse_3 = nn.Sequential(nn.Linear(self.Dim,1),) # Share Weight, no ReLU 551 | # self.fc_attention_fuse_3 = nn.ModuleList([ nn.Sequential(nn.Linear(self.Dim,1),) for i in range(numOfAU) ]) # no Share Weight, no ReLU 552 | self.fc_attention_fuse_3 = nn.Sequential(nn.Linear(self.Dim, 1), nn.ReLU(inplace=True)) # Share Weight, ReLU 553 | # self.fc_attention_fuse_3 = nn.ModuleList([ nn.Sequential(nn.Linear(self.Dim,1),nn.ReLU(inplace=True),) for i in range(numOfAU) ]) # no Share Weight, ReLU 554 | 555 | self.pred_em_fuse_3 = nn.Linear(1024 + 512, 11) # CropNet2 556 | # self.pred_em_fuse_3 = nn.Linear(1024+128, 11) # no CropNet2 557 | 558 | self.pred_em_fuse_3_attention = nn.Linear(numOfAU * self.Dim, 11) 559 | 560 | self.pred_au_fuse_3 = nn.Sequential(nn.Linear(1024 + 512, numOfAU), nn.Sigmoid()) 561 | self.pred_au_fuse_3_attention = nn.Sequential(nn.Linear(numOfAU * self.Dim, numOfAU), nn.Sigmoid()) 562 | 563 | self.pred_em_fuse_3_ShareWeight = nn.Linear(numOfAU * 512, 11) # CropNet2 564 | # self.pred_em_fuse_3_ShareWeight = nn.Linear(numOfAU * 128, 11) # no CropNet2 565 | 566 | def forward(self, input, args): 567 | 568 | if args.Experiment == 'EM': 569 | 570 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) 571 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 572 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 573 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 574 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 575 | 576 | featureMap1 = self.maxpool1(featureMap1) # 256 * 56 * 56 --> 256 * 7 * 7 577 | featureMap2 = self.maxpool2(featureMap2) # 512 * 28 * 28 --> 512 * 7 * 7 578 | featureMap3 = self.maxpool3(featureMap3) # 1024 * 14 * 14 --> 1024 * 7 * 7 579 | featureMap = 
torch.cat((torch.cat((featureMap1,featureMap2),dim=1),torch.cat((featureMap3,featureMap4),dim=1)),dim=1) # 3840 * 7 * 7 580 | 581 | featureMap = self.LRN_em(featureMap) # 3840 * 7 * 7 --> 3840 * 7 * 7 582 | featureMap = self.reduce_dim_em(featureMap) # 3840 * 7 * 7 --> 1024 * 7 * 7 583 | feature = self.GAP(featureMap) # 1024 * 7 * 7 --> 1024 * 1 * 1 584 | feature = feature.view(feature.size(0),feature.size(1)) # 1024 * 1 * 1 --> 1024 585 | pred = self.pred_em(feature) # 1024 --> 11 586 | 587 | return pred 588 | 589 | elif args.Experiment == 'AU': 590 | 591 | input, au_loc = input # input = (input, au_loc) 592 | 593 | with torch.no_grad(): 594 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) 595 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 596 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 597 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 598 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 599 | 600 | 601 | deconv_featureMap3 = self.deconv_layer1(featureMap4) # 2048 * 7 * 7 --> 1024 * 14 * 14 602 | deconv_featureMap3 = torch.cat((featureMap3,deconv_featureMap3),dim=1) # cat(1024 * 14 * 14, 1024 * 14 * 14) = 2048 * 14 * 14 603 | deconv_featureMap3 = self.reduce_dim_1_au(deconv_featureMap3) # 2048 * 14 * 14 --> 1024 * 14 * 14 604 | 605 | deconv_featureMap2 = self.deconv_layer2(deconv_featureMap3) # 1024 * 14 * 14 --> 512 * 28 * 28 606 | deconv_featureMap2 = torch.cat((featureMap2,deconv_featureMap2),dim=1) # cat(512 * 28 * 28, 512 * 28 * 28) = 1024 * 28 * 28 607 | deconv_featureMap2 = self.reduce_dim_2_au(deconv_featureMap2) # 1024 * 28 * 28 --> 512 * 28 * 28 608 | 609 | deconv_featureMap1 = self.deconv_layer3(deconv_featureMap2) # 512 * 28 * 28 --> 256 * 56 * 56 610 | deconv_featureMap1 = torch.cat((featureMap1,deconv_featureMap1),dim=1) # cat(256 * 56 * 56, 256 * 56 * 56) = 512 * 56 * 56 611 | deconv_featureMap1 = self.reduce_dim_3_au(deconv_featureMap1) # 512 * 56 * 56 --> 64 * 56 * 56 612 | 613 | deconv_featureMap = self.LRN_au(deconv_featureMap1) # 64 * 56 * 56 --> 64 * 56 * 56 614 | 615 | au_featureMap = self.crop_au_featureMap(deconv_featureMap,au_loc) # crop au feature: (2*numOfAU) * 256 616 | au_featureMap = au_featureMap.view(au_featureMap.size(0), -1) # (2*numOfAU) * 256 --> (2*numOfAU) * 256 617 | pred = self.pred_au(au_featureMap) # (2*numOfAU) * 256 --> numOfAU 618 | # pred = self.pred_au_noCropNet2(au_featureMap) # no CropNet2 619 | return pred 620 | 621 | elif args.Experiment == 'Fuse': 622 | 623 | input, au_loc = input # input = (input, au_loc) 624 | 625 | with torch.no_grad(): 626 | featureMap0 = self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(input)))) 627 | featureMap1 = self.backbone.layer1(featureMap0) # 256 * 56 * 56 628 | featureMap2 = self.backbone.layer2(featureMap1) # 512 * 28 * 28 629 | featureMap3 = self.backbone.layer3(featureMap2) # 1024 * 14 * 14 630 | featureMap4 = self.backbone.layer4(featureMap3) # 2048 * 7 * 7 631 | 632 | # AU 633 | deconv_featureMap3 = self.deconv_layer1(featureMap4) # 2048 * 7 * 7 --> 1024 * 14 * 14 634 | deconv_featureMap3 = torch.cat((featureMap3,deconv_featureMap3),dim=1) # cat(1024 * 14 * 14, 1024 * 14 * 14) = 2048 * 14 * 14 635 | deconv_featureMap3 = self.reduce_dim_1_au(deconv_featureMap3) # 2048 * 14 * 14 --> 1024 * 14 * 14 636 | 637 | deconv_featureMap2 = self.deconv_layer2(deconv_featureMap3) # 1024 * 14 * 14 --> 512 * 28 * 28 638 | deconv_featureMap2 = 
torch.cat((featureMap2,deconv_featureMap2),dim=1) # cat(512 * 28 * 28, 512 * 28 * 28) = 1024 * 28 * 28 639 | deconv_featureMap2 = self.reduce_dim_2_au(deconv_featureMap2) # 1024 * 28 * 28 --> 512 * 28 * 28 640 | 641 | deconv_featureMap1 = self.deconv_layer3(deconv_featureMap2) # 512 * 28 * 28 --> 256 * 56 * 56 642 | deconv_featureMap1 = torch.cat((featureMap1,deconv_featureMap1),dim=1) # cat(256 * 56 * 56, 256 * 56 * 56) = 512 * 56 * 56 643 | deconv_featureMap1 = self.reduce_dim_3_au(deconv_featureMap1) # 512 * 56 * 56 --> 64 * 56 * 56 644 | 645 | deconv_featureMap = self.LRN_au(deconv_featureMap1) # 64 * 56 * 56 --> 64 * 56 * 56 646 | 647 | au_featureMap = self.crop_au_featureMap(deconv_featureMap,au_loc) # crop au feature: (2*numOfAU) * 256 648 | 649 | # EM 650 | featureMap1 = self.maxpool1(featureMap1) # 256 * 56 * 56 --> 256 * 7 * 7 651 | featureMap2 = self.maxpool2(featureMap2) # 512 * 28 * 28 --> 512 * 7 * 7 652 | featureMap3 = self.maxpool3(featureMap3) # 1024 * 14 * 14 --> 1024 * 7 * 7 653 | featureMap = torch.cat((torch.cat((featureMap1,featureMap2),dim=1),torch.cat((featureMap3,featureMap4),dim=1)),dim=1) # 3840 * 7 * 7 654 | 655 | featureMap = self.LRN_em(featureMap) # 3840 * 7 * 7 --> 3840 * 7 * 7 656 | featureMap = self.reduce_dim_em(featureMap) # 3840 * 7 * 7 --> 1024 * 7 * 7 657 | feature = self.GAP(featureMap) # 1024 * 7 * 7 --> 1024 * 1 * 1 658 | feature = feature.view(feature.size(0),feature.size(1)) # 1024 * 1 * 1 --> 1024 659 | 660 | # Feature Fuse 661 | pred1 = self.pred_em(feature) 662 | 663 | # Plan 3 664 | # AU_feature = torch.cat((au_featureMap[:,0:12,:],au_featureMap[:,12:,:]),dim=2) 665 | AU_feature = au_featureMap.view(au_featureMap.size(0),2,numOfAU,au_featureMap.size(2)).transpose(1,2).contiguous().view(au_featureMap.size(0),numOfAU,-1) # numOfAU * 512 666 | AU_Dim_feature = torch.zeros((AU_feature.size(0),numOfAU,self.Dim)).cuda() # numOfAU * Dim 667 | for i in range(numOfAU): 668 | AU_Dim_feature[:,i,:] = self.fc_au_fuse_3[i](AU_feature[:,i,:]) 669 | 670 | EM_Dim_feature = self.fc_em_fuse_3(feature) # Dim 671 | 672 | EM_Dim_feature = EM_Dim_feature.view(EM_Dim_feature.size(0),1,self.Dim).repeat(1,numOfAU,1) # numOfAU * Dim 673 | 674 | Attention = EM_Dim_feature * AU_Dim_feature # numOfAU * Dim 675 | 676 | # Attention: numOfAU * Dim 677 | # pred2 = self.pred_em_fuse_3_attention(Attention.view(Attention.size(0),-1)) 678 | 679 | # return pred1, pred2 680 | 681 | Attention_Result = torch.zeros((Attention.size(0),numOfAU,1)).cuda() # numOfAU * 1 682 | for i in range(numOfAU): 683 | # Attention_Result[:,i,:] = self.fc_attention_fuse_3[i](Attention[:,i,:]) # No Share Weight FC 684 | Attention_Result[:,i,:] = self.fc_attention_fuse_3(Attention[:,i,:]) # Share Weight FC 685 | 686 | # Attention_Result = Attention.mean(2,keepdim=True) # GAP 687 | 688 | Attention_Result = nn.Sigmoid()(Attention_Result) # Sigmoid 689 | # Attention_Result = nn.Softmax(dim=1)(Attention_Result) # Softmax 690 | 691 | au_prob = Attention_Result.view(Attention_Result.size(0),numOfAU) 692 | 693 | Attention_Result = Attention_Result.repeat(1,1,512) # numOfAU * 512 694 | # Attention_Result = Attention_Result.repeat(1,1,128) # numOfAU * 128 695 | 696 | Result = Attention_Result * AU_feature # numOfAU * 512 697 | 698 | # 5.22 Concate 699 | # pred2 = self.pred_em_fuse_3( torch.cat( (feature,Result.mean(dim=1)), dim=1 ) ) 700 | # return pred1, pred2, au_prob 701 | 702 | # Share Weight and Feature Concat 703 | # Result = Result.view(Result.size(0),-1) 704 | # pred2 = 
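# Minimal sketch of the attention-weighted fusion carried out a few lines below
# (shapes only; tensor names are illustrative): per sample AU_feature is
# numOfAU x 512 and Attention_Result holds one sigmoid score per AU, so
#   >>> fused = (Attention_Result.repeat(1, 1, 512) * AU_feature).sum(dim=1)  # B x 512
#   >>> pred2 = self.pred_em_fuse_3(torch.cat((feature, fused), dim=1))       # B x (1024+512) -> 11
# i.e. the compound-expression head sees the global 1024-d feature concatenated with
# a 512-d attention-pooled summary of the AU regions.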
self.pred_em_fuse_3_ShareWeight(Result) 705 | 706 | # return pred1, pred2 707 | 708 | Result = Result.sum(dim=1) # 512 709 | 710 | Result = torch.cat((feature,Result),dim=1) 711 | 712 | # Bilinear Pooling 713 | pred2 = self.pred_em_fuse_3(Result) 714 | 715 | return pred1, pred2, au_prob 716 | 717 | def crop_au_featureMap(self,deconv_featureMap,au_location): 718 | au = [] 719 | for i in range(numOfAU): 720 | au.append(au_location[:,i,:]) 721 | 722 | batch_size = deconv_featureMap.size(0) 723 | map_ch = deconv_featureMap.size(1) 724 | map_len = deconv_featureMap.size(2) 725 | 726 | grid_ch = map_ch 727 | grid_len = int(map_len * 24 / 56) 728 | 729 | feature_list = [] 730 | for i in range(numOfAU): 731 | grid1_list = [] 732 | grid2_list = [] 733 | for j in range(batch_size): 734 | h_min_1 = au[i][j,1]-int(grid_len/2) 735 | h_max_1 = au[i][j,1]+int(grid_len/2) 736 | w_min_1 = au[i][j,0]-int(grid_len/2) 737 | w_max_1 = au[i][j,0]+int(grid_len/2) 738 | 739 | h_min_2 = au[i][j,3]-int(grid_len/2) 740 | h_max_2 = au[i][j,3]+int(grid_len/2) 741 | w_min_2 = au[i][j,2]-int(grid_len/2) 742 | w_max_2 = au[i][j,2]+int(grid_len/2) 743 | # grid_1 = deconv_featureMap[j, :, h_min_1:h_max_1, w_min_1:w_max_1] 744 | # grid_2 = deconv_featureMap[j, :, h_min_2:h_max_2, w_min_2:w_max_2] 745 | 746 | map_h_min_1 = max(0, h_min_1) 747 | map_h_max_1 = min(map_len, h_max_1) 748 | map_w_min_1 = max(0, w_min_1) 749 | map_w_max_1 = min(map_len, w_max_1) 750 | 751 | map_h_min_2 = max(0, h_min_2) 752 | map_h_max_2 = min(map_len, h_max_2) 753 | map_w_min_2 = max(0, w_min_2) 754 | map_w_max_2 = min(map_len, w_max_2) 755 | 756 | grid_h_min_1 = max(0, 0-h_min_1) 757 | grid_h_max_1 = grid_len + min(0, map_len-h_max_1) 758 | grid_w_min_1 = max(0, 0-w_min_1) 759 | grid_w_max_1 = grid_len + min(0, map_len-w_max_1) 760 | 761 | grid_h_min_2 = max(0, 0-h_min_2) 762 | grid_h_max_2 = grid_len + min(0, map_len-h_max_2) 763 | grid_w_min_2 = max(0, 0-w_min_2) 764 | grid_w_max_2 = grid_len + min(0, map_len-w_max_2) 765 | 766 | grid_1 = torch.zeros(grid_ch, grid_len, grid_len) 767 | grid_2 = torch.zeros(grid_ch, grid_len, grid_len) 768 | grid_1 = grid_1.cuda() 769 | grid_2 = grid_2.cuda() 770 | 771 | grid_1[:, grid_h_min_1:grid_h_max_1, grid_w_min_1:grid_w_max_1] = deconv_featureMap[j, :, map_h_min_1:map_h_max_1, map_w_min_1:map_w_max_1] 772 | grid_2[:, grid_h_min_2:grid_h_max_2, grid_w_min_2:grid_w_max_2] = deconv_featureMap[j, :, map_h_min_2:map_h_max_2, map_w_min_2:map_w_max_2] 773 | 774 | grid1_list.append(grid_1) 775 | grid2_list.append(grid_2) 776 | 777 | input1 = torch.stack(grid1_list, dim=0) 778 | input2 = torch.stack(grid2_list, dim=0) 779 | feature_list.append(input1) 780 | feature_list.append(input2) 781 | 782 | # feature list: (12 * 2) * batch * 1024 * 3 * 3 783 | output_list = [] 784 | # Feed into crop net individually 785 | for i in range(numOfAU * 2): 786 | output = self.Crop_Net_1[i](feature_list[i]) 787 | # output = self.GAP(output) 788 | output = output.view(batch_size, -1) 789 | output = self.Crop_Net_2[i](output) 790 | output_list.append(output) 791 | 792 | au_feature = torch.stack(output_list, dim=1) # batch * 24 * 256 793 | # au_feature = torch.cat(output_list, 1) # batch * (12 * 2 * 150) 794 | # au_feature = au_feature.view(batch_size, -1, self.feature_dim_au) 795 | 796 | return au_feature --------------------------------------------------------------------------------
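# Usage sketch for the pieces above (illustrative only: `imgs`, `labels`, `landmarks`,
# `bboxs` are lists prepared by the data-loading code, `args` is any object with an
# `Experiment` attribute, and Dim=64 is an arbitrary choice, not necessarily the value
# used in the paper's experiments):
#   >>> import torchvision.transforms as transforms
#   >>> transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
#   >>> dataset = MyDataset_EM(imgs, labels, landmarks, bboxs, flag='train',
#   ...                        needAU=True, Model='ResNet-101', transform=transform)
#   >>> (img, path), au_loc, label = dataset[0]      # au_loc: LongTensor of AU centre pairs
#   >>> model = ResNet101(Dim=64).cuda()
#   >>> pred_em = model(img.unsqueeze(0).cuda(), args)   # args.Experiment == 'EM': 7-way logits
# For the 'AU' and 'Fuse' experiments the model instead receives the tuple
# (image_batch, au_location_batch) as its first argument, mirroring the dataset output.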