├── CocoDetection.py
├── README.md
├── RMA_recognition.py
├── RMA_server.py
├── coco-label.txt
├── main.py
├── models
│   ├── RMA_module.py
│   ├── RMA_module_with_priori.py
│   ├── loss.py
│   └── loss_with_priori.py
├── priori_main.py
├── priori_main_tencrop.py
├── utils.py
├── visualize.py
├── visualizeImg.py
├── visualizeImg_test0712.py
└── visualize_test0712.py

/CocoDetection.py:
--------------------------------------------------------------------------------
import torch.utils.data as data
from PIL import Image
import os
import os.path
import torchvision.transforms as transforms
import numpy as np
import copy

class CocoDetection(data.Dataset):
    """MS COCO Detection Dataset.

    Args:
        root (string): Root directory where images are downloaded to.
        annFile (string): Path to the JSON annotation file.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version, e.g. ``transforms.ToTensor``.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """

    def __init__(self, root, annFile, transform=None, target_transform=None):
        from pycocotools.coco import COCO
        self.root = root
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: Tuple (image, target, original_image). target is the object
                returned by ``coco.loadAnns``.
        """
        coco = self.coco
        img_id = self.ids[index]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        target = coco.loadAnns(ann_ids)

        path = coco.loadImgs(img_id)[0]['file_name']

        original_img = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            original_img = self.transform(original_img)
        # keep an un-normalized copy of the image around for visualization
        img = copy.deepcopy(original_img)
        img = transforms.Compose([
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])])(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target, original_img

    def __len__(self):
        return len(self.ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
--------------------------------------------------------------------------------
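For reference, a minimal sketch (not from this repo) of consuming this dataset class directly; the paths below are placeholders for a local COCO copy:

```python
# Sketch, not part of the repo: using CocoDetection stand-alone.
import torchvision.transforms as transforms
from CocoDetection import CocoDetection

val_transforms = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),  # normalization happens inside __getitem__
])

dataset = CocoDetection(root='../dataset/val2017/',
                        annFile='../dataset/annotations/instances_val2017.json',
                        transform=val_transforms)

# Without a target_transform, target is the raw list of annotation dicts;
# batching with a DataLoader needs a target_transform that returns a
# fixed-size tensor (the training scripts use utils.get_target_transform).
img, target, original_img = dataset[0]
print(img.shape)  # torch.Size([3, 512, 512])
```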
/README.md:
--------------------------------------------------------------------------------
# AttentionImageClass
Multi-label Image Recognition by Recurrently Discovering Attentional Regions (PyTorch implementation)

## Training

```bash
python main.py
```

## Running the server for recognition

```bash
python RMA_server.py
```
--------------------------------------------------------------------------------
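The server (see RMA_server.py further below) exposes a single POST endpoint. A hedged sketch of a client call, assuming the server runs locally on the default port; the image path is a placeholder:

```python
# Sketch, not part of the repo: POSTing an image to the running server.
# The endpoint and the "img" field name match RMA_server.py.
import requests

with open('test.jpg', 'rb') as f:
    resp = requests.post('http://localhost:8080/api/recognition',
                         files={'img': f})
print(resp.json())  # {"rtn": 200, "msg": "...", "data": {"data": [...labels...]}}
```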
/RMA_recognition.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import transforms as transforms
import numpy as np
import visdom
import torch.nn.functional as F
import PIL.Image as Image

import argparse
import os

from models.RMA_module_with_priori import RMA_module
from models.loss_with_priori import loss_function
from utils import get_target_transform as target_trans
from utils import id2label

# GPU setting
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")

# ==================================================================
# Constants
# ==================================================================
EPOCH = 45                  # number of passes through the dataset
BATCH_SIZE = 8              # number of images in each mini-batch
N = 512                     # size of input images (512 or 640)
TOPK = 3                    # top k highest-ranked labels
GPU_IN_USE = torch.cuda.is_available()  # whether a GPU is available
PATH_MODEL_PARAMS = './params/params_with_priori.pkl'

# ==================================================================
# Parser Initialization
# ==================================================================
parser = argparse.ArgumentParser(description='PyTorch Implementation of ICCV2017_AttentionImageClass')
parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size')
parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters')
parser.add_argument('--loadModel', default=True, type=bool, help='load model parameters')
args = parser.parse_args()

# ==================================================================
# Transforms for the Input Images
# ==================================================================
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
# named `preprocess` so it does not shadow the imported `transforms` module
preprocess = transforms.Compose([
    transforms.Resize((N, N)),
    transforms.ToTensor(),
    normalize
])


class RMA_model(object):
    # Load the model in the constructor (analogous to building a graph and
    # session in other frameworks).
    def __init__(self):
        # prepare model
        print('\n***** Prepare Model *****')
        vgg16 = torchvision.models.vgg16(pretrained=True)
        self.extract_features = vgg16.features
        self.RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096)
        if args.loadModel:
            self.RMA.load_state_dict(torch.load(args.pathModelParams))
        if GPU_IN_USE:
            print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
            print('cuda: move all model parameters and buffers to the GPU')
            self.extract_features.cuda()
            self.RMA.cuda()
            cudnn.benchmark = True
        print('Model Preparation : Finished')

    # Test
    def evaluate(self, data):
        print('evaluate:')
        self.RMA.eval()  # set the module in evaluation mode
        data = preprocess(data).unsqueeze(0)
        if GPU_IN_USE:
            data = data.cuda()  # set up a GPU tensor

        f_I = self.extract_features(data)
        output, _ = self.RMA(f_I)
        prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
        # keep only the labels whose predicted probability is at least 0.1
        mask = prediction[0].ge(0.1)
        category_id = torch.mul(prediction[1] + 1, mask.type(torch.cuda.LongTensor))
        print(prediction[0])
        return id2label(category_id)[0].tolist()

    # Public API: recognize the image at `image_path` and return its labels.
    def image_recognition(self, image_path):
        print('image path: ', image_path)
        image = Image.open(image_path)
        if not image.mode == 'RGB':
            image = image.convert('RGB')
        with torch.no_grad():
            label = self.evaluate(image)
        print(label)
        return dict(
            data=label
        )

    def __del__(self):
        print("delete!")

# Create the model instance here so the server can import and call it.
print("Creating the RMA model instance .................")
RMA_model_instance = RMA_model()
print("RMA model instance created ...............")
--------------------------------------------------------------------------------
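The decoding step in `evaluate` above (top-10 softmax scores, thresholded at 0.1, shifted by +1 so that 0 can mean "no label") is compact but easy to misread. A self-contained sketch of the same scheme on a hand-made score vector:

```python
# Sketch, not part of the repo: the label-decoding scheme used in evaluate().
import torch
import torch.nn.functional as F

output = torch.zeros(1, 80)   # fake score vector for 80 classes
output[0, 7] = 5.             # pretend class 8 (1-based) is confidently present
probs, idx = torch.topk(F.softmax(output, dim=1), 10, dim=1)
mask = probs.ge(0.1)          # keep confident labels only
# +1 shifts to 1-based category ids so masked-out entries become 0 ("no label")
category_id = torch.mul(idx + 1, mask.long())
print(category_id)            # tensor([[8, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
```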
/RMA_server.py:
--------------------------------------------------------------------------------
# coding: utf-8
# Built on the Tornado framework; any other web framework would work as well.
# English docs (v5.0): http://www.tornadoweb.org/en/stable/
# Chinese docs (v4.3): http://tornado-zh.readthedocs.io/zh/latest/
import json
import os

import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options
from tornado.escape import json_decode, json_encode
from tornado.concurrent import Future

# import the model instance
from RMA_recognition import RMA_model_instance

# default port
define("port", default=8080, help="run on the given port", type=int)

# Service layer: responsible for calling the model API.
class RecognitionService(object):
    def upload_image(self, file_metas):
        file_path = None
        if file_metas:
            for meta in file_metas:
                # uploaded images are saved under {current directory}/realimg
                upload_path = os.path.join(os.path.dirname(__file__), "realimg")

                filename = meta['filename']
                file_path = os.path.join(upload_path, filename)

                with open(file_path, 'wb') as f:
                    f.write(meta['body'])

        return file_path

    def recognition_model_run(self, image_path):
        res = dict(
            rtn = 0,
            msg = "",
            data = {}
        )
        # call the model API
        try:
            data = RMA_model_instance.image_recognition(image_path)
            res = dict(
                rtn = 200,
                msg = "success",
                data = data
            )
        except Exception as e:
            res["rtn"] = 500
            res["msg"] = str(e)

        return res


# Synchronous handler example
class SyncHandler(tornado.web.RequestHandler):
    def initialize(self):
        self.recognition_service = RecognitionService()

    def post(self):
        file_metas = self.request.files.get("img")

        file_path = self.recognition_service.upload_image(file_metas)

        res = self.recognition_service.recognition_model_run(file_path)

        self.set_status(res.get("rtn"))
        self.set_header("Content-Type", "application/json")
        self.write(json_encode(res))

        self.finish()


if __name__ == "__main__":
    tornado.options.parse_command_line()
    # match URLs with regular expressions
    app = tornado.web.Application(handlers=[
        (r"/api/recognition", SyncHandler)
    ])
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    tornado.ioloop.IOLoop.instance().start()

# POST http://localhost:8080/api/recognition
--------------------------------------------------------------------------------

/coco-label.txt:
--------------------------------------------------------------------------------
1,1,person
2,2,bicycle
3,3,car
4,4,motorcycle
5,5,airplane
6,6,bus
7,7,train
8,8,truck
9,9,boat
10,10,traffic light
11,11,fire hydrant
13,12,stop sign
14,13,parking meter
15,14,bench
16,15,bird
17,16,cat
18,17,dog
19,18,horse
20,19,sheep
21,20,cow
22,21,elephant
23,22,bear
24,23,zebra
25,24,giraffe
27,25,backpack
28,26,umbrella
31,27,handbag
32,28,tie
33,29,suitcase
34,30,frisbee
35,31,skis
36,32,snowboard
37,33,sports ball
38,34,kite
39,35,baseball bat
40,36,baseball glove
41,37,skateboard
42,38,surfboard
43,39,tennis racket
44,40,bottle
46,41,wine glass
47,42,cup
48,43,fork
49,44,knife
50,45,spoon
51,46,bowl
52,47,banana
53,48,apple
54,49,sandwich
55,50,orange
56,51,broccoli
57,52,carrot
58,53,hot dog
59,54,pizza
60,55,donut
61,56,cake
62,57,chair
63,58,couch
64,59,potted plant
65,60,bed
67,61,dining table
70,62,toilet
72,63,tv
73,64,laptop
74,65,mouse
75,66,remote
76,67,keyboard
77,68,cell phone
78,69,microwave
79,70,oven
80,71,toaster
81,72,sink
82,73,refrigerator
84,74,book
85,75,clock
86,76,vase
87,77,scissors
88,78,teddy bear
89,79,hair drier
90,80,toothbrush
--------------------------------------------------------------------------------
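Each line of coco-label.txt is `original_coco_id,contiguous_id,name`; the original COCO ids skip some numbers, so a contiguous 1..80 index is needed. utils.py is not included in this dump, so here is a plausible sketch of the lookup tables it would build:

```python
# Sketch, not part of the repo: parsing coco-label.txt into lookup tables.
# utils.id2label is not shown in this dump; this is an assumed helper.
def load_coco_labels(path='coco-label.txt'):
    contiguous_to_name = {}
    coco_to_contiguous = {}
    with open(path) as f:
        for line in f:
            coco_id, contiguous_id, name = line.strip().split(',', 2)
            contiguous_to_name[int(contiguous_id)] = name
            coco_to_contiguous[int(coco_id)] = int(contiguous_id)
    return contiguous_to_name, coco_to_contiguous

names, remap = load_coco_labels()
print(names[1], names[80])  # person toothbrush
```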
/main.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import transforms as transforms
import numpy as np
import visdom
import torch.nn.functional as F

import argparse
import os

from models.RMA_module import RMA_module
from models.loss import loss_function
from utils import get_target_transform as target_trans

# data visualization
vis = visdom.Visdom(env='baseline(no priori)')
# GPU setting
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")

# ==================================================================
# Constants
# ==================================================================
EPOCH = 45                  # number of passes through the training set
BATCH_SIZE = 16             # number of images in each mini-batch
LEARNING_RATE = 1e-5        # default learning rate
WEIGHT_DECAY = 0            # default weight decay
N = 512                     # size of input images (512 or 640)
MOMENTUM = (0.9, 0.999)     # beta parameters of the Adam optimizer
TOPK = 3                    # top k highest-ranked labels
GPU_IN_USE = torch.cuda.is_available()  # whether a GPU is available
DIR_TRAIN_IMAGES   = '../dataset/train2017/'
DIR_TEST_IMAGES    = '../dataset/val2017/'
PATH_TRAIN_ANNFILE = '../dataset/annotations/instances_train2017.json'
PATH_TEST_ANNFILE  = '../dataset/annotations/instances_val2017.json'
PATH_MODEL_PARAMS  = './params/params_no_priori.pkl'
NUM_CATEGORIES = 80
LOSS_OUTPUT_INTERVAL = 100

# ==================================================================
# Global Variables
# ==================================================================
# one iteration means one mini-batch finishes a forward-backward process
current_training_iteration = torch.tensor([1])
current_test_iteration = torch.tensor([1])
loss_graph_window = 'loss graph'
test_f1_graph_window = 'test OF1 and CF1 graph'
evaluation_window = 'six evaluation metrics'
#category_id_window = 'category ids of prediction and ground-truth'
of1 = 0.
cf1 = 0.

# ==================================================================
# Parser Initialization
# ==================================================================
parser = argparse.ArgumentParser(description='PyTorch Implementation of ICCV2017_AttentionImageClass')
parser.add_argument('--lr', default=LEARNING_RATE, type=float, help='learning rate')
parser.add_argument('--epoch', default=EPOCH, type=int, help='number of epochs')
parser.add_argument('--trainBatchSize', default=BATCH_SIZE, type=int, help='training batch size')
parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size')
parser.add_argument('--weightDecay', default=WEIGHT_DECAY, type=float, help='weight decay')
parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters')
parser.add_argument('--saveModel', default=True, type=bool, help='save model parameters')
parser.add_argument('--loadModel', default=False, type=bool, help='load model parameters')
args = parser.parse_args()


# ==================================================================
# Prepare Dataset (training & test)
# ==================================================================
print('***** Prepare Data ******')

# transforms of the training dataset
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # default value is 0.5
    transforms.Resize((N, N)),
    transforms.ToTensor(),
    normalize
])

# transforms of the test dataset
test_transforms = transforms.Compose([
    transforms.Resize((N, N)),
    transforms.ToTensor(),
    normalize
])

train_dataset = torchvision.datasets.CocoDetection(root=DIR_TRAIN_IMAGES, annFile=PATH_TRAIN_ANNFILE,
                                                   transform=train_transforms, target_transform=target_trans)
test_dataset = torchvision.datasets.CocoDetection(root=DIR_TEST_IMAGES, annFile=PATH_TEST_ANNFILE,
                                                  transform=test_transforms, target_transform=target_trans)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.trainBatchSize, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.testBatchSize, shuffle=False, num_workers=2)
print('Data Preparation : Finished')
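# ------------------------------------------------------------------
# Note: utils.get_target_transform is imported above, but utils.py is
# not part of this dump. Inferred from how `target` is used below (a
# multi-hot float vector over the 80 contiguous categories), a plausible
# sketch of it would be:
#
#     def get_target_transform(anns):
#         label = torch.zeros(80)
#         for ann in anns:
#             label[coco_to_contiguous[ann['category_id']] - 1] = 1.
#         return label
#
# where coco_to_contiguous is built from coco-label.txt as sketched
# earlier. This is an assumption, not the repo's actual implementation.
# ------------------------------------------------------------------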

# ==================================================================
# Prepare Model
# ==================================================================
print('\n***** Prepare Model *****')

vgg16 = torchvision.models.vgg16(pretrained=True)

# freeze the VGG-16 backbone; only the RMA module is trained
for param in vgg16.features.parameters():
    param.requires_grad = False

extract_features = vgg16.features

RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096)
if args.loadModel:
    RMA.load_state_dict(torch.load(args.pathModelParams))

if GPU_IN_USE:
    print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
    print('cuda: move all model parameters and buffers to the GPU')
    extract_features.cuda()
    RMA.cuda()
    cudnn.benchmark = True

# Adam optimization
optimizer = optim.Adam(RMA.parameters(), lr=args.lr, weight_decay=args.weightDecay, betas=MOMENTUM)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], gamma=0.5)  # lr decay
print('Model Preparation : Finished')
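# ------------------------------------------------------------------
# For orientation: with N=512, the frozen VGG-16 backbone downsamples
# by a factor of 32, so extract_features returns a [batch, 512, 16, 16]
# map; RMA_module then resamples each attended region to 14x14 with its
# spatial transformer. A quick check (sketch, not part of the script):
#
#     f_I = extract_features(torch.zeros(1, 3, 512, 512).cuda())
#     print(f_I.shape)  # torch.Size([1, 512, 16, 16])
# ------------------------------------------------------------------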

# Train
# ================================================================================
# data:       [torch.cuda.FloatTensor of size [batch_size, 3, N, N]  N=512/640]
# target:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# output:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# prediction: [
#              [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
#              [torch.cuda.LongTensor of size [batch_size, TOPK]  (index of TOPK)]
#             ]
# ================================================================================
def train():
    print('train:')
    RMA.train()      # set the module in training mode
    train_loss = 0.  # sum of train loss up to the current batch

    global current_training_iteration

    sum_prediction_label = torch.zeros(1, 80) + 1e-6
    sum_correct_prediction_label = torch.zeros(1, 80)
    sum_ground_truth_label = torch.zeros(1, 80)

    for batch_num, (data, target) in enumerate(train_loader):
        if target.sum() == 0:
            continue
        # drop samples without any label; compute the indices once so that
        # data and target stay aligned
        keep = torch.nonzero(target.sum(dim=1)).view(-1)
        target = target.index_select(0, keep)
        data = data.index_select(0, keep)

        if GPU_IN_USE:
            data, target = data.cuda(), target.cuda()

        # -----forward-----
        optimizer.zero_grad()
        f_I = extract_features(data)
        output, M = RMA(f_I)
        # ---end forward---

        # ---calculate loss and backward---
        loss = loss_function(output, target, M, add_constraint=True)
        loss.backward()
        optimizer.step()
        # ----------end backward-----------

        train_loss += loss.item()  # detach from the graph before accumulating
        prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
        # keep only the labels whose predicted probability is at least 0.1
        mask = prediction[0].ge(0.1)
        prediction_index = torch.mul(prediction[1]+1, mask.type(torch.cuda.LongTensor))
        extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
        prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
        correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)

        # accumulate the label statistics
        sum_prediction_label += prediction_label.sum(dim=0)
        sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
        sum_ground_truth_label += target.cpu().sum(dim=0)

        if batch_num % LOSS_OUTPUT_INTERVAL == 0:
            # visualization: draw the train loss graph
            vis.line(
                X=current_training_iteration,
                Y=torch.tensor([train_loss]) / (batch_num+1),
                win=loss_graph_window,
                name='train loss',
                update=None if current_training_iteration == 1 else 'append',
                opts=dict(xlabel='iteration', ylabel='loss', showlegend=True)
            )
            print('loss %.3f (batch %d)' % (train_loss/(batch_num+1), batch_num+1))
            current_training_iteration += LOSS_OUTPUT_INTERVAL

    # evaluation metrics
    o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
    o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
    of1 = torch.div(2 * o_p * o_r, o_p + o_r)
    c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
    c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
    cf1 = torch.div(2 * c_p * c_r, c_p + c_r)

    return c_p, c_r, cf1, o_p, o_r, of1
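# ------------------------------------------------------------------
# The six metrics returned by train() above and test() below, written
# out for per-class counts tp[c] (correct), pred[c] (predicted) and
# gt[c] (ground truth) over the 80 categories:
#
#     OP  = tp.sum() / pred.sum()        OR  = tp.sum() / gt.sum()
#     CP  = mean_c(tp[c] / pred[c])      CR  = mean_c(tp[c] / gt[c])
#     OF1 = 2*OP*OR / (OP + OR)          CF1 = 2*CP*CR / (CP + CR)
#
# The 1e-6 added to sum_prediction_label guards the per-class division
# against classes that were never predicted.
# ------------------------------------------------------------------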

# Test
# ================================================================================
# data:       [torch.cuda.FloatTensor of size [batch_size, 3, N, N]  N=512/640]
# target:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# output:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# prediction: [
#              [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
#              [torch.cuda.LongTensor of size [batch_size, TOPK]  (index of TOPK)]
#             ]
# ================================================================================
def test():
    print('test:')
    RMA.eval()      # set the module in evaluation mode
    test_loss = 0.  # sum of test loss up to the current batch

    global current_test_iteration

    sum_prediction_label = torch.zeros(1, 80) + 1e-6
    sum_correct_prediction_label = torch.zeros(1, 80)
    sum_ground_truth_label = torch.zeros(1, 80)

    for batch_num, (data, target) in enumerate(test_loader):
        if target.sum() == 0:
            continue
        # drop samples without any label; compute the indices once so that
        # data and target stay aligned
        keep = torch.nonzero(target.sum(dim=1)).view(-1)
        target = target.index_select(0, keep)
        data = data.index_select(0, keep)

        if GPU_IN_USE:
            data, target = data.cuda(), target.cuda()  # set up GPU tensors

        f_I = extract_features(data)
        output, M = RMA(f_I)
        loss = loss_function(output, target, M, add_constraint=True)

        test_loss += loss.item()
        prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
        # keep only the labels whose predicted probability is at least 0.1
        mask = prediction[0].ge(0.1)
        prediction_index = torch.mul(prediction[1]+1, mask.type(torch.cuda.LongTensor))
        extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
        prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
        correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)

        # accumulate the label statistics
        sum_prediction_label += prediction_label.sum(dim=0)
        sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
        sum_ground_truth_label += target.cpu().sum(dim=0)

        if batch_num % LOSS_OUTPUT_INTERVAL == 0:
            # visualization: draw the test loss graph
            vis.line(
                X=current_test_iteration,
                Y=torch.tensor([test_loss]) / (batch_num+1),
                win=loss_graph_window,
                name='test loss',
                update='insert' if current_test_iteration == 1 else 'append',
                opts=dict(showlegend=True),
            )
            print('loss %.3f (batch %d)' % (test_loss / (batch_num+1), batch_num+1))
            current_test_iteration += LOSS_OUTPUT_INTERVAL

    # evaluation metrics
    o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
    o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
    of1 = torch.div(2 * o_p * o_r, o_p + o_r)
    c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
    c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
    cf1 = torch.div(2 * c_p * c_r, c_p + c_r)

    return c_p, c_r, cf1, o_p, o_r, of1


# ==================================================================
# Save Model
# ==================================================================
def save():
    torch.save(RMA.state_dict(), args.pathModelParams)
    print('Checkpoint saved to {}'.format(args.pathModelParams))


# ==================================================================
# Main Loop
# ==================================================================
for current_epoch in range(1, args.epoch + 1):
    print('\n===> epoch: %d/%d' % (current_epoch, args.epoch))
    train_cp, train_cr, train_cf1, train_op, train_or, train_of1 = train()
    with torch.no_grad():
        test_cp, test_cr, test_cf1, test_op, test_or, test_of1 = test()

    evaluation_metrics = '''
<br/>
    ===> epoch: %d/%d<br/>
    -------------------------------------------------------------<br/>
    |  CP   |  CR   |  CF1  |  OP   |  OR   |  OF1  |<br/>
    -------------------------------------------------------------<br/>
    | %.3f | %.3f | %.3f | %.3f | %.3f | %.3f |<br/>
    -------------------------------------------------------------<br/>
    ''' % (current_epoch, args.epoch, test_cp, test_cr, test_cf1, test_op, test_or, test_of1)

    # visualization
    vis.line(
        X=torch.tensor([current_epoch]),
        Y=torch.tensor([test_cf1]),
        name='test_CF1',
        win=test_f1_graph_window,
        update=None if current_epoch == 1 else 'append',
        opts=dict(xlabel='epoch', ylabel='F1', showlegend=True, title='Evaluation of Test (CF1 / OF1)')
    )
    vis.line(
        X=torch.tensor([current_epoch]),
        Y=torch.tensor([test_of1]),
        name='test_OF1',
        win=test_f1_graph_window,
        update='insert' if current_epoch == 1 else 'append',
        opts=dict(showlegend=True)
    )
    vis.text(
        evaluation_metrics,
        win=evaluation_window,
        append=False if current_epoch == 1 else True
    )

    # save the checkpoint whenever both OF1 and CF1 improve
    if test_of1 > of1 and test_cf1 > cf1:
        if args.saveModel:
            save()
        of1 = test_of1
        cf1 = test_cf1

    if current_epoch == args.epoch:
        print('===> BEST PERFORMANCE (OF1/CF1): %.3f / %.3f' % (of1, cf1))
--------------------------------------------------------------------------------
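models/RMA_module.py below localizes attentional regions with a spatial transformer (`F.affine_grid` plus `F.grid_sample`). A minimal standalone sketch of that mechanism, independent of the module:

```python
# Sketch, not part of the repo: the spatial-transformer sampling used in
# RMA_module.ST. theta is a batch of 2x3 affine matrices; the identity
# theta just resamples the input to the requested output size.
import torch
import torch.nn.functional as F

x = torch.randn(1, 512, 16, 16)                       # a VGG-16 feature map
theta = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]])  # identity transform
grid = F.affine_grid(theta, torch.Size((1, 512, 14, 14)))
region = F.grid_sample(x, grid)                       # bilinear by default
print(region.shape)  # torch.Size([1, 512, 14, 14])
```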
/models/RMA_module.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
from torch import tensor
import torch

COCO_CATEGORIES = 80

'''
Recurrent Memorized-Attention Module
==============================================================================================
@Parameters:
    lstm_input_size  : number of expected features in the input x of the LSTM
    lstm_hidden_size : number of features in the hidden state of the LSTM
    zk_size          : size of z_k (about z_k, see 'Update rule of M' in the paper)
    num_iterations   : number of iterations in the RMA module (default: 5)
    num_classes      : number of classes/categories (default: 80, using the COCO dataset)
    use_gpu          : whether to use the GPU (default: True)
@Input:
    f_I : feature map (torch.cuda.FloatTensor[batch_size, num_channels, height, width])
@Output:
    fused_scores : final fused score vectors (torch.cuda.FloatTensor[batch_size, num_classes])
    M            : transformation matrices used by the spatial transformer
==============================================================================================
'''
class RMA_module(nn.Module):
    def __init__(self, lstm_input_size, lstm_hidden_size, zk_size,
                 num_iterations=5, num_classes=COCO_CATEGORIES,
                 use_gpu=True):

        super(RMA_module, self).__init__()

        self.K = num_iterations
        self.C = num_classes
        self.use_gpu = use_gpu
        self.input_size = lstm_input_size
        self.hidden_size = lstm_hidden_size

        self.pooling = nn.MaxPool2d(kernel_size=2)
        # integer division: the max pooling halves each spatial dimension
        self.fc = nn.Linear(lstm_input_size * lstm_input_size // 4 * 512, 4096)
        self.lstm = nn.LSTMCell(4096, lstm_hidden_size)

        self.get_zk = nn.Sequential(
            # channels of the output feature map in vgg16 = 512
            nn.Linear(lstm_hidden_size, zk_size),
            nn.ReLU(inplace=True)
        )
        self.get_score = nn.Linear(zk_size, num_classes)
        self.update_m = nn.Linear(zk_size, 6)
        # initialize the transform predictor to the identity transform
        self.update_m.weight.data = torch.zeros(6, zk_size)
        self.update_m.bias.data = tensor([1., 0., 0., 0., 1., 0.])

    # ST: spatial transformer network forward function
    # ================================================
    def ST(self, x, theta):
        # determine the output size of the STN
        num_channels = x.size()[1]
        batch_size = x.size()[0]
        output_size = torch.Size((batch_size, num_channels, self.input_size, self.input_size))

        grid = F.affine_grid(theta, output_size)
        if self.use_gpu:
            grid = grid.cuda()
        # use bilinear interpolation (the default) to sample the input pixels
        x = F.grid_sample(x, grid)
        return x

    # init_hidden: initialize the (h0, c0) of the LSTM
    # ================================================
    def init_hidden(self, N):
        if self.use_gpu:
            h0 = torch.zeros(N, self.hidden_size).cuda()
            c0 = torch.zeros(N, self.hidden_size).cuda()
        else:
            h0 = torch.zeros(N, self.hidden_size)
            c0 = torch.zeros(N, self.hidden_size)
        return (h0, c0)

    # RMA module forward function
    # ===========================
    def forward(self, f_I, return_whole_scores=False):
        # initialization
        batch_size = f_I.size()[0]
        hidden = self.init_hidden(batch_size)
        if self.use_gpu:
            scores = torch.randn(self.K, batch_size, self.C).cuda()
        else:
            scores = torch.randn(self.K, batch_size, self.C)
        M = torch.randn(self.K+1, batch_size, 2, 3)
        M[0] = tensor([[1., 0., 0.], [0., 1., 0.]])

        # for each iteration
        for k in range(0, self.K+1):
            # locate an attentional region
            f_k = self.ST(f_I, M[k])

            # reduce the dimension to lower the GPU memory requirement
            f_k = self.pooling(f_k)
            f_k = self.fc(f_k.view(batch_size, -1))

            # predict the scores for this region
            hidden = self.lstm(f_k, hidden)

            # get z_k for further calculating M and the scores
            z_k = self.get_zk(hidden[0])

            if k != 0:
                # obtain the score vector of the current iteration
                scores[k-1] = self.get_score(z_k)

            if k != self.K:
                # update the transformation matrix for the next iteration;
                # zero the off-diagonal terms so only scaling and translation remain
                M[k+1] = self.update_m(z_k).view(batch_size, 2, 3)
                M[k+1, :, 0, 1] = tensor(0.)
                M[k+1, :, 1, 0] = tensor(0.)

        # max pooling over iterations to obtain the final fused scores
        fused_scores = scores.max(0)

        if return_whole_scores:
            return fused_scores[0], M[1:, :, :, :], scores
        else:
            return fused_scores[0], M
--------------------------------------------------------------------------------
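A CPU smoke test of the module's shapes (a sketch with `use_gpu=False`, so it runs without a GPU). With `lstm_input_size=14`, the flattened input to `fc` is 512 * 7 * 7 = 25088:

```python
# Sketch, not part of the repo: checking RMA_module's input/output shapes.
import torch
from models.RMA_module import RMA_module

rma = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096,
                 use_gpu=False)
f_I = torch.randn(2, 512, 16, 16)  # fake VGG-16 feature map, batch of 2
fused_scores, M = rma(f_I)
print(fused_scores.shape)  # torch.Size([2, 80])
print(M.shape)             # torch.Size([6, 2, 2, 3]) -- K+1 transforms per sample
```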
/models/RMA_module_with_priori.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
from torch import tensor
import torch

COCO_CATEGORIES = 80

'''
Recurrent Memorized-Attention Module (with translation priors)
==============================================================================================
@Parameters:
    lstm_input_size  : number of expected features in the input x of the LSTM
    lstm_hidden_size : number of features in the hidden state of the LSTM
    zk_size          : size of z_k (about z_k, see 'Update rule of M' in the paper)
    num_iterations   : number of iterations in the RMA module (default: 5)
    num_classes      : number of classes/categories (default: 80, using the COCO dataset)
    use_gpu          : whether to use the GPU (default: True)
@Input:
    f_I : feature map (torch.cuda.FloatTensor[batch_size, num_channels, height, width])
@Output:
    fused_scores : final fused score vectors (torch.cuda.FloatTensor[batch_size, num_classes])
    M            : transformation matrices used by the spatial transformer
==============================================================================================
'''
class RMA_module(nn.Module):
    def __init__(self, lstm_input_size, lstm_hidden_size, zk_size,
                 num_iterations=5, num_classes=COCO_CATEGORIES,
                 use_gpu=True):

        super(RMA_module, self).__init__()

        self.K = num_iterations
        self.C = num_classes
        self.use_gpu = use_gpu
        self.input_size = lstm_input_size
        self.hidden_size = lstm_hidden_size

        # priori translation offsets: the centers of the four image quadrants
        self.cx = tensor([0.5, 0.5, -0.5, -0.5]).view(4, -1)
        self.cy = tensor([0.5, -0.5, 0.5, -0.5]).view(4, -1)

        self.pooling = nn.MaxPool2d(kernel_size=2)
        # integer division: the max pooling halves each spatial dimension
        self.fc = nn.Linear(lstm_input_size * lstm_input_size // 4 * 512, 4096)
        self.lstm = nn.LSTMCell(4096, lstm_hidden_size)

        self.get_zk = nn.Sequential(
            # channels of the output feature map in vgg16 = 512
            nn.Linear(lstm_hidden_size, zk_size),
            nn.ReLU(inplace=True)
        )
        self.get_score = nn.Linear(zk_size, num_classes)
        self.update_m = nn.Linear(zk_size, 6)
        # initialize the transform predictor to the identity transform
        self.update_m.weight.data = torch.zeros(6, zk_size)
        self.update_m.bias.data = tensor([1., 0., 0., 0., 1., 0.])

    # ST: spatial transformer network forward function
    # ================================================
    def ST(self, x, theta, k):
        # determine the output size of the STN
        num_channels = x.size()[1]
        batch_size = x.size()[0]
        output_size = torch.Size((batch_size, num_channels, self.input_size, self.input_size))

        # from the second scoring iteration on, shift the predicted translation
        # towards the center of one of the four quadrants (the priori)
        if k > 1:
            theta[:, 0, 2] = theta[:, 0, 2] + self.cx[k-2]
            theta[:, 1, 2] = theta[:, 1, 2] + self.cy[k-2]

        grid = F.affine_grid(theta, output_size)
        if self.use_gpu:
            grid = grid.cuda()
        # use bilinear interpolation (the default) to sample the input pixels
        x = F.grid_sample(x, grid)
        return x, theta

    # init_hidden: initialize the (h0, c0) of the LSTM
    # ================================================
    def init_hidden(self, N):
        if self.use_gpu:
            h0 = torch.zeros(N, self.hidden_size).cuda()
            c0 = torch.zeros(N, self.hidden_size).cuda()
        else:
            h0 = torch.zeros(N, self.hidden_size)
            c0 = torch.zeros(N, self.hidden_size)
        return (h0, c0)

    # RMA module forward function
    # ===========================
    def forward(self, f_I, return_whole_scores=False):
        # initialization
        batch_size = f_I.size()[0]
        hidden = self.init_hidden(batch_size)
        if self.use_gpu:
            scores = torch.randn(self.K, batch_size, self.C).cuda()
        else:
            scores = torch.randn(self.K, batch_size, self.C)
        M = torch.randn(self.K+1, batch_size, 2, 3)
        M[0] = tensor([[1., 0., 0.], [0., 1., 0.]])
        M_for_visual = torch.randn(self.K, batch_size, 2, 3)

        # for each iteration
        for k in range(0, self.K+1):
            # locate an attentional region
            f_k, M_k_for_visual = self.ST(f_I, M[k].clone(), k)

            # reduce the dimension to lower the GPU memory requirement
            f_k = self.pooling(f_k)
            f_k = self.fc(f_k.view(batch_size, -1))

            # predict the scores for this region
            hidden = self.lstm(f_k, hidden)

            # get z_k for further calculating M and the scores
            z_k = self.get_zk(hidden[0])

            if k != 0:
                # obtain the score vector of the current iteration
                scores[k-1] = self.get_score(z_k)
                M_for_visual[k-1] = M_k_for_visual

            if k != self.K:
                # update the transformation matrix for the next iteration;
                # zero the off-diagonal terms so only scaling and translation remain
                M[k+1] = self.update_m(z_k).view(batch_size, 2, 3)
                M[k+1, :, 0, 1] = tensor(0.)
                M[k+1, :, 1, 0] = tensor(0.)

        # max pooling over iterations to obtain the final fused scores
        fused_scores = scores.max(0)

        if return_whole_scores:
            return fused_scores[0], M_for_visual, scores
        else:
            return fused_scores[0], M
--------------------------------------------------------------------------------
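The priori amounts to biasing iterations 2 to 5 towards the four quadrant centers of the normalized [-1, 1] coordinate frame that `affine_grid` uses. A small check of the offset arithmetic:

```python
# Sketch, not part of the repo: how the quadrant priors shift a predicted
# translation in affine_grid's normalized [-1, 1] coordinates.
import torch
from torch import tensor

cx = tensor([0.5, 0.5, -0.5, -0.5]).view(4, -1)
cy = tensor([0.5, -0.5, 0.5, -0.5]).view(4, -1)
theta = tensor([[[0.5, 0., 0.], [0., 0.5, 0.]]])  # half-scale, centered
for k in range(2, 6):                             # the iterations that get a prior
    shifted = theta.clone()
    shifted[:, 0, 2] += cx[k-2]
    shifted[:, 1, 2] += cy[k-2]
    print(k, shifted[0, :, 2].tolist())           # translation per iteration
```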
/models/loss.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from torch import tensor
import torch.nn.functional as F
import torch
from math import sqrt, pi, cos, sin

# hyperparameters
alpha = 0.5
beta = 0.1
lambda1 = 0.01
lambda2 = 0.1
gamma = 0.1


def getAnchorPoints(num_points):
    radius = 0.5 * sqrt(2)
    # angular difference between two consecutive anchor points
    diff = 2 * pi / num_points
    cx = [radius * cos(i * diff) for i in range(0, num_points)]
    cy = [radius * sin(i * diff) for i in range(0, num_points)]

    return tensor(cx).view(num_points, -1), tensor(cy).view(num_points, -1)


'''
Loss Function for AttentionImageClass
=======================================================================================
@Args:
    input  : score vectors (torch.cuda.FloatTensor[batch_size, num_categories])
    target : target (torch.cuda.FloatTensor[batch_size, num_categories])
    M      : transformation matrices (torch.FloatTensor[num_iterations+1, batch_size, 2, 3])

@Returns:
    total_loss
=======================================================================================
'''
def loss_function(input, target, M, add_constraint=False):
    '''
    [variable] 'pp'       : predicted probability vector
    [variable] 'gtp'      : ground-truth probability vector
    [variable] 'loss_cls' : loss for classification
    [variable] 'loss_loc' : loss for localization
    '''

    # extract arguments from theta (that is, the transformation matrices)
    # ===================================================================
    sx = M[1:, :, 0, 0]
    sy = M[1:, :, 1, 1]
    tx = M[1:, :, 0, 2]
    ty = M[1:, :, 1, 2]

    # anchor points
    # =============
    cx = tensor([0., 0.5, 0.5, -0.5, -0.5]).view(5, -1)
    cy = tensor([0., 0.5, -0.5, 0.5, -0.5]).view(5, -1)
    #cx, cy = getAnchorPoints(M.size(0) - 2)

    # calculate the predicted & ground-truth probability vectors
    # ===========================================================
    pp = F.softmax(input, dim=1)
    gtp = target.div(target.norm(p=1, dim=1).view(input.size()[0], -1))

    # calculate the loss for classification
    # ======================================
    loss_cls = F.mse_loss(pp, gtp, reduction='sum')

    if not add_constraint:
        return loss_cls

    # calculate the loss for localization
    # ===================================
    # anchor constraint: pull each attentional region towards its anchor point
    loss_A = torch.sum(0.5 * ((tx - cx)**2 + (ty - cy)**2))

    # scale constraint: penalize scales larger than alpha
    loss_sx = torch.sum(torch.max(abs(sx) - alpha, tensor(0.)) ** 2)
    loss_sy = torch.sum(torch.max(abs(sy) - alpha, tensor(0.)) ** 2)
    loss_S = loss_sx + loss_sy

    # positive constraint: keep both scales above beta
    loss_P = torch.sum(torch.max(beta - sx, tensor(0.)) + torch.max(beta - sy, tensor(0.)))

    loss_loc = (loss_S + lambda1 * loss_A + lambda2 * loss_P).cuda()

    # calculate the total loss
    # ========================
    total_loss = loss_cls + gamma * loss_loc

    print("M ", M)
    print('loss_A ', loss_A)
    print('loss_loc ', loss_loc)
    print("loss_cls ", loss_cls)
    print('total_loss ', total_loss)

    return total_loss
--------------------------------------------------------------------------------
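A tiny CPU check of the scale and positive constraints on a hand-made M (a sketch; the `.cuda()` call of the real loss_function is dropped here):

```python
# Sketch, not part of the repo: the localization constraints on a toy M.
import torch

alpha, beta = 0.5, 0.1
M = torch.zeros(6, 1, 2, 3)          # K+1 = 6 transforms, batch of 1
M[:, :, 0, 0] = 0.7                  # sx larger than alpha -> scale penalty
M[:, :, 1, 1] = 0.05                 # sy smaller than beta -> positive penalty
sx, sy = M[1:, :, 0, 0], M[1:, :, 1, 1]
loss_S = ((sx.abs() - alpha).clamp(min=0) ** 2).sum() \
       + ((sy.abs() - alpha).clamp(min=0) ** 2).sum()
loss_P = ((beta - sx).clamp(min=0) + (beta - sy).clamp(min=0)).sum()
print(loss_S.item(), loss_P.item())  # 0.2 and 0.25 over the 5 scored iterations
```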
/models/loss_with_priori.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from torch import tensor
import torch.nn.functional as F
import torch
from math import sqrt, pi, cos, sin


# hyperparameters
alpha = 0.5
beta = 0.1
lambda1 = 0.01
lambda2 = 0.1
gamma = 0.1


def getAnchorPoints(num_points):
    radius = 0.5 * sqrt(2)
    # angular difference between two consecutive anchor points
    diff = 2 * pi / num_points
    cx = [radius * cos(i * diff) for i in range(0, num_points)]
    cy = [radius * sin(i * diff) for i in range(0, num_points)]

    return tensor(cx).view(num_points, -1), tensor(cy).view(num_points, -1)


'''
Loss Function for AttentionImageClass (with priori)
=======================================================================================
@Args:
    input  : score vectors (torch.cuda.FloatTensor[batch_size, num_categories])
    target : target (torch.cuda.FloatTensor[batch_size, num_categories])
    M      : transformation matrices (torch.FloatTensor[num_iterations+1, batch_size, 2, 3])

@Returns:
    total_loss
=======================================================================================
'''
def loss_function(input, target, M, add_constraint=False):
    '''
    [variable] 'pp'       : predicted probability vector
    [variable] 'gtp'      : ground-truth probability vector
    [variable] 'loss_cls' : loss for classification
    [variable] 'loss_loc' : loss for localization
    '''
    # extract arguments from theta (that is, the transformation matrices)
    # ===================================================================
    sx = M[1:, :, 0, 0]
    sy = M[1:, :, 1, 1]
    # the quadrant offsets are added inside the module's ST, so only the
    # residual translations of iterations 2..K are penalized here
    tx = M[2:, :, 0, 2]
    ty = M[2:, :, 1, 2]

    # anchor points (kept for reference; the offsets now live in the module)
    # ======================================================================
    cx = tensor([0.5, 0.5, -0.5, -0.5]).view(4, -1)
    cy = tensor([0.5, -0.5, 0.5, -0.5]).view(4, -1)
    #cx, cy = getAnchorPoints(M.size(0) - 2)

    # calculate the predicted & ground-truth probability vectors
    # ===========================================================
    pp = F.softmax(input, dim=1)
    gtp = target.div(target.norm(p=1, dim=1).view(input.size()[0], -1))

    # calculate the loss for classification
    # ======================================
    loss_cls = F.mse_loss(pp, gtp, reduction='sum')

    if not add_constraint:
        return loss_cls

    # calculate the loss for localization
    # ===================================
    # anchor constraint: pull the residual translations towards zero
    loss_A = torch.sum(0.5 * (tx**2 + ty**2))

    # scale constraint: penalize scales larger than alpha
    loss_sx = torch.sum(torch.max(abs(sx) - alpha, tensor(0.)) ** 2)
    loss_sy = torch.sum(torch.max(abs(sy) - alpha, tensor(0.)) ** 2)
    loss_S = loss_sx + loss_sy

    # positive constraint: keep both scales above beta
    loss_P = torch.sum(torch.max(beta - sx, tensor(0.)) + torch.max(beta - sy, tensor(0.)))

    loss_loc = (loss_S + lambda1 * loss_A + lambda2 * loss_P).cuda()

    # calculate the total loss
    # ========================
    total_loss = loss_cls + gamma * loss_loc

    print("M ", M)
    print('loss_A ', loss_A)
    print('loss_loc ', loss_loc)
    print("loss_cls ", loss_cls)
    print('total_loss ', total_loss)

    return total_loss
--------------------------------------------------------------------------------

/priori_main.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import transforms as transforms
import numpy as np
import visdom
import torch.nn.functional as F

import argparse
import os

from models.RMA_module_with_priori import RMA_module
from models.loss_with_priori import loss_function
from utils import get_target_transform as target_trans

# data visualization
vis = visdom.Visdom(env='baseline(with priori)')
# GPU setting
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

# ==================================================================
# Constants
# ==================================================================
EPOCH = 45                  # number of passes through the training set
BATCH_SIZE = 16             # number of images in each mini-batch
LEARNING_RATE = 1e-5        # default learning rate
WEIGHT_DECAY = 0            # default weight decay
N = 512                     # size of input images (512 or 640)
MOMENTUM = (0.9, 0.999)     # beta parameters of the Adam optimizer
TOPK = 3                    # top k highest-ranked labels
GPU_IN_USE = torch.cuda.is_available()  # whether a GPU is available
DIR_TRAIN_IMAGES   = '../dataset/train2017/'
DIR_TEST_IMAGES    = '../dataset/val2017/'
PATH_TRAIN_ANNFILE = '../dataset/annotations/instances_train2017.json'
PATH_TEST_ANNFILE  = '../dataset/annotations/instances_val2017.json'
PATH_MODEL_PARAMS  = './params/params_with_priori.pkl'
NUM_CATEGORIES = 80
LOSS_OUTPUT_INTERVAL = 100

# ==================================================================
# Global Variables
# ==================================================================
# one iteration means one mini-batch finishes a forward-backward process
current_training_iteration = torch.tensor([1])
current_test_iteration = torch.tensor([1])
loss_graph_window = 'loss graph'
test_f1_graph_window = 'test OF1 and CF1 graph'
evaluation_window = 'six evaluation metrics'
#category_id_window = 'category ids of prediction and ground-truth'
of1 = 0.
cf1 = 0.

# ==================================================================
# Parser Initialization
# ==================================================================
parser = argparse.ArgumentParser(description='PyTorch Implementation of ICCV2017_AttentionImageClass')
parser.add_argument('--lr', default=LEARNING_RATE, type=float, help='learning rate')
parser.add_argument('--epoch', default=EPOCH, type=int, help='number of epochs')
parser.add_argument('--trainBatchSize', default=BATCH_SIZE, type=int, help='training batch size')
parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size')
parser.add_argument('--weightDecay', default=WEIGHT_DECAY, type=float, help='weight decay')
parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters')
parser.add_argument('--saveModel', default=True, type=bool, help='save model parameters')
parser.add_argument('--loadModel', default=False, type=bool, help='load model parameters')
args = parser.parse_args()


# ==================================================================
# Prepare Dataset (training & test)
# ==================================================================
print('***** Prepare Data ******')

# transforms of the training dataset
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # default value is 0.5
    transforms.Resize((N, N)),
    transforms.ToTensor(),
    normalize
])

# transforms of the test dataset
test_transforms = transforms.Compose([
    transforms.Resize((N, N)),
    transforms.ToTensor(),
    normalize
])

train_dataset = torchvision.datasets.CocoDetection(root=DIR_TRAIN_IMAGES, annFile=PATH_TRAIN_ANNFILE,
                                                   transform=train_transforms, target_transform=target_trans)
test_dataset = torchvision.datasets.CocoDetection(root=DIR_TEST_IMAGES, annFile=PATH_TEST_ANNFILE,
                                                  transform=test_transforms, target_transform=target_trans)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.trainBatchSize, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.testBatchSize, shuffle=False, num_workers=2)
print('Data Preparation : Finished')


# ==================================================================
# Prepare Model
# ==================================================================
print('\n***** Prepare Model *****')
vgg16 = torchvision.models.vgg16(pretrained=True)

# freeze the VGG-16 backbone; only the RMA module is trained
for param in vgg16.features.parameters():
    param.requires_grad = False

extract_features = vgg16.features

RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096)
if args.loadModel:
    RMA.load_state_dict(torch.load(args.pathModelParams))

if GPU_IN_USE:
    print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
    print('cuda: move all model parameters and buffers to the GPU')
    extract_features.cuda()
    RMA.cuda()
    cudnn.benchmark = True

# Adam optimization
optimizer = optim.Adam(RMA.parameters(), lr=args.lr, weight_decay=args.weightDecay, betas=MOMENTUM)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], gamma=0.5)  # lr decay
print('Model Preparation : Finished')


# Train
# ================================================================================
# data:       [torch.cuda.FloatTensor of size [batch_size, 3, N, N]  N=512/640]
# target:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# output:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# prediction: [
#              [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
#              [torch.cuda.LongTensor of size [batch_size, TOPK]  (index of TOPK)]
#             ]
# ================================================================================
def train():
    print('train:')
    RMA.train()      # set the module in training mode
    train_loss = 0.  # sum of train loss up to the current batch

    global current_training_iteration

    sum_prediction_label = torch.zeros(1, 80) + 1e-6
    sum_correct_prediction_label = torch.zeros(1, 80)
    sum_ground_truth_label = torch.zeros(1, 80)

    for batch_num, (data, target) in enumerate(train_loader):
        if target.sum() == 0:
            continue
        # drop samples without any label; compute the indices once so that
        # data and target stay aligned
        keep = torch.nonzero(target.sum(dim=1)).view(-1)
        target = target.index_select(0, keep)
        data = data.index_select(0, keep)

        if GPU_IN_USE:
            data, target = data.cuda(), target.cuda()

        # -----forward-----
        optimizer.zero_grad()
        f_I = extract_features(data)
        output, M = RMA(f_I)
        # ---end forward---

        # ---calculate loss and backward---
        loss = loss_function(output, target, M, add_constraint=True)
        loss.backward()
        optimizer.step()
        # ----------end backward-----------

        train_loss += loss.item()  # detach from the graph before accumulating
        prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
        # keep only the labels whose predicted probability is at least 0.1
        mask = prediction[0].ge(0.1)
        prediction_index = torch.mul(prediction[1]+1, mask.type(torch.cuda.LongTensor))
        extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
        prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
        correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)

        # accumulate the label statistics
        sum_prediction_label += prediction_label.sum(dim=0)
        sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
        sum_ground_truth_label += target.cpu().sum(dim=0)
        if batch_num % LOSS_OUTPUT_INTERVAL == 0:
            # visualization: draw the train loss graph
            vis.line(
                X=current_training_iteration,
                Y=torch.tensor([train_loss]) / (batch_num+1),
                win=loss_graph_window,
                name='train loss',
                update=None if current_training_iteration == 1 else 'append',
                opts=dict(xlabel='iteration', ylabel='loss', showlegend=True)
            )
            print('loss %.3f (batch %d)' % (train_loss/(batch_num+1), batch_num+1))
            current_training_iteration += LOSS_OUTPUT_INTERVAL

    # evaluation metrics
    o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
    o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
    of1 = torch.div(2 * o_p * o_r, o_p + o_r)
    c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
    c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
    cf1 = torch.div(2 * c_p * c_r, c_p + c_r)

    return c_p, c_r, cf1, o_p, o_r, of1


# Test
# ================================================================================
# data:       [torch.cuda.FloatTensor of size [batch_size, 3, N, N]  N=512/640]
# target:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# output:     [torch.cuda.FloatTensor of size [batch_size, num_categories]]
# prediction: [
#              [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
#              [torch.cuda.LongTensor of size [batch_size, TOPK]  (index of TOPK)]
#             ]
# ================================================================================
def test():
    print('test:')
    RMA.eval()      # set the module in evaluation mode
    test_loss = 0.  # sum of test loss up to the current batch

    global current_test_iteration

    sum_prediction_label = torch.zeros(1, 80) + 1e-6
    sum_correct_prediction_label = torch.zeros(1, 80)
    sum_ground_truth_label = torch.zeros(1, 80)

    for batch_num, (data, target) in enumerate(test_loader):
        if target.sum() == 0:
            continue
        # drop samples without any label; compute the indices once so that
        # data and target stay aligned
        keep = torch.nonzero(target.sum(dim=1)).view(-1)
        target = target.index_select(0, keep)
        data = data.index_select(0, keep)

        if GPU_IN_USE:
            data, target = data.cuda(), target.cuda()  # set up GPU tensors

        f_I = extract_features(data)
        output, M = RMA(f_I)
        loss = loss_function(output, target, M, add_constraint=True)

        test_loss += loss.item()
        prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
        # keep only the labels whose predicted probability is at least 0.1
        mask = prediction[0].ge(0.1)
        prediction_index = torch.mul(prediction[1]+1, mask.type(torch.cuda.LongTensor))
        extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
        prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
        correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)

        # accumulate the label statistics
        sum_prediction_label += prediction_label.sum(dim=0)
        sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
        sum_ground_truth_label += target.cpu().sum(dim=0)

        if batch_num % LOSS_OUTPUT_INTERVAL == 0:
            # visualization: draw the test loss graph
            vis.line(
                X=current_test_iteration,
                Y=torch.tensor([test_loss]) / (batch_num+1),
                win=loss_graph_window,
                name='test loss',
                update='insert' if current_test_iteration == 1 else 'append',
                opts=dict(showlegend=True),
            )
            print('loss %.3f (batch %d)' % (test_loss / (batch_num+1), batch_num+1))
            current_test_iteration += LOSS_OUTPUT_INTERVAL

    # evaluation metrics
    o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
    o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
    of1 = torch.div(2 * o_p * o_r, o_p + o_r)
    c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
    c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
    cf1 = torch.div(2 * c_p * c_r, c_p + c_r)

    return c_p, c_r, cf1, o_p, o_r, of1


# ==================================================================
# Save Model
# ==================================================================
def save():
    torch.save(RMA.state_dict(), args.pathModelParams)
    print('Checkpoint saved to {}'.format(args.pathModelParams))


# ==================================================================
# Main Loop
# ==================================================================
for current_epoch in range(1, args.epoch + 1):
    print('\n===> epoch: %d/%d' % (current_epoch, args.epoch))
    train_cp, train_cr, train_cf1, train_op, train_or, train_of1 = train()
    with torch.no_grad():
test_op, test_or, test_of1 = test() 309 | 310 | evaluation_metrics = ''' 311 |
312 | ===> epoch: %d/%d
313 | ------------------------------------------------------------- 314 | | CP | CR | CF1 | OP | OR | OF1 | 315 | ------------------------------------------------------------- 316 | | %.3f | %.3f | %.3f | %.3f | %.3f | %.3f | 317 | ------------------------------------------------------------- 318 |
319 | ''' % (current_epoch, args.epoch, test_cp, test_cr, test_cf1, test_op, test_or, test_of1)
320 |
321 | # visualization
322 | vis.line(
323 | X=torch.tensor([current_epoch]),
324 | Y=torch.tensor([test_cf1]),
325 | name='test_CF1',
326 | win=test_f1_graph_window,
327 | update=None if current_epoch == 1 else 'append',
328 | opts=dict(xlabel='epoch', ylabel='F1', showlegend=True, title='Evaluation of Test (CF1 / OF1)')
329 | )
330 | vis.line(
331 | X=torch.tensor([current_epoch]),
332 | Y=torch.tensor([test_of1]),
333 | name='test_OF1',
334 | win=test_f1_graph_window,
335 | update='append', # the CF1 call above already created the window; 'append' adds the OF1 trace
336 | opts=dict(showlegend=True)
337 | )
338 | vis.text(
339 | evaluation_metrics,
340 | win=evaluation_window,
341 | append=False if current_epoch == 1 else True
342 | )
343 |
344 | if test_of1 > of1 and test_cf1 > cf1: # checkpoint only when both F1 scores improve
345 | if args.saveModel:
346 | save()
347 | of1 = test_of1
348 | cf1 = test_cf1
349 |
350 | if current_epoch == args.epoch:
351 | print('===> BEST PERFORMANCE (OF1/CF1): %.3f / %.3f' % (of1, cf1))
352 |
353 |
--------------------------------------------------------------------------------
/priori_main_tencrop.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.optim as optim
3 | import torch.utils.data
4 | import torch.backends.cudnn as cudnn
5 | import torchvision
6 | from torchvision import transforms as transforms
7 | import numpy as np
8 | import visdom
9 | import torch.nn.functional as F
10 |
11 | import argparse
12 | import os
13 |
14 | from models.RMA_module_with_priori import RMA_module
15 | from models.loss_with_priori import loss_function
16 | from utils import get_target_transform as target_trans
17 | from utils import *
18 |
19 | # data visualization
20 | vis = visdom.Visdom(env='priori_tencrop')
21 | # GPU setting
22 | os.environ.setdefault("CUDA_VISIBLE_DEVICES", "5")
23 |
24 | # ==================================================================
25 | # Constants
26 | # ==================================================================
27 | EPOCH = 45 # number of passes over the training set
28 | BATCH_SIZE = 16 # number of images in each mini-batch
29 | LEARNING_RATE = 1e-5 # default learning rate
30 | WEIGHT_DECAY = 0 # default weight decay
31 | N = 576 # size of input images (512, 576, or 640)
32 | MOMENTUM = (0.9, 0.999) # beta coefficients for the Adam optimizer
33 | TOPK = 3 # top k highest-ranked labels
34 | GPU_IN_USE = torch.cuda.is_available() # whether using GPU
35 | DIR_TRAIN_IMAGES = '../dataset/train2017/'
36 | DIR_TEST_IMAGES = '../dataset/val2017/'
37 | PATH_TRAIN_ANNFILE = '../dataset/annotations/instances_train2017.json'
38 | PATH_TEST_ANNFILE = '../dataset/annotations/instances_val2017.json'
39 | PATH_MODEL_PARAMS = './params/params_with_priori.pkl'
40 | NUM_CATEGORIES = 80
41 | LOSS_OUTPUT_INTERVAL = 100
42 | CROPSIZE_512 = 16 # crop size on the VGG16 feature map (18x18 for N = 576)
43 |
44 | # ==================================================================
45 | # Global Variables
46 | # ==================================================================
47 | # one iteration means one mini-batch finishes a forward-backward process
48 | current_training_iteration = torch.tensor([1])
49 | current_test_iteration = torch.tensor([1])
50 | loss_graph_window = 'loss graph'
51 | test_f1_graph_window = 'test OF1 and CF1 graph'
52 | evaluation_window = 'six evaluation metrics'
53 | #category_id_window = 'category ids of prediction and ground-truth'
54 | of1 = 0.
55 | cf1 = 0.
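Both `train()` and `test()` below decode the network's top-10 softmax scores into 80-dimensional multi-hot label vectors by indexing a zero-padded identity matrix, which is what the `extend_eye_mat` lines implement. A minimal standalone sketch of that decoding step (the random `output` tensor is a stand-in for the RMA logits, not the actual model output):

```python
import torch

output = torch.randn(4, 80)  # stand-in logits: batch of 4 images, 80 COCO categories
probs, index = torch.topk(torch.softmax(output, dim=1), 10, dim=1)

# Keep only labels whose confidence reaches 0.1; the +1 shift makes every
# discarded slot point at row 0 of the padded identity matrix below.
keep = probs.ge(0.1)
shifted = (index + 1) * keep.long()  # 0 now marks "no label"

# Row 0 is all zeros; rows 1..80 are the one-hot category rows.
extend_eye = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
multi_hot = extend_eye[shifted.view(-1)].view(-1, 10, 80).sum(dim=1)
print(multi_hot.shape)  # torch.Size([4, 80])
```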
56 | 57 | # ================================================================== 58 | # Parser Initialization 59 | # ================================================================== 60 | parser = argparse.ArgumentParser(description='Pytorch Implementation of ICCV2017_AttentionImageClass') 61 | parser.add_argument('--lr', default=LEARNING_RATE, type=float, help='learning rate') 62 | parser.add_argument('--epoch', default=EPOCH, type=int, help='number of epochs') 63 | parser.add_argument('--trainBatchSize', default=BATCH_SIZE, type=int, help='training batch size') 64 | parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size') 65 | parser.add_argument('--weightDecay', default=WEIGHT_DECAY, type=float, help='weight decay') 66 | parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters') 67 | parser.add_argument('--saveModel', default=True, type=bool, help='save model parameters') 68 | parser.add_argument('--loadModel', default=False, type=bool, help='load model parameters') 69 | args = parser.parse_args() 70 | 71 | 72 | # ================================================================== 73 | # Prepare Dataset(training & test) 74 | # ================================================================== 75 | print('***** Prepare Data ******') 76 | 77 | # transforms of training dataset 78 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 79 | std=[0.229, 0.224, 0.225]) 80 | train_transforms = transforms.Compose([ 81 | transforms.RandomHorizontalFlip(p=0.5), # default value is 0.5 82 | transforms.Resize((N, N)), 83 | transforms.ToTensor(), 84 | normalize 85 | ]) 86 | 87 | # transforms of test dataset 88 | test_transforms = transforms.Compose([ 89 | transforms.Resize((N, N)), 90 | transforms.ToTensor(), 91 | normalize 92 | ]) 93 | 94 | train_dataset = torchvision.datasets.CocoDetection(root=DIR_TRAIN_IMAGES, annFile=PATH_TRAIN_ANNFILE, 95 | transform=train_transforms, target_transform=target_trans) 96 | test_dataset = torchvision.datasets.CocoDetection(root=DIR_TEST_IMAGES, annFile=PATH_TEST_ANNFILE, 97 | transform=test_transforms, target_transform=target_trans) 98 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.trainBatchSize, shuffle=True, num_workers=2) 99 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.testBatchSize, shuffle=False, num_workers=2) 100 | print('Data Preparation : Finished') 101 | 102 | 103 | # ================================================================== 104 | # Prepare Model 105 | # ================================================================== 106 | print('\n***** Prepare Model *****') 107 | 108 | vgg16 = torchvision.models.vgg16(pretrained=True) 109 | 110 | for param in vgg16.features.parameters(): 111 | param.requires_grad=False 112 | 113 | extract_features = vgg16.features 114 | 115 | RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096) 116 | if args.loadModel: 117 | RMA.load_state_dict(torch.load(args.pathModelParams)) 118 | 119 | if GPU_IN_USE: 120 | print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES']) 121 | print('cuda: move all model parameters and buffers to the GPU') 122 | extract_features.cuda() 123 | RMA.cuda() 124 | cudnn.benchmark = True 125 | 126 | # Adam optimization 127 | optimizer = optim.Adam(RMA.parameters(), lr=args.lr, weight_decay=args.weightDecay, betas=MOMENTUM) 128 | # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[75, 150], 
gamma=0.5) # lr decay
129 | print('Model Preparation : Finished')
130 |
131 |
132 | # Train
133 | # ================================================================================
134 | # data: [torch.cuda.FloatTensor of size [batch_size, 3, N, N] N=512/640]
135 | # target: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
136 | # output: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
137 | # prediction: [
138 | # [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
139 | # [torch.cuda.LongTensor of size [batch_size, TOPK] (index of TOPK)]
140 | # ]
141 | # ================================================================================
142 | def train():
143 | print('train:')
144 | RMA.train() # set the module in training mode
145 | train_loss = 0. # sum of train loss up to the current batch
146 |
147 | global current_training_iteration
148 |
149 | sum_prediction_label = torch.zeros(1, 80) + 1e-6
150 | sum_correct_prediction_label = torch.zeros(1, 80)
151 | sum_ground_truth_label = torch.zeros(1, 80)
152 |
153 | for batch_num, (data, target) in enumerate(train_loader):
154 | if target.sum() == 0:
155 | continue
156 | nonzero_rows = torch.nonzero(target.sum(dim=1)).view(-1) # compute the indices once, before target is filtered
157 | target, data = target.index_select(0, nonzero_rows), data.index_select(0, nonzero_rows)
158 |
159 | if GPU_IN_USE:
160 | data, target = data.cuda(), target.cuda()
161 |
162 | # -----forward-----
163 | optimizer.zero_grad()
164 | f_I = extract_features(data)
165 | output, M = RMA(f_I)
166 | # ---end forward---
167 |
168 | # ---calculate loss and backward---
169 | loss = loss_function(output, target, M, add_constraint=True)
170 | loss.backward()
171 | optimizer.step()
172 | # ----------end backward-----------
173 |
174 | train_loss += loss.item() # .item() detaches; accumulating the tensor itself would keep every batch's graph alive
175 | prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
176 | filter = prediction[0].ge(0.1) # keep only labels whose confidence reaches 0.1
177 | prediction_index = torch.mul(prediction[1]+1, filter.type(torch.cuda.LongTensor))
178 | extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
179 | prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
180 | correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)
181 |
182 | # accumulate the per-category label counts
183 | sum_prediction_label += prediction_label.sum(dim=0)
184 | sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
185 | sum_ground_truth_label += target.cpu().sum(dim=0)
186 |
187 | #for i in range(0, target.size(0)):
188 | # print('-----------------')
189 | # print('ground-truth: ', target[i].nonzero().view(-1))
190 | # print('prediction: ', prediction[1][i])
191 | # print('-----------------')
192 |
193 | if batch_num % LOSS_OUTPUT_INTERVAL == 0:
194 | # visualization: draw the train loss graph
195 | vis.line(
196 | X=current_training_iteration,
197 | Y=torch.tensor([train_loss]) / (batch_num+1),
198 | win=loss_graph_window,
199 | name='train loss',
200 | update=None if current_training_iteration == 1 else 'append',
201 | opts=dict(xlabel='iteration', ylabel='loss', showlegend=True)
202 | )
203 | print('loss %.3f (batch %d)' % (train_loss/(batch_num+1), batch_num+1))
204 | current_training_iteration += LOSS_OUTPUT_INTERVAL
205 |
206 | # evaluation metrics
207 | o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
208 | o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
209 | of1 = torch.div(2 * o_p * o_r, o_p + o_r)
210 | c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
211 | c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
212 | cf1 = torch.div(2 * c_p * c_r, c_p + c_r)
213 |
214 | return c_p, c_r, cf1, o_p, o_r, of1
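The six numbers returned above, and recomputed identically in `test()` below, are the standard per-class (C) and overall (O) precision, recall, and F1. A compact restatement of the formulas, assuming the three accumulated 1x80 count vectors; note that the scripts only guard the predicted counts against division by zero (the `+ 1e-6` initialisation), so a category that never appears in the ground truth makes `correct / truth` a 0/0 = NaN term:

```python
import torch

def evaluation_metrics(correct, predicted, truth, num_categories=80):
    """correct/predicted/truth: 1x80 per-category counts accumulated over the split."""
    op = correct.sum() / predicted.sum()               # overall precision (counts pooled over categories)
    orec = correct.sum() / truth.sum()                 # overall recall
    of1 = 2 * op * orec / (op + orec)
    cp = (correct / predicted).sum() / num_categories  # per-class precision (mean of per-category ratios)
    crec = (correct / truth).sum() / num_categories    # per-class recall
    cf1 = 2 * cp * crec / (cp + crec)
    return cp, crec, cf1, op, orec, of1
```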
215 |
216 |
217 | # Test
218 | # ================================================================================
219 | # data: [torch.cuda.FloatTensor of size [batch_size, 3, N, N] N=512/640]
220 | # target: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
221 | # output: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
222 | # prediction: [
223 | # [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
224 | # [torch.cuda.LongTensor of size [batch_size, TOPK] (index of TOPK)]
225 | # ]
226 | # ================================================================================
227 | def test():
228 | print('test:')
229 | RMA.eval() # set the module in evaluation mode
230 | test_loss = 0. # sum of test loss up to the current batch
231 |
232 | global current_test_iteration
233 |
234 | sum_prediction_label = torch.zeros(1, 80) + 1e-6
235 | sum_correct_prediction_label = torch.zeros(1, 80)
236 | sum_ground_truth_label = torch.zeros(1, 80)
237 |
238 | for batch_num, (data, target) in enumerate(test_loader):
239 | if target.sum() == 0:
240 | continue
241 | nonzero_rows = torch.nonzero(target.sum(dim=1)).view(-1) # compute the indices once, before target is filtered
242 | target, data = target.index_select(0, nonzero_rows), data.index_select(0, nonzero_rows)
243 |
244 | if GPU_IN_USE:
245 | data, target = data.cuda(), target.cuda() # set up GPU Tensor
246 |
247 | f_I = extract_features(data)
248 | # ten-crop
249 | # f_I: batchsize*channel*inputSize*inputSize
250 | # tencrop_results: 10*batchsize*channel*cropSize*cropSize
251 | tencrop_results = tencrop(f_I, f_I.size(0), f_I.size(1), f_I.size(2), CROPSIZE_512)
252 | RMA_outputs = torch.zeros(target.size())
253 | RMA_losses = 0
254 | tencrop_results = tencrop_results.cuda()
255 | RMA_outputs = RMA_outputs.cuda()
256 |
257 | for i in range(10):
258 | crop_RMA_output, crop_RMA_M = RMA(tencrop_results[i])
259 | RMA_outputs += crop_RMA_output
260 | RMA_losses += loss_function(crop_RMA_output, target, crop_RMA_M, add_constraint=True)
261 |
262 | output = RMA_outputs * 0.1 # average the outputs and losses of the ten crops
263 | loss = RMA_losses * 0.1
264 |
265 | # output, M = RMA(f_I)
266 | # loss = loss_function(output, target, M, add_constraint=True)
267 |
268 | test_loss += loss
269 | prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
270 | filter = prediction[0].ge(0.1) # keep only labels whose confidence reaches 0.1
271 | prediction_index = torch.mul(prediction[1]+1, filter.type(torch.cuda.LongTensor))
272 | extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
273 | prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
274 | correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)
275 |
276 | # accumulate the per-category label counts
277 | sum_prediction_label += prediction_label.sum(dim=0)
278 | sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
279 | sum_ground_truth_label += target.cpu().sum(dim=0)
280 |
281 | #for i in range(0, target.size(0)):
282 | # print('-----------------')
283 | # print('ground-truth: ', target[i].nonzero().view(-1))
284 | # print('prediction: ', prediction_index[i] - 1)
285 | # print('-----------------')
286 | #
287 |
288 | if batch_num % LOSS_OUTPUT_INTERVAL == 0:
289 | # visualization: draw the test loss graph
290 | vis.line(
291 |
X=current_test_iteration, 292 | Y=torch.tensor([test_loss.data]) / (batch_num+1), 293 | win=loss_graph_window, 294 | name='test loss', 295 | update=None if current_test_iteration == 1 else 'append', 296 | # update='insert' if current_test_iteration == 1 else 'append', 297 | opts=dict(showlegend=True), 298 | ) 299 | print('loss %.3f (batch %d)' % (test_loss / (batch_num+1), batch_num+1)) 300 | current_test_iteration += LOSS_OUTPUT_INTERVAL 301 | 302 | # evaluation metrics 303 | o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum()) 304 | o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum()) 305 | of1 = torch.div(2 * o_p * o_r, o_p + o_r) 306 | c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES 307 | c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES 308 | cf1 = torch.div(2 * c_p * c_r, c_p + c_r) 309 | 310 | return c_p, c_r, cf1, o_p, o_r, of1 311 | 312 | 313 | # ================================================================== 314 | # Save Model 315 | # ================================================================== 316 | def save(): 317 | torch.save(RMA.state_dict(), args.pathModelParams) 318 | print('Checkpoint saved to {}'.format(args.pathModelParams)) 319 | 320 | 321 | # ================================================================== 322 | # Main Loop 323 | # ================================================================== 324 | for current_epoch in range(1, args.epoch + 1): 325 | print('\n===> epoch: %d/%d' % (current_epoch, args.epoch)) 326 | # train_cp, train_cr, train_cf1, train_op, train_or, train_of1 = train() 327 | with torch.no_grad(): 328 | test_cp, test_cr, test_cf1, test_op, test_or, test_of1 = test() 329 | 330 | evaluation_metrics = ''' 331 |
332 | ===> epoch: %d/%d
333 | ------------------------------------------------------------- 334 | | CP | CR | CF1 | OP | OR | OF1 | 335 | ------------------------------------------------------------- 336 | | %.3f | %.3f | %.3f | %.3f | %.3f | %.3f | 337 | ------------------------------------------------------------- 338 |
339 | ''' % (current_epoch, args.epoch, test_cp, test_cr, test_cf1, test_op, test_or, test_of1)
340 |
341 | # visualization
342 | vis.line(
343 | X=torch.tensor([current_epoch]),
344 | Y=torch.tensor([test_cf1]),
345 | name='test_CF1',
346 | win=test_f1_graph_window,
347 | update=None if current_epoch == 1 else 'append',
348 | opts=dict(xlabel='epoch', ylabel='F1', showlegend=True, title='Evaluation of Test (CF1 / OF1)')
349 | )
350 | vis.line(
351 | X=torch.tensor([current_epoch]),
352 | Y=torch.tensor([test_of1]),
353 | name='test_OF1',
354 | win=test_f1_graph_window,
355 | update='append', # the CF1 call above already created the window; 'append' adds the OF1 trace
356 | opts=dict(showlegend=True)
357 | )
358 | vis.text(
359 | evaluation_metrics,
360 | win=evaluation_window,
361 | append=False if current_epoch == 1 else True
362 | )
363 |
364 | if test_of1 > of1 and test_cf1 > cf1: # checkpoint only when both F1 scores improve
365 | if args.saveModel:
366 | save()
367 | of1 = test_of1
368 | cf1 = test_cf1
369 |
370 | if current_epoch == args.epoch:
371 | print('===> BEST PERFORMANCE (OF1/CF1): %.3f / %.3f' % (of1, cf1))
372 |
373 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from numpy import *
4 | import torch.utils.data as data
5 | from PIL import Image
6 | import os
7 | import os.path
8 | import torch.nn.functional as F
9 |
10 | CLASS = np.array(['None', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'])
11 |
12 |
13 | def get_target_transform(target):
14 | labelsmap = {}
15 | target_transform = torch.zeros(80)
16 | with open('coco-label.txt', 'r') as labels: # maps raw COCO category ids to contiguous 1..80 indices
17 | for line in labels:
18 | ids = line.split(',')
19 | labelsmap[int(ids[0])] = int(ids[1])
20 | for obj in target:
21 | if 'category_id' in obj:
22 | catId = obj['category_id']
23 | target_transform[labelsmap[catId] - 1] = 1
24 | #print(target[0]['image_id'])
25 | #print(target['image']['id'])
26 | return target_transform
27 |
28 | # inputs: batchsize*channel*inputSize*inputSize (tensor)
29 | # batchSize: number of samples in the batch
30 | # channel: number of feature channels
31 | # inputSize: spatial size of the square input feature map
32 | # cropSize: spatial size of each square crop
33 | # return: 10*batchsize*channel*cropSize*cropSize (tensor)
34 | def tencrop(inputs, batchSize, channel, inputSize, cropSize):
35 | crops = torch.zeros(10, batchSize, channel, cropSize, cropSize).numpy()
36 | inputs = inputs.detach().cpu().numpy() # the numpy slicing below cannot read CUDA tensors directly
37 | edgestart = inputSize - cropSize
38 | midstart = int(floor(inputSize/2) - floor(cropSize/2))
39 | midend = midstart + cropSize
40 |
41 | crops[0] = inputs[:,:,0:cropSize,0:cropSize]
42 | crops[1] = inputs[:,:,0:cropSize,edgestart:inputSize]
43 | crops[2] = inputs[:,:,edgestart:inputSize,0:cropSize]
44 | crops[3] = inputs[:,:,edgestart:inputSize,edgestart:inputSize]
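# ten-crop layout: indices 0-3 are the four corner crops and 4 is the center crop;
# indices 5-9 below are horizontal mirrors of 0-4 ([..., ::-1] flips the width axis)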
45 | crops[4] = inputs[:,:,midstart:midend,midstart:midend]
46 | crops[5] = crops[0][:,:,:,::-1]
47 | crops[6] = crops[1][:,:,:,::-1]
48 | crops[7] = crops[2][:,:,:,::-1]
49 | crops[8] = crops[3][:,:,:,::-1]
50 | crops[9] = crops[4][:,:,:,::-1]
51 | crops = torch.from_numpy(crops)
52 | return crops
53 |
54 |
55 | # input: 'scores' tensor[K, batch_size, num_categories]
56 | # return : 'confidence' tensor[K, batch_size]
57 | # 'className' numpy.ndarray[K, batch_size]
58 | def getPredictedInfo(scores):
59 | confidence, category_id = torch.max(F.softmax(scores, dim=2), dim=2)
60 | #print(category_id)
61 | className = CLASS[(category_id+1).cpu().numpy()] # move indices to the host before indexing the numpy array
62 | return confidence, className
63 |
64 |
65 | def id2label(category_id):
66 | return [CLASS[i.index_select(0, i.nonzero().view(-1))] for i in category_id]
67 |
68 |
--------------------------------------------------------------------------------
/visualize.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.optim as optim
3 | import torch.utils.data
4 | import torch.backends.cudnn as cudnn
5 | import torchvision
6 | from torchvision import transforms as transforms
7 | import numpy as np
8 | import visdom
9 | import torch.nn.functional as F
10 | from CocoDetection import CocoDetection
11 | from visualizeImg import *
12 | #from PIL import Image
13 | #import torch
14 |
15 | import argparse
16 | import os
17 |
18 | from models.RMA_module_with_priori import RMA_module
19 | from models.loss_with_priori import loss_function
20 | from utils import get_target_transform as target_trans
21 |
22 |
23 | # GPU setting
24 | os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
25 |
26 |
27 | # ==================================================================
28 | # Constants
29 | # ==================================================================
30 | EPOCH = 45 # number of passes over the training set
31 | BATCH_SIZE = 16 # number of images in each mini-batch
32 | N = 512 # size of input images (512 or 640)
33 | TOPK = 3 # top k highest-ranked labels
34 | GPU_IN_USE = torch.cuda.is_available() # whether using GPU
35 | DIR_TEST_IMAGES = '../dataset/val2017/'
36 | PATH_TEST_ANNFILE = '../dataset/annotations/instances_val2017.json'
37 | PATH_MODEL_PARAMS = './params/params_with_priori.pkl'
38 | NUM_CATEGORIES = 80
39 | OUTPUT_INTERVAL = 100
40 |
41 |
42 | # ==================================================================
43 | # Parser Initialization
44 | # ==================================================================
45 | parser = argparse.ArgumentParser(description='Pytorch Implementation of ICCV2017_AttentionImageClass')
46 | parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size')
47 | parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters')
48 | parser.add_argument('--loadModel', default=True, type=bool, help='load model parameters')
49 | args = parser.parse_args()
50 |
51 |
52 | # ==================================================================
53 | # Prepare Dataset (test)
54 | # ==================================================================
55 | print('***** Prepare Data ******')
56 |
57 | # transforms of test dataset
58 | #normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
59 | # std=[0.229, 0.224, 0.225])
60 | test_transforms = transforms.Compose([
61 | transforms.Resize((N, N)),
62 | transforms.ToTensor(),
63 | ])
64 |
65 | test_dataset = CocoDetection(root=DIR_TEST_IMAGES,
annFile=PATH_TEST_ANNFILE,
66 | transform=test_transforms, target_transform=target_trans)
67 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.testBatchSize, shuffle=False, num_workers=2)
68 | print('Data Preparation : Finished')
69 |
70 | # ==================================================================
71 | # Prepare Model
72 | # ==================================================================
73 | print('\n***** Prepare Model *****')
74 |
75 | vgg16 = torchvision.models.vgg16(pretrained=True)
76 | extract_features = vgg16.features
77 | RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096)
78 | if args.loadModel:
79 | RMA.load_state_dict(torch.load(args.pathModelParams))
80 |
81 | if GPU_IN_USE:
82 | print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
83 | print('cuda: move all model parameters and buffers to the GPU')
84 | extract_features.cuda()
85 | RMA.cuda()
86 | cudnn.benchmark = True
87 |
88 | print('Model Preparation : Finished')
89 |
90 |
91 | # Test
92 | # ================================================================================
93 | # data: [torch.cuda.FloatTensor of size [batch_size, 3, N, N] N=512/640]
94 | # target: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
95 | # output: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
96 | # prediction: [
97 | # [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
98 | # [torch.cuda.LongTensor of size [batch_size, TOPK] (index of TOPK)]
99 | # ]
100 | # ================================================================================
101 | def test():
102 | print('test:')
103 | #RMA.eval() # set the module in evaluation mode
104 |
105 | sum_prediction_label = torch.zeros(1, 80) + 1e-6
106 | sum_correct_prediction_label = torch.zeros(1, 80)
107 | sum_ground_truth_label = torch.zeros(1, 80)
108 |
109 | for batch_num, (data, target, original_imgs) in enumerate(test_loader):
110 | if target.sum() == 0:
111 | continue
112 | nonzero_rows = torch.nonzero(target.sum(dim=1)).view(-1) # compute the indices once, before target is filtered
113 | target, data = target.index_select(0, nonzero_rows), data.index_select(0, nonzero_rows)
114 | original_imgs = original_imgs.index_select(0, nonzero_rows)
115 |
116 | #print('original_imgs ', original_imgs.size())
117 |
118 | if GPU_IN_USE:
119 | data, target = data.cuda(), target.cuda() # set up GPU Tensor
120 |
121 | f_I = extract_features(data)
122 | output, M, scores = RMA(f_I, return_whole_scores=True)
123 |
124 | #total_thetas.append(M)
125 | #total_scores.append(scores)
126 |
127 | #visualize_attentional_regions(original_imgs, M[1:, :, :, :], scores)
128 | visualize_attentional_regions(original_imgs, M, scores)
129 |
130 | prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
131 | filter = prediction[0].ge(0.1) # keep only labels whose confidence reaches 0.1
132 | prediction_index = torch.mul(prediction[1]+1, filter.type(torch.cuda.LongTensor))
133 | extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
134 | prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
135 | correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)
136 |
137 | # accumulate the per-category label counts
138 | sum_prediction_label += prediction_label.sum(dim=0)
139 | sum_correct_prediction_label += correct_prediction_label.sum(dim=0)
140 | sum_ground_truth_label += target.cpu().sum(dim=0)
141 |
142 | #for i in range(0, target.size(0)):
143 | # print('-----------------')
144 |
# print('ground-truth: ', target[i].nonzero().view(-1)) 145 | # print('prediction: ', prediction_index[i] - 1) 146 | # print('-----------------') 147 | 148 | if batch_num % OUTPUT_INTERVAL == 0: 149 | print(batch_num) 150 | #print('loss %.3f (batch %d)' % (test_loss / (batch_num+1), batch_num+1)) 151 | 152 | #evaluation metrics 153 | o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum()) 154 | o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum()) 155 | of1 = torch.div(2 * o_p * o_r, o_p + o_r) 156 | c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES 157 | c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES 158 | cf1 = torch.div(2 * c_p * c_r, c_p + c_r) 159 | 160 | print('-------------------------------------------------------------') 161 | print('| CP | CR | CF1 | OP | OR | OF1 |') 162 | print('-------------------------------------------------------------') 163 | print('| %.3f | %.3f | %.3f | %.3f | %.3f | %.3f |' % (c_p, c_r, cf1, o_p, o_r, of1)) 164 | print('-------------------------------------------------------------') 165 | 166 | 167 | 168 | # ================================================================== 169 | # Save Parameters of Test 170 | # ================================================================== 171 | #def save(): 172 | # torch.save(RMA.state_dict(), args.pathModelParams) 173 | # print('Checkpoint saved to {}'.format(args.pathModelParams)) 174 | 175 | 176 | # ================================================================== 177 | # Main 178 | # ================================================================== 179 | with torch.no_grad(): 180 | test() 181 | 182 | -------------------------------------------------------------------------------- /visualizeImg.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont, ImageFilter 2 | import matplotlib.pyplot as plt 3 | import matplotlib 4 | import numpy 5 | import random 6 | import torch 7 | import os 8 | from utils import * 9 | from torchvision import transforms as transforms 10 | 11 | RECTANGLE_SIZE = 4 12 | imgId = 0 13 | 14 | # random color 15 | def randomColor(): 16 | return (random.randint(64, 255), random.randint(64, 255), random.randint(64, 255)) 17 | 18 | # input: numpy.ndarray 19 | # topleft_Corner: (x,y) 20 | # width: rectangle width 21 | # height: rectangle height 22 | # size: line size 23 | def drawRectangle(input, x, y, width, height, size, k, className, confidence): 24 | # numpy->PIL 25 | inputImg = Image.fromarray(numpy.uint8(input)) 26 | draw = ImageDraw.Draw(inputImg) 27 | color = randomColor() 28 | font = ImageFont.truetype('Arial.ttf', 24) 29 | for i in range(1, size + 1): 30 | draw.rectangle((x + (size - i), y + (size - i), x + width + i, y + height + i), outline=color) 31 | draw.text((x+size+6,y), k, font = font, fill = color) 32 | draw.text((x+size+30,y), className, font = font, fill = color) 33 | draw.text((x+len(className)*14+36,y), confidence, font = font, fill = color) 34 | return inputImg 35 | 36 | 37 | # M: k*batchsize*2*3 38 | # sourceCoordinate: k*batchsize*2*2 39 | # return: topleft, bottomright corners 40 | def getSourceCoordinate(M): 41 | target = torch.tensor([-1.,1.,1.,-1.,1.,1.]) 42 | target = target.view(3,2) 43 | sourceCoordinate = torch.matmul(M,target) 44 | if imgId == 0: 45 | for batch_index in range(M.size(1)): 46 | print("img", batch_index,"-----------") 47 | for 
k_index in range(M.size(0)): 48 | print("k: ", k_index) 49 | print(M[k_index, batch_index, :, :]) 50 | x0 = sourceCoordinate[:,:,0,0]*256.+256. 51 | x1 = sourceCoordinate[:,:,0,1]*256.+256. 52 | y0 = -(sourceCoordinate[:,:,1,0]*256.-256.) 53 | y1 = -(sourceCoordinate[:,:,1,1]*256.-256.) 54 | sourceCoordinate[:,:,0,0] = torch.min(x0, x1) 55 | sourceCoordinate[:,:,0,1] = torch.max(x0, x1) 56 | sourceCoordinate[:,:,1,0] = torch.min(y0, y1) 57 | sourceCoordinate[:,:,1,1] = torch.max(y0, y1) 58 | return sourceCoordinate 59 | 60 | # sourceCoordinate: k*batchsize*2*2 61 | # rectangleInfo: k*batchsize*4 62 | # return:k*batchsize*[x,y,width,height] 63 | def getPredictedRectangle(M): 64 | coordinate = getSourceCoordinate(M) 65 | rectangleInfo = torch.zeros(coordinate.size(0), coordinate.size(1), 4) 66 | rectangleInfo[:,:,0] = coordinate[:,:,0,0] 67 | rectangleInfo[:,:,1] = coordinate[:,:,1,0] 68 | rectangleInfo[:,:,2] = coordinate[:,:,0,1]-coordinate[:,:,0,0] 69 | rectangleInfo[:,:,3] = coordinate[:,:,1,1]-coordinate[:,:,1,0] 70 | return rectangleInfo 71 | 72 | 73 | def drawPictures(original_img, rectangle, className, confidence): 74 | for batch_index in range(shape(className)[1]): 75 | current_picture = transforms.ToPILImage()(original_img[batch_index]).convert('RGB') 76 | for iterator_index in range(shape(className)[0]): 77 | k = str(iterator_index) 78 | x = rectangle[iterator_index, batch_index, 0] 79 | y = rectangle[iterator_index, batch_index, 1] 80 | width = rectangle[iterator_index, batch_index, 2] 81 | height = rectangle[iterator_index, batch_index, 3] 82 | classname = className[iterator_index, batch_index] 83 | conf_value = str(round(confidence[iterator_index, batch_index].item(),3)) 84 | current_picture = drawRectangle(current_picture, x, y, width, height, RECTANGLE_SIZE, k, classname, conf_value) 85 | global imgId 86 | current_picture.save("./result_visual/visualize_imgs_with_priori/"+str(imgId)+".jpg") 87 | imgId += 1 88 | return 89 | 90 | 91 | def visualize_attentional_regions(original_img, M, scores): 92 | # rectangle 93 | rectangle = getPredictedRectangle(M) 94 | # information 95 | confidence, className = getPredictedInfo(scores) 96 | drawPictures(original_img, rectangle, className, confidence) 97 | return 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /visualizeImg_test0712.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont, ImageFilter 2 | import matplotlib.pyplot as plt 3 | import matplotlib 4 | import numpy 5 | import random 6 | import torch 7 | import os 8 | from utils import * 9 | from torchvision import transforms as transforms 10 | 11 | RECTANGLE_SIZE = 4 12 | imgId = 0 13 | imgTextId = 0 14 | 15 | # random color 16 | def randomColor(): 17 | return (random.randint(64, 255), random.randint(64, 255), random.randint(64, 255)) 18 | 19 | # input: numpy.ndarray 20 | # topleft_Corner: (x,y) 21 | # width: rectangle width 22 | # height: rectangle height 23 | # size: line size 24 | def drawRectangle(input, x, y, width, height, size, k, className, confidence): 25 | # numpy->PIL 26 | inputImg = Image.fromarray(numpy.uint8(input)) 27 | draw = ImageDraw.Draw(inputImg) 28 | color = randomColor() 29 | font = ImageFont.truetype('Arial.ttf', 24) 30 | for i in range(1, size + 1): 31 | draw.rectangle((x + (size - i), y + (size - i), x + width + i, y + height + i), outline=color) 32 | draw.text((x+size+6,y), k, font = font, fill = color) 33 | 
draw.text((x+size+30,y), className, font = font, fill = color) 34 | draw.text((x+len(className)*14+36,y), confidence, font = font, fill = color) 35 | return inputImg 36 | 37 | 38 | # M: k*batchsize*2*3 39 | # sourceCoordinate: k*batchsize*2*2 40 | # return: topleft, bottomright corners 41 | def getSourceCoordinate(M): 42 | target = torch.tensor([-1.,1.,1.,-1.,1.,1.]) 43 | target = target.view(3,2) 44 | sourceCoordinate = torch.matmul(M,target) 45 | if imgId == 0: 46 | for batch_index in range(M.size(1)): 47 | print("img", batch_index,"-----------") 48 | for k_index in range(M.size(0)): 49 | print("k: ", k_index) 50 | print(M[k_index, batch_index, :, :]) 51 | x0 = sourceCoordinate[:,:,0,0]*256.+256. 52 | x1 = sourceCoordinate[:,:,0,1]*256.+256. 53 | y0 = -(sourceCoordinate[:,:,1,0]*256.-256.) 54 | y1 = -(sourceCoordinate[:,:,1,1]*256.-256.) 55 | sourceCoordinate[:,:,0,0] = torch.min(x0, x1) 56 | sourceCoordinate[:,:,0,1] = torch.max(x0, x1) 57 | sourceCoordinate[:,:,1,0] = torch.min(y0, y1) 58 | sourceCoordinate[:,:,1,1] = torch.max(y0, y1) 59 | return sourceCoordinate 60 | 61 | # sourceCoordinate: k*batchsize*2*2 62 | # rectangleInfo: k*batchsize*4 63 | # return:k*batchsize*[x,y,width,height] 64 | def getPredictedRectangle(M): 65 | coordinate = getSourceCoordinate(M) 66 | rectangleInfo = torch.zeros(coordinate.size(0), coordinate.size(1), 4) 67 | rectangleInfo[:,:,0] = coordinate[:,:,0,0] 68 | rectangleInfo[:,:,1] = coordinate[:,:,1,0] 69 | rectangleInfo[:,:,2] = coordinate[:,:,0,1]-coordinate[:,:,0,0] 70 | rectangleInfo[:,:,3] = coordinate[:,:,1,1]-coordinate[:,:,1,0] 71 | return rectangleInfo 72 | 73 | 74 | def drawPictures(original_img, rectangle, className, confidence): 75 | for batch_index in range(shape(className)[1]): 76 | current_picture = transforms.ToPILImage()(original_img[batch_index]).convert('RGB') 77 | for iterator_index in range(shape(className)[0]): 78 | k = str(iterator_index) 79 | x = rectangle[iterator_index, batch_index, 0] 80 | y = rectangle[iterator_index, batch_index, 1] 81 | width = rectangle[iterator_index, batch_index, 2] 82 | height = rectangle[iterator_index, batch_index, 3] 83 | classname = className[iterator_index, batch_index] 84 | conf_value = str(round(confidence[iterator_index, batch_index].item(),3)) 85 | current_picture = drawRectangle(current_picture, x, y, width, height, RECTANGLE_SIZE, k, classname, conf_value) 86 | global imgId 87 | current_picture.save("./result_visual/visualize_imgs_test0712/"+str(imgId)+".jpg") 88 | imgId += 1 89 | return 90 | 91 | 92 | def visualize_attentional_regions(original_img, M, scores): 93 | # rectangle 94 | rectangle = getPredictedRectangle(M) 95 | # information 96 | confidence, className = getPredictedInfo(scores) 97 | drawPictures(original_img, rectangle, className, confidence) 98 | return 99 | 100 | 101 | # write_prediction_target 102 | # ================================================================================ 103 | # input: numpy (one picture) 104 | # prediction: list (prediction classname) 105 | # target: list (target classname) 106 | # return PIL 107 | # ================================================================================ 108 | def write_prediction_target(input, prediction, target): 109 | # numpy->PIL 110 | inputImg = Image.fromarray(numpy.uint8(input)) 111 | draw = ImageDraw.Draw(inputImg) 112 | color = randomColor() 113 | font = ImageFont.truetype('Arial.ttf', 24) 114 | # initialize x coordinates 115 | prediction_x = 10 116 | target_x = 10 117 | # prediction 118 | for p_index in 
range(len(prediction)):
119 | draw.text((prediction_x,10), prediction[p_index], font = font, fill = color)
120 | prediction_x = p_index*10 + prediction_x + len(prediction[p_index])*14
121 | # target
122 | for t_index in range(len(target)):
123 | draw.text((target_x,40), target[t_index], font = font, fill = color)
124 | target_x = t_index*10 + target_x + len(target[t_index])*14
125 |
126 | return inputImg
127 |
128 |
129 |
130 | def visualize_prediction(original_imgs, prediction_list, target_list):
131 | for batch_index in range(original_imgs.shape[0]):
132 | current_picture = transforms.ToPILImage()(original_imgs[batch_index]).convert('RGB')
133 | current_picture = write_prediction_target(current_picture,prediction_list[batch_index],target_list[batch_index])
134 | global imgTextId
135 | current_picture.save("./result_visual/visualize_imgs_test0712/"+str(imgTextId)+".jpg")
136 | imgTextId += 1
137 | return
--------------------------------------------------------------------------------
/visualize_test0712.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.optim as optim
3 | import torch.utils.data
4 | import torch.backends.cudnn as cudnn
5 | import torchvision
6 | from torchvision import transforms as transforms
7 | import numpy as np
8 | import visdom
9 | import torch.nn.functional as F
10 | from CocoDetection import CocoDetection
11 | # from visualizeImg import *
12 | from visualizeImg_test0712 import *
13 | #from PIL import Image
14 | #import torch
15 |
16 | import argparse
17 | import os
18 |
19 | from models.RMA_module_with_priori import RMA_module
20 | from models.loss_with_priori import loss_function
21 | from utils import get_target_transform as target_trans
22 |
23 |
24 | # GPU setting
25 | os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")
26 |
27 |
28 | # ==================================================================
29 | # Constants
30 | # ==================================================================
31 | EPOCH = 45 # number of passes over the training set
32 | BATCH_SIZE = 16 # number of images in each mini-batch
33 | N = 512 # size of input images (512 or 640)
34 | TOPK = 3 # top k highest-ranked labels
35 | GPU_IN_USE = torch.cuda.is_available() # whether using GPU
36 | DIR_TEST_IMAGES = '../dataset/val2017/'
37 | PATH_TEST_ANNFILE = '../dataset/annotations/instances_val2017.json'
38 | PATH_MODEL_PARAMS = './params/params_with_priori.pkl'
39 | NUM_CATEGORIES = 80
40 | OUTPUT_INTERVAL = 100
41 |
42 |
43 | # ==================================================================
44 | # Parser Initialization
45 | # ==================================================================
46 | parser = argparse.ArgumentParser(description='Pytorch Implementation of ICCV2017_AttentionImageClass')
47 | parser.add_argument('--testBatchSize', default=BATCH_SIZE, type=int, help='testing batch size')
48 | parser.add_argument('--pathModelParams', default=PATH_MODEL_PARAMS, type=str, help='path of model parameters')
49 | parser.add_argument('--loadModel', default=True, type=bool, help='load model parameters')
50 | args = parser.parse_args()
51 |
52 |
53 | # ==================================================================
54 | # Prepare Dataset (test)
55 | # ==================================================================
56 | print('***** Prepare Data ******')
57 |
58 | # transforms of test dataset (see the note below on the missing Normalize)
59 | #normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
60 | # std=[0.229, 0.224, 0.225])
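# note: Normalize is deliberately left out of test_transforms below -- the custom
# CocoDetection loader returns the raw, unnormalized pixels as a third tensor
# (original_imgs in test() further down), which is what the drawing code saves to disk.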
61 | test_transforms = transforms.Compose([
62 | transforms.Resize((N, N)),
63 | transforms.ToTensor(),
64 | ])
65 |
66 | test_dataset = CocoDetection(root=DIR_TEST_IMAGES, annFile=PATH_TEST_ANNFILE,
67 | transform=test_transforms, target_transform=target_trans)
68 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.testBatchSize, shuffle=False, num_workers=2)
69 | print('Data Preparation : Finished')
70 |
71 | # ==================================================================
72 | # Prepare Model
73 | # ==================================================================
74 | print('\n***** Prepare Model *****')
75 |
76 | vgg16 = torchvision.models.vgg16(pretrained=True)
77 | extract_features = vgg16.features
78 | RMA = RMA_module(lstm_input_size=14, lstm_hidden_size=4096, zk_size=4096)
79 | if args.loadModel:
80 | RMA.load_state_dict(torch.load(args.pathModelParams))
81 |
82 | if GPU_IN_USE:
83 | print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES'])
84 | print('cuda: move all model parameters and buffers to the GPU')
85 | extract_features.cuda()
86 | RMA.cuda()
87 | cudnn.benchmark = True
88 |
89 | print('Model Preparation : Finished')
90 |
91 |
92 | # Test
93 | # ================================================================================
94 | # data: [torch.cuda.FloatTensor of size [batch_size, 3, N, N] N=512/640]
95 | # target: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
96 | # output: [torch.cuda.FloatTensor of size [batch_size, num_categories]]
97 | # prediction: [
98 | # [torch.cuda.FloatTensor of size [batch_size, TOPK] (TOPK)],
99 | # [torch.cuda.LongTensor of size [batch_size, TOPK] (index of TOPK)]
100 | # ]
101 | # ================================================================================
102 | def test():
103 | print('test:')
104 | #RMA.eval() # set the module in evaluation mode
105 |
106 | sum_prediction_label = torch.zeros(1, 80) + 1e-6
107 | sum_correct_prediction_label = torch.zeros(1, 80)
108 | sum_ground_truth_label = torch.zeros(1, 80)
109 |
110 | for batch_num, (data, target, original_imgs) in enumerate(test_loader):
111 | if target.sum() == 0:
112 | continue
113 | nonzero_rows = torch.nonzero(target.sum(dim=1)).view(-1) # compute the indices once, before target is filtered
114 | target, data = target.index_select(0, nonzero_rows), data.index_select(0, nonzero_rows)
115 | original_imgs = original_imgs.index_select(0, nonzero_rows)
116 |
117 | #print('original_imgs ', original_imgs.size())
118 |
119 | if GPU_IN_USE:
120 | data, target = data.cuda(), target.cuda() # set up GPU Tensor
121 |
122 | f_I = extract_features(data)
123 | output, M, scores = RMA(f_I, return_whole_scores=True)
124 |
125 | #total_thetas.append(M)
126 | #total_scores.append(scores)
127 |
128 | #visualize_attentional_regions(original_imgs, M[1:, :, :, :], scores)
129 | #visualize_attentional_regions(original_imgs, M, scores)
130 |
131 | prediction = torch.topk(F.softmax(output, dim=1), 10, dim=1)
132 | filter = prediction[0].ge(0.1) # keep only labels whose confidence reaches 0.1
133 | prediction_index = torch.mul(prediction[1]+1, filter.type(torch.cuda.LongTensor))
134 | extend_eye_mat = torch.cat((torch.zeros(1, 80), torch.eye(80)), 0)
135 | prediction_label = extend_eye_mat[prediction_index.view(-1)].view(-1, 10, 80).sum(dim=1)
136 | correct_prediction_label = (target.cpu().byte() & prediction_label.byte()).type(torch.FloatTensor)
137 |
138 | # accumulate the per-category label counts
139 | sum_prediction_label += prediction_label.sum(dim=0)
140 | sum_correct_prediction_label +=
correct_prediction_label.sum(dim=0)
141 | sum_ground_truth_label += target.cpu().sum(dim=0)
142 |
143 | # visualization (not yet tested)
144 | prediction_list = id2label(prediction_index)[0].tolist() # label names for the first image of the batch
145 | target_list = id2label(target.nonzero()+1)[0].tolist()
146 | print("prediction_list: ", prediction_list)
147 | print("target_list: ", target_list)
148 | # visualize_prediction(original_imgs, prediction_list, target_list)
149 |
150 | #for i in range(0, target.size(0)):
151 | # print('-----------------')
152 | # print('ground-truth: ', target[i].nonzero().view(-1))
153 | # print('prediction: ', prediction_index[i] - 1)
154 | # print('-----------------')
155 |
156 | if batch_num % OUTPUT_INTERVAL == 0:
157 | print(batch_num)
158 | #print('loss %.3f (batch %d)' % (test_loss / (batch_num+1), batch_num+1))
159 |
160 | # evaluation metrics
161 | o_p = torch.div(sum_correct_prediction_label.sum(), sum_prediction_label.sum())
162 | o_r = torch.div(sum_correct_prediction_label.sum(), sum_ground_truth_label.sum())
163 | of1 = torch.div(2 * o_p * o_r, o_p + o_r)
164 | c_p = (torch.div(sum_correct_prediction_label, sum_prediction_label)).sum() / NUM_CATEGORIES
165 | c_r = (torch.div(sum_correct_prediction_label, sum_ground_truth_label)).sum() / NUM_CATEGORIES
166 | cf1 = torch.div(2 * c_p * c_r, c_p + c_r)
167 |
168 | print('-------------------------------------------------------------')
169 | print('| CP | CR | CF1 | OP | OR | OF1 |')
170 | print('-------------------------------------------------------------')
171 | print('| %.3f | %.3f | %.3f | %.3f | %.3f | %.3f |' % (c_p, c_r, cf1, o_p, o_r, of1))
172 | print('-------------------------------------------------------------')
173 |
174 |
175 |
176 | # ==================================================================
177 | # Save Parameters of Test
178 | # ==================================================================
179 | #def save():
180 | # torch.save(RMA.state_dict(), args.pathModelParams)
181 | # print('Checkpoint saved to {}'.format(args.pathModelParams))
182 |
183 |
184 | # ==================================================================
185 | # Main
186 | # ==================================================================
187 | with torch.no_grad():
188 | test()
189 |
190 |
--------------------------------------------------------------------------------
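A note on the box decoding used by `getSourceCoordinate` in the two visualizeImg modules: each attentional region comes back from the RMA module as a 2x3 affine matrix `M` over the normalized [-1, 1] sampling grid, and the drawing code maps two opposite grid corners through `M`, then rescales by N/2 = 256 (for N = 512) to get pixel coordinates. A worked sketch with a made-up theta (the values are illustrative, not from a trained model):

```python
import torch

# Hypothetical attention matrix: 0.5x zoom, shifted right (tx) and down (ty).
theta = torch.tensor([[0.5, 0.0, 0.2],
                      [0.0, 0.5, -0.3]])

# Two opposite corners of the [-1, 1]^2 grid in homogeneous coordinates,
# laid out exactly like the `target` tensor in getSourceCoordinate().
corners = torch.tensor([[-1., 1.],   # x components
                        [ 1., -1.],  # y components
                        [ 1., 1.]])  # homogeneous ones

mapped = theta @ corners             # 2x2, one mapped corner per column
x = mapped[0] * 256. + 256.          # normalized x -> pixel column
y = -(mapped[1] * 256. - 256.)       # normalized y -> pixel row (y axis flips)

# Sort into (x, y, width, height), as getPredictedRectangle() does.
x0, x1 = x.min().item(), x.max().item()
y0, y1 = y.min().item(), y.max().item()
print(x0, y0, x1 - x0, y1 - y0)      # ~179.2 204.8 256.0 256.0
```

With a scale of 0.5 the recovered box is half the 512-pixel input on each side, which matches the 256-pixel width and height printed above.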