├── .gitignore
├── compare.png
├── test
│   ├── test.jpg
│   └── detection.jpg
├── __pycache__
│   └── backbone.cpython-36.pyc
├── utils
│   ├── __pycache__
│   │   ├── timer.cpython-36.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── visual.cpython-36.pyc
│   │   ├── apply_prior.cpython-36.pyc
│   │   ├── visual_hico.cpython-36.pyc
│   │   ├── vsrl_eval.cpython-36.pyc
│   │   └── vsrl_eval.cpython-37.pyc
│   ├── sync_batchnorm
│   │   ├── __pycache__
│   │   │   ├── comm.cpython-36.pyc
│   │   │   ├── comm.cpython-37.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── batchnorm.cpython-36.pyc
│   │   │   ├── batchnorm.cpython-37.pyc
│   │   │   ├── replicate.cpython-36.pyc
│   │   │   └── replicate.cpython-37.pyc
│   │   ├── __init__.py
│   │   ├── unittest.py
│   │   ├── batchnorm_reimpl.py
│   │   ├── replicate.py
│   │   ├── comm.py
│   │   └── batchnorm.py
│   ├── timer.py
│   ├── apply_prior.py
│   ├── visual_hico.py
│   └── visual.py
├── efficientdet
│   ├── __pycache__
│   │   ├── loss.cpython-36.pyc
│   │   ├── model.cpython-36.pyc
│   │   ├── model.cpython-37.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── dataset.cpython-36.pyc
│   │   ├── hoi_model.cpython-36.pyc
│   │   ├── help_function.cpython-36.pyc
│   │   ├── vcoco_dataset.cpython-36.pyc
│   │   ├── vcoco_dataset.cpython-37.pyc
│   │   └── hico_det_dataset.cpython-36.pyc
│   ├── config.py
│   ├── hoi_model.py
│   ├── help_function.py
│   ├── utils.py
│   ├── dataset.py
│   ├── hico_det_dataset.py
│   └── model.py
├── efficientnet
│   ├── __pycache__
│   │   ├── model.cpython-36.pyc
│   │   ├── model.cpython-37.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── __init__.cpython-37.pyc
│   │   ├── utils_extra.cpython-36.pyc
│   │   └── utils_extra.cpython-37.pyc
│   ├── __init__.py
│   ├── utils_extra.py
│   ├── model.py
│   └── utils.py
├── projects
│   ├── hico-det.yml
│   └── vcoco.yml
├── backbone.py
├── README.md
├── coco_eval.py
├── Generate_HICO_detection.py
├── demo.py
├── test_vcoco.py
└── test_hico-det.py
/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | weights/ 4 | 5 | -------------------------------------------------------------------------------- /compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/compare.png -------------------------------------------------------------------------------- /test/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/test/test.jpg -------------------------------------------------------------------------------- /test/detection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/test/detection.jpg
-------------------------------------------------------------------------------- /efficientnet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.1" 2 | from .model import EfficientNet 3 | from .utils import ( 4 | GlobalParams, 5 | BlockArgs, 6 | BlockDecoder, 7 | efficientnet, 8 | get_model_params, 9 | ) 10 | 11 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : __init__.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d 12 | from .batchnorm import patch_sync_batchnorm, convert_model 13 | from .replicate import DataParallelWithCallback, patch_replication_callback 14 | -------------------------------------------------------------------------------- /projects/hico-det.yml: -------------------------------------------------------------------------------- 1 | project_name: hico-det_final # also the folder name of the dataset under the data_path folder 2 | train_set: trainval 3 | val_set: test 4 | num_gpus: 8 5 | 6 | # mean and std in RGB order; this part should remain unchanged as long as your dataset is similar to coco. 7 | mean: [0.485, 0.456, 0.406] 8 | std: [0.229, 0.224, 0.225] 9 | 10 | # these are coco anchors, change them if necessary 11 | anchors_scales: '[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]' 12 | anchors_ratios: '[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]' 13 | 14 | # must match your dataset's category_id. 15 | # category_id is one-indexed, 16 | # for example, the index of 'car' here is 2, while its category_id is 3 17 | 18 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/unittest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : unittest.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License.
10 | 11 | import unittest 12 | import torch 13 | 14 | 15 | class TorchTestCase(unittest.TestCase): 16 | def assertTensorClose(self, x, y): 17 | adiff = float((x - y).abs().max()) 18 | if (y == 0).all(): 19 | rdiff = 'NaN' 20 | else: 21 | rdiff = float((adiff / y).abs().max()) 22 | 23 | message = ( 24 | 'Tensor close check failed\n' 25 | 'adiff={}\n' 26 | 'rdiff={}\n' 27 | ).format(adiff, rdiff) 28 | self.assertTrue(torch.allclose(x, y), message) 29 | 30 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /projects/vcoco.yml: -------------------------------------------------------------------------------- 1 | project_name: vcoco_new # also the folder name of the dataset under the data_path folder 2 | train_set: trainval 3 | val_set: test 4 | num_gpus: 4 5 | 6 | # mean and std in RGB order; this part should remain unchanged as long as your dataset is similar to coco. 7 | mean: [0.485, 0.456, 0.406] 8 | std: [0.229, 0.224, 0.225] 9 | 10 | # these are coco anchors, change them if necessary 11 | anchors_scales: '[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]' 12 | anchors_ratios: '[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]' 13 | 14 | # must match your dataset's category_id.
15 | # category_id is one_indexed, 16 | # for example, index of 'car' here is 2, while category_id of is 3 17 | obj_list: "['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 18 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 19 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 20 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 21 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 22 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 23 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 24 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 25 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier','toothbrush']" 26 | 27 | union_action_list: "[('hold', 'obj'), ('sit', 'instr'), ('ride', 'instr'), ('look', 'obj'), 28 | ('hit', 'instr'), ('hit', 'obj'), ('eat', 'obj'), ('eat', 'instr'), 29 | ('jump', 'instr'), ('lay', 'instr'), ('talk_on_phone', 'instr'), 30 | ('carry', 'obj'), ('throw', 'obj'), ('catch', 'obj'), ('cut', 'instr'), 31 | ('cut', 'obj'), ('work_on_computer', 'instr'), ('ski', 'instr'), 32 | ('surf', 'instr'), ('skateboard', 'instr'), ('drink', 'instr'), 33 | ('kick', 'obj'), ('point', 'instr'), ('read', 'obj'), ('snowboard', 'instr')]" 34 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/batchnorm_reimpl.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # File : batchnorm_reimpl.py 4 | # Author : acgtyrant 5 | # Date : 11/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.init as init 14 | 15 | __all__ = ['BatchNorm2dReimpl'] 16 | 17 | 18 | class BatchNorm2dReimpl(nn.Module): 19 | """ 20 | A re-implementation of batch normalization, used for testing the numerical 21 | stability. 
22 | 23 | Author: acgtyrant 24 | See also: 25 | https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14 26 | """ 27 | def __init__(self, num_features, eps=1e-5, momentum=0.1): 28 | super().__init__() 29 | 30 | self.num_features = num_features 31 | self.eps = eps 32 | self.momentum = momentum 33 | self.weight = nn.Parameter(torch.empty(num_features)) 34 | self.bias = nn.Parameter(torch.empty(num_features)) 35 | self.register_buffer('running_mean', torch.zeros(num_features)) 36 | self.register_buffer('running_var', torch.ones(num_features)) 37 | self.reset_parameters() 38 | 39 | def reset_running_stats(self): 40 | self.running_mean.zero_() 41 | self.running_var.fill_(1) 42 | 43 | def reset_parameters(self): 44 | self.reset_running_stats() 45 | init.uniform_(self.weight) 46 | init.zeros_(self.bias) 47 | 48 | def forward(self, input_): 49 | batchsize, channels, height, width = input_.size() 50 | numel = batchsize * height * width 51 | input_ = input_.permute(1, 0, 2, 3).contiguous().view(channels, numel) 52 | sum_ = input_.sum(1) 53 | sum_of_square = input_.pow(2).sum(1) 54 | mean = sum_ / numel 55 | sumvar = sum_of_square - sum_ * mean 56 | 57 | self.running_mean = ( 58 | (1 - self.momentum) * self.running_mean 59 | + self.momentum * mean.detach() 60 | ) 61 | unbias_var = sumvar / (numel - 1) 62 | self.running_var = ( 63 | (1 - self.momentum) * self.running_var 64 | + self.momentum * unbias_var.detach() 65 | ) 66 | 67 | bias_var = sumvar / numel 68 | inv_std = 1 / (bias_var + self.eps).pow(0.5) 69 | output = ( 70 | (input_ - mean.unsqueeze(1)) * inv_std.unsqueeze(1) * 71 | self.weight.unsqueeze(1) + self.bias.unsqueeze(1)) 72 | 73 | return output.view(channels, batchsize, height, width).permute(1, 0, 2, 3).contiguous() 74 | 75 | -------------------------------------------------------------------------------- /efficientdet/config.py: -------------------------------------------------------------------------------- 1 | COCO_CLASSES = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", 2 | "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", 3 | "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", 4 | "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", 5 | "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", 6 | "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", 7 | "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", 8 | "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", 9 | "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", 10 | "teddy bear", "hair drier", "toothbrush"] 11 | 12 | colors = [(39, 129, 113), (164, 80, 133), (83, 122, 114), (99, 81, 172), (95, 56, 104), (37, 84, 86), (14, 89, 122), 13 | (80, 7, 65), (10, 102, 25), (90, 185, 109), (106, 110, 132), (169, 158, 85), (188, 185, 26), (103, 1, 17), 14 | (82, 144, 81), (92, 7, 184), (49, 81, 155), (179, 177, 69), (93, 187, 158), (13, 39, 73), (12, 50, 60), 15 | (16, 179, 33), (112, 69, 165), (15, 139, 63), (33, 191, 159), (182, 173, 32), (34, 113, 133), (90, 135, 34), 16 | (53, 34, 86), (141, 35, 190), (6, 171, 8), (118, 76, 112), (89, 60, 55), (15, 54, 88), (112, 75, 181), 17 | (42, 147, 38), (138, 52, 63), (128, 65, 149), (106, 103, 24), (168, 33, 45), (28, 136, 135), (86, 91, 
108), 18 | (52, 11, 76), (142, 6, 189), (57, 81, 168), (55, 19, 148), (182, 101, 89), (44, 65, 179), (1, 33, 26), 19 | (122, 164, 26), (70, 63, 134), (137, 106, 82), (120, 118, 52), (129, 74, 42), (182, 147, 112), (22, 157, 50), 20 | (56, 50, 20), (2, 22, 177), (156, 100, 106), (21, 35, 42), (13, 8, 121), (142, 92, 28), (45, 118, 33), 21 | (105, 118, 30), (7, 185, 124), (46, 34, 146), (105, 184, 169), (22, 18, 5), (147, 71, 73), (181, 64, 91), 22 | (31, 39, 184), (164, 179, 33), (96, 50, 18), (95, 15, 106), (113, 68, 54), (136, 116, 112), (119, 139, 130), 23 | (31, 139, 34), (66, 6, 127), (62, 39, 2), (49, 99, 180), (49, 119, 155), (153, 50, 183), (125, 38, 3), 24 | (129, 87, 143), (49, 87, 40), (128, 62, 120), (73, 85, 148), (28, 144, 118), (29, 9, 24), (175, 45, 108), 25 | (81, 175, 64), (178, 19, 157), (74, 188, 190), (18, 114, 2), (62, 128, 96), (21, 3, 150), (0, 6, 95), 26 | (2, 20, 184), (122, 37, 185)] 27 | -------------------------------------------------------------------------------- /efficientdet/hoi_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops.boxes import nms as nms_torch 4 | 5 | from efficientnet import EfficientNet as EffNet 6 | from efficientnet.utils import MemoryEfficientSwish, Swish 7 | from efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding 8 | 9 | from efficientdet.model import Regressor, Classifier, SeparableConvBlock 10 | 11 | 12 | class Union_Branch(nn.Module): 13 | def __init__(self, in_channels, num_anchors, num_layers, num_union_classes, num_obj_classes): 14 | super(Union_Branch, self).__init__() 15 | self.num_layers = num_layers 16 | self.num_anchors = num_anchors 17 | self.in_channels = in_channels 18 | 19 | self.num_union_classes = num_union_classes 20 | self.num_obj_classes = num_obj_classes 21 | 22 | self.action_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 23 | num_classes=self.num_union_classes, num_layers=self.num_layers) 24 | 25 | self.union_sub_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 26 | self.union_obj_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 27 | 28 | 29 | def forward(self, inputs): 30 | union_act_cls = self.action_classifier(inputs) 31 | union_sub_reg = self.union_sub_regressor(inputs) 32 | union_obj_reg = self.union_obj_regressor(inputs) 33 | 34 | return union_act_cls, union_sub_reg, union_obj_reg 35 | 36 | 37 | class Instance_Branch(nn.Module): 38 | def __init__(self, in_channels, num_anchors, num_layers, num_inst_classes, num_obj_classes): 39 | super(Instance_Branch, self).__init__() 40 | self.num_layers = num_layers 41 | self.num_anchors = num_anchors 42 | self.in_channels = in_channels 43 | 44 | self.num_inst_classes = num_inst_classes 45 | self.num_obj_classes = num_obj_classes 46 | 47 | self.action_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 48 | num_classes=self.num_inst_classes, num_layers=self.num_layers) 49 | 50 | self.object_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 51 | num_classes=self.num_obj_classes, num_layers=self.num_layers) 52 | 53 | self.object_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 54 | 55 | def forward(self, inputs): 56 | inst_act_cls = 
self.action_classifier(inputs) 57 | inst_obj_cls = self.object_classifier(inputs) 58 | inst_bbox_reg = self.object_regressor(inputs) 59 | 60 | return inst_act_cls, inst_obj_cls, inst_bbox_reg 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /efficientnet/utils_extra.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | import math 4 | 5 | from torch import nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Conv2dStaticSamePadding(nn.Module): 10 | """ 11 | created by Zylo117 12 | The real keras/tensorflow conv2d with same padding 13 | """ 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, groups=1, dilation=1, **kwargs): 16 | super().__init__() 17 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 18 | bias=bias, groups=groups) 19 | self.stride = self.conv.stride 20 | self.kernel_size = self.conv.kernel_size 21 | self.dilation = self.conv.dilation 22 | 23 | if isinstance(self.stride, int): 24 | self.stride = [self.stride] * 2 25 | elif len(self.stride) == 1: 26 | self.stride = [self.stride[0]] * 2 27 | 28 | if isinstance(self.kernel_size, int): 29 | self.kernel_size = [self.kernel_size] * 2 30 | elif len(self.kernel_size) == 1: 31 | self.kernel_size = [self.kernel_size[0]] * 2 32 | 33 | def forward(self, x): 34 | h, w = x.shape[-2:] 35 | 36 | h_step = math.ceil(w / self.stride[1]) 37 | v_step = math.ceil(h / self.stride[0]) 38 | 39 | h_cover_len = self.stride[1] * (h_step - 1) + 1 + (self.kernel_size[1] - 1) 40 | v_cover_len = self.stride[0] * (v_step - 1) + 1 + (self.kernel_size[0] - 1) 41 | 42 | extra_h = h_cover_len - w 43 | extra_v = v_cover_len - h 44 | 45 | left = extra_h // 2 46 | right = extra_h - left 47 | top = extra_v // 2 48 | bottom = extra_v - top 49 | 50 | x = F.pad(x, [left, right, top, bottom]) 51 | 52 | x = self.conv(x) 53 | return x 54 | 55 | 56 | class MaxPool2dStaticSamePadding(nn.Module): 57 | """ 58 | created by Zylo117 59 | The real keras/tensorflow MaxPool2d with same padding 60 | """ 61 | 62 | def __init__(self, *args, **kwargs): 63 | super().__init__() 64 | self.pool = nn.MaxPool2d(*args, **kwargs) 65 | self.stride = self.pool.stride 66 | self.kernel_size = self.pool.kernel_size 67 | 68 | if isinstance(self.stride, int): 69 | self.stride = [self.stride] * 2 70 | elif len(self.stride) == 1: 71 | self.stride = [self.stride[0]] * 2 72 | 73 | if isinstance(self.kernel_size, int): 74 | self.kernel_size = [self.kernel_size] * 2 75 | elif len(self.kernel_size) == 1: 76 | self.kernel_size = [self.kernel_size[0]] * 2 77 | 78 | def forward(self, x): 79 | h, w = x.shape[-2:] 80 | 81 | h_step = math.ceil(w / self.stride[1]) 82 | v_step = math.ceil(h / self.stride[0]) 83 | h_cover_len = self.stride[1] * (h_step - 1) + 1 + (self.kernel_size[1] - 1) 84 | v_cover_len = self.stride[0] * (v_step - 1) + 1 + (self.kernel_size[0] - 1) 85 | 86 | extra_h = h_cover_len - w 87 | extra_v = v_cover_len - h 88 | 89 | left = extra_h // 2 90 | right = extra_h - left 91 | top = extra_v // 2 92 | bottom = extra_v - top 93 | 94 | x = F.pad(x, [left, right, top, bottom]) 95 | 96 | x = self.pool(x) 97 | return x 98 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/replicate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : replicate.py 3 | # Author : Jiayuan Mao 4 | # 
Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import functools 12 | 13 | from torch.nn.parallel.data_parallel import DataParallel 14 | 15 | __all__ = [ 16 | 'CallbackContext', 17 | 'execute_replication_callbacks', 18 | 'DataParallelWithCallback', 19 | 'patch_replication_callback' 20 | ] 21 | 22 | 23 | class CallbackContext(object): 24 | pass 25 | 26 | 27 | def execute_replication_callbacks(modules): 28 | """ 29 | Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. 30 | 31 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 32 | 33 | Note that, as all modules are isomorphism, we assign each sub-module with a context 34 | (shared among multiple copies of this module on different devices). 35 | Through this context, different copies can share some information. 36 | 37 | We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback 38 | of any slave copies. 39 | """ 40 | master_copy = modules[0] 41 | nr_modules = len(list(master_copy.modules())) 42 | ctxs = [CallbackContext() for _ in range(nr_modules)] 43 | 44 | for i, module in enumerate(modules): 45 | for j, m in enumerate(module.modules()): 46 | if hasattr(m, '__data_parallel_replicate__'): 47 | m.__data_parallel_replicate__(ctxs[j], i) 48 | 49 | 50 | class DataParallelWithCallback(DataParallel): 51 | """ 52 | Data Parallel with a replication callback. 53 | 54 | An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by 55 | original `replicate` function. 56 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 57 | 58 | Examples: 59 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 60 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 61 | # sync_bn.__data_parallel_replicate__ will be invoked. 62 | """ 63 | 64 | def replicate(self, module, device_ids): 65 | modules = super(DataParallelWithCallback, self).replicate(module, device_ids) 66 | execute_replication_callbacks(modules) 67 | return modules 68 | 69 | 70 | def patch_replication_callback(data_parallel): 71 | """ 72 | Monkey-patch an existing `DataParallel` object. Add the replication callback. 73 | Useful when you have customized `DataParallel` implementation. 
74 | 75 | Examples: 76 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 77 | > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) 78 | > patch_replication_callback(sync_bn) 79 | # this is equivalent to 80 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 81 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 82 | """ 83 | 84 | assert isinstance(data_parallel, DataParallel) 85 | 86 | old_replicate = data_parallel.replicate 87 | 88 | @functools.wraps(old_replicate) 89 | def new_replicate(module, device_ids): 90 | modules = old_replicate(module, device_ids) 91 | execute_replication_callbacks(modules) 92 | return modules 93 | 94 | data_parallel.replicate = new_replicate 95 | -------------------------------------------------------------------------------- /utils/apply_prior.py: -------------------------------------------------------------------------------- 1 | def apply_prior(scores, obj_cls): 2 | assert len(scores) == 25 3 | if obj_cls != 35: # not a snowboard, then the action is impossible to be snowboard 4 | scores[24] = 0 5 | 6 | if obj_cls != 83: # not a book, then the action is impossible to be read 7 | scores[23] = 0 8 | 9 | if obj_cls != 36: # not a sports ball, then the action is impossible to be kick 10 | scores[21] = 0 11 | 12 | if (obj_cls != 45) and (obj_cls != 43) and (obj_cls != 46) and (obj_cls != 50): # not 'wine glass', 'bottle', 'cup', 'bowl', then the action is impossible to be drink 13 | scores[20] = 0 14 | 15 | if obj_cls != 40: # not a skateboard, then the action is impossible to be skateboard 16 | scores[19] = 0 17 | 18 | if obj_cls != 41: # not a surfboard, then the action is impossible to be surfboard 19 | scores[18] = 0 20 | 21 | if obj_cls != 34: # not a ski, then the action is impossible to be ski 22 | scores[17] = 0 23 | 24 | if obj_cls != 72: # not a laptop, then the action is impossible to be work on computer 25 | scores[16] = 0 26 | 27 | if (obj_cls != 86) and (obj_cls != 47) and (obj_cls != 48): # not 'scissors', 'fork', 'knife', then the action is impossible to be cur instr 28 | scores[14] = 0 29 | 30 | if (obj_cls != 36) and (obj_cls != 33): # not 'sports ball', 'frisbee', then the action is impossible to be throw and catch 31 | scores[12] = 0 32 | scores[13] = 0 33 | 34 | if obj_cls != 76: # not a cellphone, then the action is impossible to be talk_on_phone 35 | scores[10] = 0 36 | 37 | if (obj_cls != 14) and (obj_cls != 66) and (obj_cls != 69) and (obj_cls != 64) and (obj_cls != 62) and (obj_cls != 61): # not 'bench', 'dining table', 'toilet', 'bed', 'couch', 'chair', then the action is impossible to be lay 38 | scores[9] = 0 39 | 40 | if (obj_cls != 35) and (obj_cls != 34) and (obj_cls != 40) and (obj_cls != 41): # not 'snowboard', 'skis', 'skateboard', 'surfboard', then the action is impossible to be jump 41 | scores[8] = 0 42 | 43 | if (obj_cls != 51) and (obj_cls != 52) and (obj_cls != 53) and (obj_cls != 54) and (obj_cls != 55) and (obj_cls != 56) and (obj_cls != 57) and (obj_cls != 58) and (obj_cls != 59) and (obj_cls != 60): # not ''banana', 'apple', 'sandwich', 'orange', 'carrot', 'broccoli', 'hot dog', 'pizza', 'cake', 'donut', then the action is impossible to be eat_obj 44 | scores[6] = 0 45 | 46 | if (obj_cls != 47) and (obj_cls != 48) and (obj_cls != 49): # not 'fork', 'knife', 'spoon', then the action is impossible to be eat_instr 47 | scores[7] = 0 48 | 49 | if (obj_cls != 42) and (obj_cls != 38): # not 'tennis racket', 'baseball bat', then the action is impossible to be hit_instr 50 | scores[4] 
= 0 51 | 52 | if (obj_cls != 36): # not 'sports ball, then the action is impossible to be hit_obj 53 | scores[5] = 0 54 | 55 | if (obj_cls != 1) and (obj_cls != 3) and (obj_cls != 5) and (obj_cls != 7) and (obj_cls != 8) and (obj_cls != 6) and (obj_cls != 4) and (obj_cls != 2) and (obj_cls != 18) and (obj_cls != 21): # not 'bicycle', 'motorcycle', 'bus', 'truck', 'boat', 'train', 'airplane', 'car', 'horse', 'elephant', then the action is impossible to be ride 56 | scores[2] = 0 57 | 58 | if (obj_cls != 1) and (obj_cls != 3) and (obj_cls != 18) and (obj_cls != 21) and (obj_cls != 14) and (obj_cls != 61) and (obj_cls != 62) and (obj_cls != 64) and (obj_cls != 69) and (obj_cls != 66) and (obj_cls != 32) and (obj_cls != 30) and (obj_cls != 26): # not 'bicycle', 'motorcycle', 'horse', 'elephant', 'bench', 'chair', 'couch', 'bed', 'toilet', 'dining table', 'suitcase', 'handbag', 'backpack', then the action is impossible to be sit 59 | scores[1] = 0 60 | 61 | if (obj_cls == 0): # "person", then the action is impossible to be cut_obj 62 | scores[15] = 0 63 | 64 | return scores 65 | -------------------------------------------------------------------------------- /backbone.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from efficientdet.model import BiFPN, Regressor, Classifier, EfficientNet 7 | from efficientdet.hoi_model import Union_Branch, Instance_Branch 8 | from efficientdet.utils import Anchors 9 | 10 | 11 | class EfficientDetBackbone(nn.Module): 12 | def __init__(self, num_classes=80, num_union_classes=25, num_inst_classes=51, compound_coef=0, load_weights=False, **kwargs): 13 | super(EfficientDetBackbone, self).__init__() 14 | self.compound_coef = compound_coef 15 | 16 | self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6] 17 | self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384] 18 | self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8] 19 | self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 20 | self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5] 21 | self.anchor_scale = [4., 4., 4., 4., 4., 4., 4., 5.] 22 | self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]) 23 | self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])) 24 | conv_channel_coef = { 25 | # the channels of P3/P4/P5. 
26 | 0: [40, 112, 320], 27 | 1: [40, 112, 320], 28 | 2: [48, 120, 352], 29 | 3: [48, 136, 384], 30 | 4: [56, 160, 448], 31 | 5: [64, 176, 512], 32 | 6: [72, 200, 576], 33 | 7: [72, 200, 576], 34 | } 35 | 36 | num_anchors = len(self.aspect_ratios) * self.num_scales 37 | 38 | self.bifpn = nn.Sequential( 39 | *[BiFPN(self.fpn_num_filters[self.compound_coef], 40 | conv_channel_coef[compound_coef], 41 | True if _ == 0 else False, 42 | attention=True if compound_coef < 6 else False) 43 | for _ in range(self.fpn_cell_repeats[compound_coef])]) 44 | 45 | self.num_classes = num_classes 46 | self.num_union_classes = num_union_classes 47 | self.num_inst_classes = num_inst_classes 48 | 49 | self.union_branch = Union_Branch(in_channels = self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors, 50 | num_layers=self.box_class_repeats[self.compound_coef], 51 | num_union_classes=num_union_classes, num_obj_classes=num_classes) 52 | self.instance_branch = Instance_Branch(in_channels = self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors, 53 | num_layers=self.box_class_repeats[self.compound_coef], 54 | num_inst_classes=num_inst_classes, num_obj_classes=num_classes) 55 | 56 | self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef], **kwargs) 57 | 58 | self.backbone_net = EfficientNet(self.backbone_compound_coef[compound_coef], load_weights) 59 | 60 | def freeze_bn(self): 61 | for m in self.modules(): 62 | if isinstance(m, nn.BatchNorm2d): 63 | m.eval() 64 | 65 | def forward(self, inputs): 66 | max_size = inputs.shape[-1] 67 | 68 | _, p3, p4, p5 = self.backbone_net(inputs) 69 | 70 | features = (p3, p4, p5) 71 | features = self.bifpn(features) 72 | 73 | union_act_cls, union_sub_reg, union_obj_reg = self.union_branch(features) 74 | inst_act_cls, inst_obj_cls, inst_bbox_reg = self.instance_branch(features) 75 | 76 | anchors = self.anchors(inputs, inputs.dtype) 77 | 78 | return features, union_act_cls, union_sub_reg, union_obj_reg, inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors 79 | 80 | 81 | def init_backbone(self, path): 82 | state_dict = torch.load(path) 83 | try: 84 | ret = self.load_state_dict(state_dict, strict=False) 85 | print(ret) 86 | except RuntimeError as e: 87 | print('Ignoring ' + str(e) + '"') 88 | 89 | 90 | -------------------------------------------------------------------------------- /utils/visual_hico.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | 8 | import pickle 9 | import json 10 | import numpy as np 11 | import cv2 12 | import os 13 | import sys 14 | import argparse 15 | 16 | import matplotlib as mpl 17 | mpl.use('Agg') 18 | 19 | 20 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 21 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 22 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 23 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 24 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 25 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 26 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 
'dining table', '', '', 'toilet', '', 'tv', 27 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 28 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 29 | 'toothbrush'] 30 | 31 | with open("/DATA1/Benchmark/hico_20160224_det/hico_processed/hoi_list.json", "r") as file: 32 | hois = json.load(file) 33 | num_hois = len(hois) 34 | union_action_list = {} 35 | for i, item in enumerate(hois): 36 | union_action_list[i] = item["verb"] + "_" + item["object"] 37 | 38 | 39 | def visual_hico(preds_inst, detection, image_id): 40 | output_dir = "vis/%d" % image_id 41 | if not os.path.exists(output_dir): 42 | os.mkdir(output_dir) 43 | 44 | dpi = 80 45 | 46 | im_file = "./datasets/hico_20160224_det/images/test2015/HICO_test2015_" + (str(image_id)).zfill(8) + ".jpg" 47 | 48 | im_data = plt.imread(im_file) 49 | height, width, nbands = im_data.shape 50 | figsize = width / float(dpi), height / float(dpi) 51 | 52 | fig = plt.figure(figsize=figsize) 53 | ax = fig.add_axes([0, 0, 1, 1]) 54 | ax.axis('off') 55 | ax.imshow(im_data, interpolation='nearest') 56 | 57 | for inst_id in range(len(preds_inst["rois"])): 58 | box = preds_inst["rois"][inst_id] 59 | ax.add_patch( 60 | plt.Rectangle((box[0], box[1]), 61 | box[2] - box[0], 62 | box[3] - box[1], fill=False, 63 | edgecolor="orange", linewidth=3) 64 | ) 65 | text = obj_list[preds_inst["obj_class_ids"][inst_id]] + " ," + "%.3f"%preds_inst["obj_scores"][inst_id] 66 | ax.text(box[0] + 10, box[1] + 25, 67 | text, fontsize=20, color='blue') 68 | fig.savefig(os.path.join(output_dir, "instances.jpg")) 69 | plt.close() 70 | 71 | for ele_id, ele in enumerate(detection): 72 | role_scores = ele[3] 73 | role_scores_idx_sort = np.argsort(role_scores)[::-1] 74 | 75 | if role_scores.max() < 1: 76 | continue 77 | 78 | fig = plt.figure(figsize=figsize) 79 | ax = fig.add_axes([0, 0, 1, 1]) 80 | ax.axis('off') 81 | ax.imshow(im_data, interpolation='nearest') 82 | 83 | H_box = ele[0] 84 | O_box = ele[1] 85 | 86 | ax.add_patch( 87 | plt.Rectangle((H_box[0], H_box[1]), 88 | H_box[2] - H_box[0], 89 | H_box[3] - H_box[1], fill=False, 90 | edgecolor="red", linewidth=3) 91 | ) 92 | 93 | ax.add_patch( 94 | plt.Rectangle((O_box[0], O_box[1]), 95 | O_box[2] - O_box[0], 96 | O_box[3] - O_box[1], fill=False, 97 | edgecolor="blue", linewidth=3) 98 | ) 99 | 100 | for action_count in range(5): 101 | text = union_action_list[role_scores_idx_sort[action_count]] + ", %.2f" % role_scores[role_scores_idx_sort[action_count]] 102 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 103 | text, fontsize=16, color='green') 104 | 105 | fig.savefig(os.path.join(output_dir, "%d.jpg"%ele_id)) 106 | 107 | plt.close() 108 | 109 | 110 | 111 | if __name__=="__main__": 112 | arg = argparse.ArgumentParser() 113 | arg.add_argument('--det_file', type=str, default=None) 114 | args = ap.parse_args() 115 | 116 | detection = pickle.load(open(args.det_file, "rb")) 117 | visualize(detection) 118 | -------------------------------------------------------------------------------- /efficientdet/help_function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | label_to_class = {0: ('hold', 'obj'), 1: ('sit', 'instr'), 2: ('ride', 'instr'), 3: ('look', 'obj'), 4 | 4: ('hit', 'instr'), 5: ('hit', 'obj'), 6: ('eat', 'obj'), 7: ('eat', 'instr'), 5 | 8: ('jump', 'instr'), 9: ('lay', 'instr'), 10: ('talk_on_phone', 'instr'), 6 | 11: ('carry', 'obj'), 12: ('throw', 'obj'), 13: 
('catch', 'obj'), 14: ('cut', 'instr'), 7 | 15: ('cut', 'obj'), 16: ('work_on_computer', 'instr'), 17: ('ski', 'instr'), 8 | 18: ('surf', 'instr'), 19: ('skateboard', 'instr'), 20: ('drink', 'instr'), 9 | 21: ('kick', 'obj'), 22: ('point', 'instr'), 23: ('read', 'obj'), 24: ('snowboard', 'instr')} 10 | 11 | sub_label_to_class = {0: 'hold', 1: 'stand', 2: 'sit', 3: 'ride', 4: 'walk', 5: 'look', 6: 'hit', 12 | 7: 'eat', 8: 'jump', 9: 'lay', 10: 'talk_on_phone', 11: 'carry', 12: 'throw', 13 | 13: 'catch', 14: 'cut', 15: 'run', 16: 'work_on_computer', 17: 'ski', 18: 'surf', 14 | 19: 'skateboard', 20: 'smile', 21: 'drink', 22: 'kick', 23: 'point', 24: 'read', 15 | 25: 'snowboard'} 16 | 17 | obj_label_to_class = {26: ('hold', 'obj'), 27: ('sit', 'instr'), 28: ('ride', 'instr'), 29: ('look', 'obj'), 18 | 30: ('hit', 'instr'), 31: ('hit', 'obj'), 32: ('eat', 'obj'), 33: ('eat', 'instr'), 19 | 34: ('jump', 'instr'), 35: ('lay', 'instr'), 36: ('talk_on_phone', 'instr'), 20 | 37: ('carry', 'obj'), 38: ('throw', 'obj'), 39: ('catch', 'obj'), 40: ('cut', 'instr'), 21 | 41: ('cut', 'obj'), 42: ('work_on_computer', 'instr'), 43: ('ski', 'instr'), 22 | 44: ('surf', 'instr'), 45: ('skateboard', 'instr'), 46: ('drink', 'instr'), 23 | 47: ('kick', 'obj'), 48: ('point', 'instr'), 49: ('read', 'obj'), 50: ('snowboard', 'instr')} 24 | 25 | sub_union_map = np.zeros(len(label_to_class), dtype=np.uint8) 26 | for uid in label_to_class: 27 | for sid in sub_label_to_class: 28 | if sub_label_to_class[sid] == label_to_class[uid][0]: 29 | sub_union_map[uid] = sid 30 | break 31 | 32 | 33 | def to_onehot(label, label_num): 34 | if isinstance(label, int) or isinstance(id, np.int32) or isinstance(id, np.int64): 35 | tmp = np.zeros(label_num) 36 | tmp[label] = 1 37 | elif isinstance(label, list) or isinstance(id, np.ndarray): 38 | tmp = np.zeros(label_num) 39 | label = np.array(label) 40 | assert len(label.shape) == 1 41 | if label.shape[0] > 0: 42 | tmp[label] = 1 43 | else: 44 | raise (Exception, "Only int or list is allowed to transform to one hot") 45 | return tmp 46 | 47 | 48 | def single_iou(a, b, need_area = False): 49 | # a(x1, y1, x2, y2) 50 | # b(x1, y1, x2, y2) 51 | 52 | area = (b[2] - b[0]) * (b[3] - b[1]) 53 | iw = min(a[2], b[2]) - max(a[0], b[0]) 54 | ih = min(a[3], b[3]) - max(a[1], b[1]) 55 | iw = max(iw, 0) 56 | ih = max(ih, 0) 57 | ua = (a[2] - a[0]) * (a[3] - a[1]) + area - iw * ih 58 | ua = max(ua, 1e-8) 59 | 60 | intersection = iw * ih 61 | IoU = intersection / ua 62 | if need_area: 63 | return IoU, intersection, ua 64 | else: 65 | return IoU 66 | 67 | 68 | def single_ioa(a, b, need_area = False): 69 | # a(x1, y1, x2, y2) 70 | # b(x1, y1, x2, y2) 71 | 72 | area = (b[2] - b[0]) * (b[3] - b[1]) 73 | iw = min(a[2], b[2]) - max(a[0], b[0]) 74 | ih = min(a[3], b[3]) - max(a[1], b[1]) 75 | 76 | iw = max(iw, 0) 77 | ih = max(ih, 0) 78 | 79 | area = max(area, 1e-8) 80 | intersection = iw * ih 81 | IoA = intersection / area 82 | 83 | if need_area: 84 | return IoA, intersection, area 85 | else: 86 | return IoA 87 | 88 | 89 | def single_inter(a, b): 90 | inter = [max(a[0], b[0]), max(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])] 91 | if inter[0] > inter[2] or inter[1] > inter[3]: 92 | inter = [0.0, 0.0, 0.0, 0.0] 93 | return np.array(inter) 94 | 95 | 96 | def single_union(a, b): 97 | inter = [min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])] 98 | if inter[0] > inter[2] or inter[1] > inter[3]: 99 | inter = [0.0, 0.0, 0.0, 0.0] 100 | return np.array(inter) 101 | 102 | 103 | def 
transform_action(inst_score, mode): 104 | assert mode in {"subject", "object"} 105 | 106 | num_union_act = len(label_to_class) 107 | num_sub_act = len(sub_label_to_class) 108 | num_obj_act = len(obj_label_to_class) 109 | 110 | res = np.zeros(num_union_act) 111 | ids = np.arange(num_union_act) 112 | 113 | if mode == "object": 114 | res = inst_score[num_sub_act:] 115 | return res 116 | else: 117 | res[ids] = inst_score[sub_union_map[ids]] 118 | return res -------------------------------------------------------------------------------- /utils/sync_batchnorm/comm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : comm.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import queue 12 | import collections 13 | import threading 14 | 15 | __all__ = ['FutureResult', 'SlavePipe', 'SyncMaster'] 16 | 17 | 18 | class FutureResult(object): 19 | """A thread-safe future implementation. Used only as one-to-one pipe.""" 20 | 21 | def __init__(self): 22 | self._result = None 23 | self._lock = threading.Lock() 24 | self._cond = threading.Condition(self._lock) 25 | 26 | def put(self, result): 27 | with self._lock: 28 | assert self._result is None, 'Previous result has\'t been fetched.' 29 | self._result = result 30 | self._cond.notify() 31 | 32 | def get(self): 33 | with self._lock: 34 | if self._result is None: 35 | self._cond.wait() 36 | 37 | res = self._result 38 | self._result = None 39 | return res 40 | 41 | 42 | _MasterRegistry = collections.namedtuple('MasterRegistry', ['result']) 43 | _SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result']) 44 | 45 | 46 | class SlavePipe(_SlavePipeBase): 47 | """Pipe for master-slave communication.""" 48 | 49 | def run_slave(self, msg): 50 | self.queue.put((self.identifier, msg)) 51 | ret = self.result.get() 52 | self.queue.put(True) 53 | return ret 54 | 55 | 56 | class SyncMaster(object): 57 | """An abstract `SyncMaster` object. 58 | 59 | - During the replication, as the data parallel will trigger an callback of each module, all slave devices should 60 | call `register(id)` and obtain an `SlavePipe` to communicate with the master. 61 | - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected, 62 | and passed to a registered callback. 63 | - After receiving the messages, the master device should gather the information and determine to message passed 64 | back to each slave devices. 65 | """ 66 | 67 | def __init__(self, master_callback): 68 | """ 69 | 70 | Args: 71 | master_callback: a callback to be invoked after having collected messages from slave devices. 72 | """ 73 | self._master_callback = master_callback 74 | self._queue = queue.Queue() 75 | self._registry = collections.OrderedDict() 76 | self._activated = False 77 | 78 | def __getstate__(self): 79 | return {'master_callback': self._master_callback} 80 | 81 | def __setstate__(self, state): 82 | self.__init__(state['master_callback']) 83 | 84 | def register_slave(self, identifier): 85 | """ 86 | Register an slave device. 87 | 88 | Args: 89 | identifier: an identifier, usually is the device id. 90 | 91 | Returns: a `SlavePipe` object which can be used to communicate with the master device. 
92 | 93 | """ 94 | if self._activated: 95 | assert self._queue.empty(), 'Queue is not clean before next initialization.' 96 | self._activated = False 97 | self._registry.clear() 98 | future = FutureResult() 99 | self._registry[identifier] = _MasterRegistry(future) 100 | return SlavePipe(identifier, self._queue, future) 101 | 102 | def run_master(self, master_msg): 103 | """ 104 | Main entry for the master device in each forward pass. 105 | The messages were first collected from each devices (including the master device), and then 106 | an callback will be invoked to compute the message to be sent back to each devices 107 | (including the master device). 108 | 109 | Args: 110 | master_msg: the message that the master want to send to itself. This will be placed as the first 111 | message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example. 112 | 113 | Returns: the message to be sent back to the master device. 114 | 115 | """ 116 | self._activated = True 117 | 118 | intermediates = [(0, master_msg)] 119 | for i in range(self.nr_slaves): 120 | intermediates.append(self._queue.get()) 121 | 122 | results = self._master_callback(intermediates) 123 | assert results[0][0] == 0, 'The first result should belongs to the master.' 124 | 125 | for i, res in results: 126 | if i == 0: 127 | continue 128 | self._registry[i].result.put(res) 129 | 130 | for i in range(self.nr_slaves): 131 | assert self._queue.get() is True 132 | 133 | return results[0][1] 134 | 135 | @property 136 | def nr_slaves(self): 137 | return len(self._registry) 138 | -------------------------------------------------------------------------------- /efficientdet/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | class BBoxTransform(nn.Module): 8 | def forward(self, anchors, regression): 9 | """ 10 | decode_box_outputs adapted from https://github.com/google/automl/blob/master/efficientdet/anchors.py 11 | 12 | Args: 13 | anchors: [batchsize, boxes, (y1, x1, y2, x2)] 14 | regression: [batchsize, boxes, (dy, dx, dh, dw)] 15 | 16 | Returns: 17 | 18 | """ 19 | y_centers_a = (anchors[..., 0] + anchors[..., 2]) / 2 20 | x_centers_a = (anchors[..., 1] + anchors[..., 3]) / 2 21 | ha = anchors[..., 2] - anchors[..., 0] 22 | wa = anchors[..., 3] - anchors[..., 1] 23 | 24 | w = regression[..., 3].exp() * wa 25 | h = regression[..., 2].exp() * ha 26 | 27 | y_centers = regression[..., 0] * ha + y_centers_a 28 | x_centers = regression[..., 1] * wa + x_centers_a 29 | 30 | ymin = y_centers - h / 2. 31 | xmin = x_centers - w / 2. 32 | ymax = y_centers + h / 2. 33 | xmax = x_centers + w / 2. 
34 | 35 | return torch.stack([xmin, ymin, xmax, ymax], dim=2) 36 | 37 | 38 | class ClipBoxes(nn.Module): 39 | 40 | def __init__(self): 41 | super(ClipBoxes, self).__init__() 42 | 43 | def forward(self, boxes, img): 44 | batch_size, num_channels, height, width = img.shape 45 | 46 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 47 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 48 | 49 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width - 1) 50 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height - 1) 51 | 52 | return boxes 53 | 54 | 55 | class Anchors(nn.Module): 56 | """ 57 | adapted and modified from https://github.com/google/automl/blob/master/efficientdet/anchors.py by Zylo117 58 | """ 59 | 60 | def __init__(self, anchor_scale=4., pyramid_levels=None, **kwargs): 61 | super().__init__() 62 | self.anchor_scale = anchor_scale 63 | 64 | if pyramid_levels is None: 65 | self.pyramid_levels = [3, 4, 5, 6, 7] 66 | 67 | self.strides = kwargs.get('strides', [2 ** x for x in self.pyramid_levels]) 68 | self.scales = np.array(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])) 69 | self.ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]) 70 | 71 | self.last_anchors = {} 72 | self.last_shape = None 73 | 74 | def forward(self, image, dtype=torch.float32): 75 | """Generates multiscale anchor boxes. 76 | 77 | Args: 78 | image_size: integer number of input image size. The input image has the 79 | same dimension for width and height. The image_size should be divided by 80 | the largest feature stride 2^max_level. 81 | anchor_scale: float number representing the scale of size of the base 82 | anchor to the feature stride 2^level. 83 | anchor_configs: a dictionary with keys as the levels of anchors and 84 | values as a list of anchor configuration. 85 | 86 | Returns: 87 | anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all 88 | feature levels. 89 | Raises: 90 | ValueError: input size must be the multiple of largest feature stride. 
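        Note: with the default pyramid_levels [3, 4, 5, 6, 7] and 3 scales x 3 ratios = 9 anchors
        per location, a 512x512 input yields (64^2 + 32^2 + 16^2 + 8^2 + 4^2) * 9 = 49104 anchors,
        returned as a tensor of shape (1, 49104, 4) in (y1, x1, y2, x2) order.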
91 | """ 92 | image_shape = image.shape[2:] 93 | 94 | if image_shape == self.last_shape and image.device in self.last_anchors: 95 | return self.last_anchors[image.device] 96 | 97 | if self.last_shape is None or self.last_shape != image_shape: 98 | self.last_shape = image_shape # (h, w) 99 | 100 | if dtype == torch.float16: 101 | dtype = np.float16 102 | else: 103 | dtype = np.float32 104 | 105 | boxes_all = [] 106 | for stride in self.strides: 107 | boxes_level = [] 108 | for scale, ratio in itertools.product(self.scales, self.ratios): # scales中每个元素依次与ratios中每个元素组合 109 | if image_shape[1] % stride != 0: 110 | raise ValueError('input size must be divided by the stride.') 111 | base_anchor_size = self.anchor_scale * stride * scale 112 | anchor_size_x_2 = base_anchor_size * ratio[0] / 2.0 113 | anchor_size_y_2 = base_anchor_size * ratio[1] / 2.0 114 | 115 | x = np.arange(stride / 2, image_shape[1], stride) 116 | y = np.arange(stride / 2, image_shape[0], stride) 117 | xv, yv = np.meshgrid(x, y) # 原图anchor box的(x, y)坐标中点 118 | xv = xv.reshape(-1) 119 | yv = yv.reshape(-1) 120 | 121 | # y1,x1,y2,x2 122 | boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2, 123 | yv + anchor_size_y_2, xv + anchor_size_x_2)) # (4, n_boxes) 124 | boxes = np.swapaxes(boxes, 0, 1) # (n_boxes, 4) 125 | boxes_level.append(np.expand_dims(boxes, axis=1)) 126 | # concat anchors on the same level to the reshape NxAx4 127 | boxes_level = np.concatenate(boxes_level, axis=1) 128 | boxes_all.append(boxes_level.reshape([-1, 4])) 129 | 130 | anchor_boxes = np.vstack(boxes_all) # (batch_size, n_boxes*A, 4) 131 | 132 | anchor_boxes = torch.from_numpy(anchor_boxes.astype(dtype)).to(image.device) 133 | anchor_boxes = anchor_boxes.unsqueeze(0) 134 | 135 | # save it for later use to reduce overhead 136 | self.last_anchors[image.device] = anchor_boxes 137 | return anchor_boxes 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection 2 | 3 |
4 | 5 |
6 | 7 | Official code implementation for the paper "DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection" (AAAI 2021) [paper](https://arxiv.org/abs/2010.01005). 8 | 9 | The code is developed based on the architecture of [zylo117/Yet-Another-EfficientDet-Pytorch](https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch). We also follow some data pre-processing and model evaluation methods in [BigRedT/no_frills_hoi_det](https://github.com/BigRedT/no_frills_hoi_det) and [vt-vl-lab/iCAN](https://github.com/vt-vl-lab/iCAN). We sincerely thank the authors for their excellent work. 10 | 11 | 12 | 13 | ## Checklist 14 | 15 | + [x] Training and Test for V-COCO dataset 16 | + [x] Training and Test for HICO-DET dataset 17 | + [x] Demonstration on images 18 | + [ ] Demonstration on videos 19 | + [ ] More efficient voting strategy for inference using GPU 20 | 21 | ## Prerequisites 22 | 23 | The code was tested with Python 3.6, PyTorch 1.5.1, torchvision 0.6.1, CUDA 10.2, and Ubuntu 18.04. 24 | 25 | ## Installation 26 | 27 | 1. Clone this repository: 28 | 29 | ``` 30 | git clone https://github.com/MVIG-SJTU/DIRV.git 31 | ``` 32 | 33 | 2. Install PyTorch and torchvision: 34 | 35 | ``` 36 | pip install torch==1.5.1 torchvision==0.6.1 37 | ``` 38 | 39 | 3. Install other necessary packages: 40 | 41 | ``` 42 | pip install pycocotools numpy opencv-python tqdm tensorboard tensorboardX pyyaml webcolors 43 | ``` 44 | 45 | ## Data Preparation 46 | 47 | ### V-COCO Dataset: 48 | 49 | Download the [V-COCO](https://github.com/s-gupta/v-coco) dataset following the official instructions. 50 | 51 | You can find the file new_prior_mask.pkl [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). Each element in it is the prior probability that a verb (e.g. eat) is associated with an object category (e.g. apple). You should also download the combined training and validation set annotations instances_trainval2014.json [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing), and put it in datasets/vcoco/coco/annotations. 52 | 53 | ### HICO-DET Dataset: 54 | 55 | Download the [HICO-DET](http://www-personal.umich.edu/~ywchao/hico/) dataset from the official website. 56 | 57 | We transform the annotations of the HICO-DET dataset to JSON format following [BigRedT/no_frills_hoi_det](https://github.com/BigRedT/no_frills_hoi_det). You can directly download the processed annotations from [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). 58 | 59 | We count the number of training samples for each category in [hico_processed/hico-det_verb_count.json](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). It serves as a weight when calculating the loss. 60 | 61 | ### Dataset Structure: 62 | 63 | Make sure to put the files in the following structure: 64 | 65 | ``` 66 | |-- datasets 67 | | |-- vcoco 68 | | | |-- data 69 | | | | |-- splits 70 | | | | |-- vcoco 71 | | | | 72 | | | |-- coco 73 | | | | |-- images 74 | | | | |-- annotations 75 | | | |-- new_prior_mask.pkl 76 | | |-- hico_20160224_det 77 | | | |-- images 78 | | | |-- hico_processed 79 | ``` 80 | 81 | ## Demonstration 82 | ### Demonstration on Images 83 | 84 | ``` 85 | CUDA_VISIBLE_DEVICES=0 python demo.py --image_path /path/to/a/single/image 86 | ``` 87 | 88 | ### Demonstration on Videos 89 | 90 | Coming soon.
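The training and test commands below assume the dataset layout from the Data Preparation section. A minimal sketch to sanity-check that layout beforehand (not part of the original scripts; the paths simply mirror the structure listed above):

```
import os

expected = [
    "datasets/vcoco/data/splits",
    "datasets/vcoco/data/vcoco",
    "datasets/vcoco/coco/images",
    "datasets/vcoco/coco/annotations/instances_trainval2014.json",
    "datasets/vcoco/new_prior_mask.pkl",
    "datasets/hico_20160224_det/images",
    "datasets/hico_20160224_det/hico_processed",
]
missing = [p for p in expected if not os.path.exists(p)]
print("dataset layout OK" if not missing else "missing: " + ", ".join(missing))
```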
91 | 92 | ## Pre-trained Weights 93 | 94 | You can download the pre-trained weights for V-COCO dataset (vcoco_best.pth) and HICO-DET dataset (hico-det_best.pth) [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). 95 | 96 | ## Training 97 | 98 | Download the pre-trained weight of our backbone (efficientdet-d3_vcoco.pth and efficientdet-d3_hico-det.pth) [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing), and save it in `weights/` directory. 99 | 100 | ### Training on V-COCO Dataset 101 | 102 | ``` 103 | CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py -p vcoco --batch_size 32 --load_weights weights/efficientdet-d3_vcoco.pth 104 | ``` 105 | 106 | ### Training on HICO-DET Dataset 107 | 108 | ``` 109 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py -p hico-det --batch_size 48 --load_weights weights/efficientdet-d3_hico-det.pth 110 | ``` 111 | 112 | You may also adjust the saving directory and GPU number in `projects/vcoco.yaml` and `projects/hico-det.yaml` or create your own projects in `projects/`. 113 | 114 | ## Test 115 | 116 | ### Test on V-COCO Dataset 117 | 118 | ``` 119 | CUDA_VISIBLE_DEVICES=0 python test_vcoco.py -w $path to the checkpoint$ 120 | ``` 121 | 122 | ### Test on HICO-DET Dataset 123 | 124 | ``` 125 | CUDA_VISIBLE_DEVICES=0 python test_hico-det.py -w $path to the checkpoint$ 126 | ``` 127 | 128 | Then please follow the same procedures in [vt-vl-lab/iCAN](https://github.com/vt-vl-lab/iCAN) to evaluate the result on HICO-DET dataset. 129 | 130 | ## Citation 131 | 132 | If you found our paper or code useful for your research, please cite the following paper: 133 | ``` 134 | @inproceedings{fang2020dirv, 135 | title={DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection}, 136 | author={Fang, Hao-Shu and Xie, Yichen and Shao, Dian and Lu, Cewu}, 137 | year={2021}, 138 | booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)} 139 | } 140 | ``` 141 | 142 | -------------------------------------------------------------------------------- /utils/visual.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | 8 | import pickle 9 | import json 10 | import numpy as np 11 | import cv2 12 | import os 13 | import sys 14 | import argparse 15 | 16 | import matplotlib as mpl 17 | mpl.use('Agg') 18 | 19 | 20 | def visual(detection, image_id=None): 21 | if image_id is None: 22 | image_id = detection[0]["image_id"] 23 | 24 | cc = plt.get_cmap('hsv', lut=6) 25 | dpi = 80 26 | 27 | im_file = './datasets/vcoco/coco/images/val2014/COCO_val2014_' + (str(image_id)).zfill(12) + '.jpg' 28 | im_data = plt.imread(im_file) 29 | height, width, nbands = im_data.shape 30 | figsize = width / float(dpi), height / float(dpi) 31 | fig = plt.figure(figsize=figsize) 32 | ax = fig.add_axes([0, 0, 1, 1]) 33 | ax.axis('off') 34 | ax.imshow(im_data, interpolation='nearest') 35 | 36 | HO_dic = {} 37 | HO_set = set() 38 | count = 0 39 | 40 | for ele in detection: 41 | if (ele['image_id'] == image_id): 42 | action_count = -1 43 | 44 | for action_key, action_value in ele.items(): 45 | if (action_key.split('_')[-1] != 'agent') and action_key != 'image_id' and action_key != 'person_box': 46 | if (not np.isnan(action_value[0])) and (action_value[4] > 0.01): 47 | O_box = 
action_value[:4] 48 | H_box = ele['person_box'] 49 | 50 | action_count += 1 51 | 52 | if tuple(O_box) not in HO_set: 53 | HO_dic[tuple(O_box)] = count 54 | HO_set.add(tuple(O_box)) 55 | count += 1 56 | if tuple(H_box) not in HO_set: 57 | HO_dic[tuple(H_box)] = count 58 | HO_set.add(tuple(H_box)) 59 | count += 1 60 | 61 | ax.add_patch( 62 | plt.Rectangle((H_box[0], H_box[1]), 63 | H_box[2] - H_box[0], 64 | H_box[3] - H_box[1], fill=False, 65 | edgecolor=cc(HO_dic[tuple(H_box)])[:3], linewidth=3) 66 | ) 67 | text = action_key.split('_')[0] + ', ' + "%.2f" % action_value[4] 68 | 69 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 70 | text, 71 | bbox=dict(facecolor=cc(HO_dic[tuple(O_box)])[:3], alpha=0.5), 72 | fontsize=16, color='white') 73 | 74 | ax.add_patch( 75 | plt.Rectangle((O_box[0], O_box[1]), 76 | O_box[2] - O_box[0], 77 | O_box[3] - O_box[1], fill=False, 78 | edgecolor=cc(HO_dic[tuple(O_box)])[:3], linewidth=3) 79 | ) 80 | ax.set(xlim=[0, width], ylim=[height, 0], aspect=1) 81 | fig.savefig("vis/%d.jpg"%image_id) 82 | 83 | 84 | def visual_demo(detection, im_path, save_path): 85 | cc = plt.get_cmap('hsv', lut=6) 86 | dpi = 80 87 | im_data = plt.imread(im_path) 88 | height, width, nbands = im_data.shape 89 | figsize = width / float(dpi), height / float(dpi) 90 | fig = plt.figure(figsize=figsize) 91 | ax = fig.add_axes([0, 0, 1, 1]) 92 | ax.axis('off') 93 | ax.imshow(im_data, interpolation='nearest') 94 | 95 | HO_dic = {} 96 | HO_set = set() 97 | count = 0 98 | 99 | for ele in detection: 100 | action_count = -1 101 | for action_key, action_value in ele.items(): 102 | if (action_key.split('_')[-1] != 'agent') and action_key != 'image_id' and action_key != 'person_box': 103 | if (not np.isnan(action_value[0])) and (action_value[4] > 0.01): 104 | O_box = action_value[:4] 105 | H_box = ele['person_box'] 106 | action_count += 1 107 | if tuple(O_box) not in HO_set: 108 | HO_dic[tuple(O_box)] = count 109 | HO_set.add(tuple(O_box)) 110 | count += 1 111 | if tuple(H_box) not in HO_set: 112 | HO_dic[tuple(H_box)] = count 113 | HO_set.add(tuple(H_box)) 114 | count += 1 115 | ax.add_patch( 116 | plt.Rectangle((H_box[0], H_box[1]), 117 | H_box[2] - H_box[0], 118 | H_box[3] - H_box[1], fill=False, 119 | edgecolor=cc(HO_dic[tuple(H_box)])[:3], linewidth=3) 120 | ) 121 | text = action_key.split('_')[0] + ', ' + "%.2f" % action_value[4] 122 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 123 | text, 124 | bbox=dict(facecolor=cc(HO_dic[tuple(O_box)])[:3], alpha=0.5), 125 | fontsize=16, color='white') 126 | ax.add_patch( 127 | plt.Rectangle((O_box[0], O_box[1]), 128 | O_box[2] - O_box[0], 129 | O_box[3] - O_box[1], fill=False, 130 | edgecolor=cc(HO_dic[tuple(O_box)])[:3], linewidth=3) 131 | ) 132 | ax.set(xlim=[0, width], ylim=[height, 0], aspect=1) 133 | fig.savefig(save_path) 134 | 135 | 136 | if __name__=="__main__": 137 | arg = argparse.ArgumentParser() 138 | arg.add_argument('--det_file', type=str, default=None) 139 | args = ap.parse_args() 140 | 141 | detection = pickle.load(open(args.det_file, "rb")) 142 | visualize(detection) 143 | -------------------------------------------------------------------------------- /coco_eval.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | """ 4 | COCO-Style Evaluations 5 | 6 | put images here datasets/your_project_name/annotations/val_set_name/*.jpg 7 | put annotations here datasets/your_project_name/annotations/instances_{val_set_name}.json 8 | put weights here 
/path/to/your/weights/*.pth 9 | change compound_coef 10 | 11 | """ 12 | 13 | import json 14 | import os 15 | 16 | import argparse 17 | import torch 18 | import yaml 19 | from tqdm import tqdm 20 | from pycocotools.coco import COCO 21 | from pycocotools.cocoeval import COCOeval 22 | 23 | from backbone import EfficientDetBackbone 24 | from efficientdet.utils import BBoxTransform, ClipBoxes 25 | from utils.utils import preprocess, invert_affine, postprocess 26 | 27 | from efficientdet.vcoco_dataset import VCOCO_Dataset, Resizer, Normalizer, Augmenter, collater 28 | 29 | 30 | ap = argparse.ArgumentParser() 31 | ap.add_argument('-p', '--project', type=str, default='coco', help='project file that contains parameters') 32 | ap.add_argument('-c', '--compound_coef', type=int, default=0, help='coefficients of efficientdet') 33 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 34 | ap.add_argument('--nms_threshold', type=float, default=0.5, help='nms threshold, don\'t change it if not for testing purposes') 35 | ap.add_argument('--cuda', type=bool, default=True) 36 | ap.add_argument('--device', type=int, default=0) 37 | ap.add_argument('--float16', type=bool, default=False) 38 | ap.add_argument('--override', type=bool, default=True, help='override previous bbox results file if exists') 39 | args = ap.parse_args() 40 | 41 | compound_coef = args.compound_coef 42 | nms_threshold = args.nms_threshold 43 | use_cuda = args.cuda 44 | gpu = args.device 45 | use_float16 = args.float16 46 | override_prev_results = args.override 47 | project_name = args.project 48 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 49 | 50 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 51 | 52 | params = yaml.safe_load(open(f'projects/{project_name}.yml')) 53 | obj_list = params['obj_list'] 54 | 55 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 56 | 57 | 58 | def evaluate_coco(img_path, set_name, image_ids, coco, model, threshold=0.05): 59 | results = [] 60 | processed_image_ids = [] 61 | 62 | regressBoxes = BBoxTransform() 63 | clipBoxes = ClipBoxes() 64 | 65 | for image_id in tqdm(image_ids): 66 | image_info = coco.loadImgs(image_id)[0] 67 | image_path = img_path + image_info['file_name'] 68 | 69 | ori_imgs, framed_imgs, framed_metas = preprocess(image_path, max_size=input_sizes[compound_coef]) 70 | x = torch.from_numpy(framed_imgs[0]) 71 | 72 | if use_cuda: 73 | x = x.cuda(gpu) 74 | if use_float16: 75 | x = x.half() 76 | else: 77 | x = x.float() 78 | else: 79 | x = x.float() 80 | 81 | x = x.unsqueeze(0).permute(0, 3, 1, 2) 82 | features, regression, classification, anchors = model(x) 83 | 84 | preds = postprocess(x, 85 | anchors, regression, classification, 86 | regressBoxes, clipBoxes, 87 | threshold, nms_threshold) 88 | 89 | processed_image_ids.append(image_id) 90 | 91 | if not preds: 92 | continue 93 | 94 | preds = invert_affine(framed_metas, preds)[0] 95 | 96 | scores = preds['scores'] 97 | class_ids = preds['class_ids'] 98 | rois = preds['rois'] 99 | 100 | if rois.shape[0] > 0: 101 | # x1,y1,x2,y2 -> x1,y1,w,h 102 | rois[:, 2] -= rois[:, 0] 103 | rois[:, 3] -= rois[:, 1] 104 | 105 | bbox_score = scores 106 | 107 | for roi_id in range(rois.shape[0]): 108 | score = float(bbox_score[roi_id]) 109 | label = int(class_ids[roi_id]) 110 | box = rois[roi_id, :] 111 | 112 | if score < threshold: 113 | break 114 | image_result = { 115 | 'image_id': image_id, 116 | 'category_id': 
label + 1, 117 | 'score': float(score), 118 | 'bbox': box.tolist(), 119 | } 120 | 121 | results.append(image_result) 122 | 123 | if not len(results): 124 | raise Exception('the model does not provide any valid output, check model architecture and the data input') 125 | 126 | # write output 127 | filepath = f'{set_name}_bbox_results.json' 128 | if os.path.exists(filepath): 129 | os.remove(filepath) 130 | json.dump(results, open(filepath, 'w'), indent=4) 131 | 132 | return processed_image_ids 133 | 134 | 135 | def _eval(coco_gt, image_ids, pred_json_path): 136 | # load results in COCO evaluation tool 137 | coco_pred = coco_gt.loadRes(pred_json_path) 138 | 139 | # run COCO evaluation 140 | print('BBox') 141 | coco_eval = COCOeval(coco_gt, coco_pred, 'bbox') 142 | coco_eval.params.imgIds = image_ids 143 | coco_eval.evaluate() 144 | coco_eval.accumulate() 145 | coco_eval.summarize() 146 | 147 | 148 | if __name__ == '__main__': 149 | SET_NAME = params['val_set'] 150 | VAL_GT = f'datasets/{params["project_name"]}/annotations/instances_{SET_NAME}.json' 151 | VAL_IMGS = f'datasets/{params["project_name"]}/{SET_NAME}/' 152 | MAX_IMAGES = 10000 153 | coco_gt = COCO(VAL_GT) 154 | image_ids = coco_gt.getImgIds()[:MAX_IMAGES] 155 | 156 | if override_prev_results or not os.path.exists(f'{SET_NAME}_bbox_results.json'): 157 | model = EfficientDetBackbone(compound_coef=compound_coef, num_classes=len(obj_list), 158 | ratios=eval(params['anchors_ratios']), scales=eval(params['anchors_scales'])) 159 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 160 | model.requires_grad_(False) 161 | model.eval() 162 | 163 | if use_cuda: 164 | model.cuda(gpu) 165 | 166 | if use_float16: 167 | model.half() 168 | 169 | image_ids = evaluate_coco(VAL_IMGS, SET_NAME, image_ids, coco_gt, model) 170 | 171 | _eval(coco_gt, image_ids, f'{SET_NAME}_bbox_results.json') 172 | -------------------------------------------------------------------------------- /efficientdet/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | 5 | from torch.utils.data import Dataset, DataLoader 6 | from pycocotools.coco import COCO 7 | import cv2 8 | 9 | 10 | class CocoDataset(Dataset): 11 | def __init__(self, root_dir, set='train2017', transform=None): 12 | 13 | self.root_dir = root_dir 14 | self.set_name = set 15 | self.transform = transform 16 | 17 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json')) 18 | self.image_ids = self.coco.getImgIds() 19 | 20 | self.load_classes() 21 | 22 | def load_classes(self): 23 | 24 | # load class names (name -> label) 25 | categories = self.coco.loadCats(self.coco.getCatIds()) 26 | categories.sort(key=lambda x: x['id']) 27 | 28 | self.classes = {} 29 | self.coco_labels = {} 30 | self.coco_labels_inverse = {} 31 | for c in categories: 32 | self.coco_labels[len(self.classes)] = c['id'] 33 | self.coco_labels_inverse[c['id']] = len(self.classes) 34 | self.classes[c['name']] = len(self.classes) 35 | 36 | # also load the reverse (label -> name) 37 | self.labels = {} 38 | for key, value in self.classes.items(): 39 | self.labels[value] = key 40 | 41 | def __len__(self): 42 | return len(self.image_ids) 43 | 44 | def __getitem__(self, idx): 45 | 46 | img = self.load_image(idx) 47 | annot = self.load_annotations(idx) 48 | sample = {'img': img, 'annot': annot} 49 | if self.transform: 50 | sample = self.transform(sample) 51 | return sample 52 | 53 | def 
load_image(self, image_index): 54 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 55 | path = os.path.join(self.root_dir, self.set_name, image_info['file_name']) 56 | img = cv2.imread(path) 57 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 58 | 59 | return img.astype(np.float32) / 255. 60 | 61 | def load_annotations(self, image_index): 62 | # get ground truth annotations 63 | annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) 64 | annotations = np.zeros((0, 5)) 65 | 66 | # some images appear to miss annotations 67 | if len(annotations_ids) == 0: 68 | return annotations 69 | 70 | # parse annotations 71 | coco_annotations = self.coco.loadAnns(annotations_ids) 72 | for idx, a in enumerate(coco_annotations): 73 | 74 | # some annotations have basically no width / height, skip them 75 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 76 | continue 77 | 78 | annotation = np.zeros((1, 5)) 79 | annotation[0, :4] = a['bbox'] 80 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 81 | annotations = np.append(annotations, annotation, axis=0) 82 | 83 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 84 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 85 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 86 | 87 | return annotations 88 | 89 | def coco_label_to_label(self, coco_label): 90 | return self.coco_labels_inverse[coco_label] 91 | 92 | def label_to_coco_label(self, label): 93 | return self.coco_labels[label] 94 | 95 | 96 | def collater(data): 97 | imgs = [s['img'] for s in data] 98 | annots = [s['annot'] for s in data] 99 | scales = [s['scale'] for s in data] 100 | 101 | imgs = torch.from_numpy(np.stack(imgs, axis=0)) 102 | 103 | max_num_annots = max(annot.shape[0] for annot in annots) 104 | 105 | if max_num_annots > 0: 106 | 107 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 108 | 109 | if max_num_annots > 0: 110 | for idx, annot in enumerate(annots): 111 | if annot.shape[0] > 0: 112 | annot_padded[idx, :annot.shape[0], :] = annot 113 | else: 114 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 115 | 116 | imgs = imgs.permute(0, 3, 1, 2) 117 | 118 | return {'img': imgs, 'annot': annot_padded, 'scale': scales} 119 | 120 | 121 | class Resizer(object): 122 | """Convert ndarrays in sample to Tensors.""" 123 | 124 | def __init__(self, img_size=512): 125 | self.img_size = img_size 126 | 127 | def __call__(self, sample): 128 | image, annots = sample['img'], sample['annot'] 129 | height, width, _ = image.shape 130 | if height > width: 131 | scale = self.img_size / height 132 | resized_height = self.img_size 133 | resized_width = int(width * scale) 134 | else: 135 | scale = self.img_size / width 136 | resized_height = int(height * scale) 137 | resized_width = self.img_size 138 | 139 | image = cv2.resize(image, (resized_width, resized_height), interpolation=cv2.INTER_LINEAR) 140 | 141 | new_image = np.zeros((self.img_size, self.img_size, 3)) 142 | new_image[0:resized_height, 0:resized_width] = image 143 | 144 | annots[:, :4] *= scale 145 | 146 | return {'img': torch.from_numpy(new_image).to(torch.float32), 'annot': torch.from_numpy(annots), 'scale': scale} 147 | 148 | 149 | class Augmenter(object): 150 | """Convert ndarrays in sample to Tensors.""" 151 | 152 | def __call__(self, sample, flip_x=0.5): 153 | if np.random.rand() < flip_x: 154 | image, annots = sample['img'], sample['annot'] 155 | image = image[:, ::-1, :] 156 | 157 | rows, cols, channels = image.shape 158 | 159 | x1 = annots[:, 0].copy() 
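            # A horizontal flip maps x -> cols - x, so the flipped box becomes x1' = cols - x2 and
            # x2' = cols - x1; the temporary copy below keeps the original x1 around while both
            # columns are overwritten.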
160 | x2 = annots[:, 2].copy() 161 | 162 | x_tmp = x1.copy() 163 | 164 | annots[:, 0] = cols - x2 165 | annots[:, 2] = cols - x_tmp 166 | 167 | sample = {'img': image, 'annot': annots} 168 | 169 | return sample 170 | 171 | 172 | class Normalizer(object): 173 | 174 | def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): 175 | self.mean = np.array([[mean]]) 176 | self.std = np.array([[std]]) 177 | 178 | def __call__(self, sample): 179 | image, annots = sample['img'], sample['annot'] 180 | 181 | return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots} 182 | -------------------------------------------------------------------------------- /Generate_HICO_detection.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow iCAN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Chen Gao 5 | # -------------------------------------------------------- 6 | 7 | """ 8 | Change the HICO-DET detection results to the right format. 9 | """ 10 | 11 | import pickle 12 | import numpy as np 13 | import scipy.io as sio 14 | import os 15 | 16 | def save_HICO(HICO, HICO_dir, classid, begin, finish): 17 | print("class id:", classid) 18 | 19 | all_boxes = [] 20 | for i in range(finish - begin + 1): 21 | total = [] 22 | score = [] 23 | for key, value in HICO.items(): 24 | for element in value: 25 | if element[2] == classid: 26 | temp = [] 27 | temp.append(element[0].tolist()) # Human box 28 | temp.append(element[1].tolist()) # Object box 29 | temp.append(int(key)) # image id 30 | temp.append(int(i)) # action id (0-599) 31 | temp.append(element[3][begin - 1 + i] * element[4] * element[5]) 32 | 33 | total.append(temp) 34 | score.append(element[3][begin - 1 + i] * element[4] * element[5]) 35 | 36 | 37 | idx = np.argsort(score, axis=0)[::-1] 38 | for i_idx in range(min(len(idx),19999)): 39 | all_boxes.append(total[idx[i_idx]]) 40 | savefile = HICO_dir + 'detections_' + str(classid).zfill(2) + '.mat' 41 | sio.savemat(savefile, {'all_boxes':all_boxes}) 42 | 43 | def Generate_HICO_detection(output_file, HICO_dir): 44 | 45 | if not os.path.exists(HICO_dir): 46 | os.makedirs(HICO_dir) 47 | 48 | # Remove previous results 49 | filelist = [ f for f in os.listdir(HICO_dir)] 50 | for f in filelist: 51 | os.remove(os.path.join(HICO_dir, f)) 52 | 53 | HICO = pickle.load( open( output_file, "rb" ) ) 54 | 55 | save_HICO(HICO, HICO_dir, 1 ,161, 170) # 1 person 56 | save_HICO(HICO, HICO_dir, 2 ,11, 24 ) # 2 bicycle 57 | save_HICO(HICO, HICO_dir, 3 ,66, 76 ) # 3 car 58 | save_HICO(HICO, HICO_dir, 4 ,147, 160) # 4 motorcycle 59 | save_HICO(HICO, HICO_dir, 5 ,1, 10 ) # 5 airplane 60 | save_HICO(HICO, HICO_dir, 6 ,55, 65 ) # 6 bus 61 | save_HICO(HICO, HICO_dir, 7 ,187, 194) # 7 train 62 | save_HICO(HICO, HICO_dir, 8 ,568, 576) # 8 truck 63 | save_HICO(HICO, HICO_dir, 9 ,32, 46 ) # 9 boat 64 | save_HICO(HICO, HICO_dir, 10,563, 567) # 10 traffic light 65 | save_HICO(HICO, HICO_dir, 11,326,330) # 11 fire_hydrant 66 | save_HICO(HICO, HICO_dir, 12,503,506) # 12 stop_sign 67 | save_HICO(HICO, HICO_dir, 13,415,418) # 13 parking_meter 68 | save_HICO(HICO, HICO_dir, 14,244,247) # 14 bench 69 | save_HICO(HICO, HICO_dir, 15,25, 31) # 15 bird 70 | save_HICO(HICO, HICO_dir, 16,77, 86) # 16 cat 71 | save_HICO(HICO, HICO_dir, 17,112,129) # 17 dog 72 | save_HICO(HICO, HICO_dir, 18,130,146) # 18 horse 73 | save_HICO(HICO, HICO_dir, 19,175,186) # 19 sheep 74 | save_HICO(HICO, 
HICO_dir, 20,97,107) # 20 cow 75 | save_HICO(HICO, HICO_dir, 21,314,325) # 21 elephant 76 | save_HICO(HICO, HICO_dir, 22,236,239) # 22 bear 77 | save_HICO(HICO, HICO_dir, 23,596,600) # 23 zebra 78 | save_HICO(HICO, HICO_dir, 24,343,348) # 24 giraffe 79 | save_HICO(HICO, HICO_dir, 25,209,214) # 25 backpack 80 | save_HICO(HICO, HICO_dir, 26,577,584) # 26 umbrella 81 | save_HICO(HICO, HICO_dir, 27,353,356) # 27 handbag 82 | save_HICO(HICO, HICO_dir, 28,539,546) # 28 tie 83 | save_HICO(HICO, HICO_dir, 29,507,516) # 29 suitcase 84 | save_HICO(HICO, HICO_dir, 30,337,342) # 30 Frisbee 85 | save_HICO(HICO, HICO_dir, 31,464,474) # 31 skis 86 | save_HICO(HICO, HICO_dir, 32,475,483) # 32 snowboard 87 | save_HICO(HICO, HICO_dir, 33,489,502) # 33 sports_ball 88 | save_HICO(HICO, HICO_dir, 34,369,376) # 34 kite 89 | save_HICO(HICO, HICO_dir, 35,225,232) # 35 baseball_bat 90 | save_HICO(HICO, HICO_dir, 36,233,235) # 36 baseball_glove 91 | save_HICO(HICO, HICO_dir, 37,454,463) # 37 skateboard 92 | save_HICO(HICO, HICO_dir, 38,517,528) # 38 surfboard 93 | save_HICO(HICO, HICO_dir, 39,534,538) # 39 tennis_racket 94 | save_HICO(HICO, HICO_dir, 40,47,54) # 40 bottle 95 | save_HICO(HICO, HICO_dir, 41,589,595) # 41 wine_glass 96 | save_HICO(HICO, HICO_dir, 42,296,305) # 42 cup 97 | save_HICO(HICO, HICO_dir, 43,331,336) # 43 fork 98 | save_HICO(HICO, HICO_dir, 44,377,383) # 44 knife 99 | save_HICO(HICO, HICO_dir, 45,484,488) # 45 spoon 100 | save_HICO(HICO, HICO_dir, 46,253,257) # 46 bowl 101 | save_HICO(HICO, HICO_dir, 47,215,224) # 47 banana 102 | save_HICO(HICO, HICO_dir, 48,199,208) # 48 apple 103 | save_HICO(HICO, HICO_dir, 49,439,445) # 49 sandwich 104 | save_HICO(HICO, HICO_dir, 50,398,407) # 50 orange 105 | save_HICO(HICO, HICO_dir, 51,258,264) # 51 broccoli 106 | save_HICO(HICO, HICO_dir, 52,274,283) # 52 carrot 107 | save_HICO(HICO, HICO_dir, 53,357,363) # 53 hot_dog 108 | save_HICO(HICO, HICO_dir, 54,419,429) # 54 pizza 109 | save_HICO(HICO, HICO_dir, 55,306,313) # 55 donut 110 | save_HICO(HICO, HICO_dir, 56,265,273) # 56 cake 111 | save_HICO(HICO, HICO_dir, 57,87,92) # 57 chair 112 | save_HICO(HICO, HICO_dir, 58,93,96) # 58 couch 113 | save_HICO(HICO, HICO_dir, 59,171,174) # 59 potted_plant 114 | save_HICO(HICO, HICO_dir, 60,240,243) #60 bed 115 | save_HICO(HICO, HICO_dir, 61,108,111) #61 dining_table 116 | save_HICO(HICO, HICO_dir, 62,551,558) #62 toilet 117 | save_HICO(HICO, HICO_dir, 63,195,198) #63 TV 118 | save_HICO(HICO, HICO_dir, 64,384,389) #64 laptop 119 | save_HICO(HICO, HICO_dir, 65,394,397) #65 mouse 120 | save_HICO(HICO, HICO_dir, 66,435,438) #66 remote 121 | save_HICO(HICO, HICO_dir, 67,364,368) #67 keyboard 122 | save_HICO(HICO, HICO_dir, 68,284,290) #68 cell_phone 123 | save_HICO(HICO, HICO_dir, 69,390,393) #69 microwave 124 | save_HICO(HICO, HICO_dir, 70,408,414) #70 oven 125 | save_HICO(HICO, HICO_dir, 71,547,550) #71 toaster 126 | save_HICO(HICO, HICO_dir, 72,450,453) #72 sink 127 | save_HICO(HICO, HICO_dir, 73,430,434) #73 refrigerator 128 | save_HICO(HICO, HICO_dir, 74,248,252) #74 book 129 | save_HICO(HICO, HICO_dir, 75,291,295) #75 clock 130 | save_HICO(HICO, HICO_dir, 76,585,588) #76 vase 131 | save_HICO(HICO, HICO_dir, 77,446,449) #77 scissors 132 | save_HICO(HICO, HICO_dir, 78,529,533) #78 teddy_bear 133 | save_HICO(HICO, HICO_dir, 79,349,352) #79 hair_drier 134 | save_HICO(HICO, HICO_dir, 80,559,562) #80 toothbrush 135 | 136 | 137 | if __name__ == "__main__": 138 | hico_dir = "/DATA1/Benchmark/ETE_HOI/DenseNet/logs/vcoco_head_only_hicodet_bs12_d2_fixbn/results/HICO/" 139 | 
detection_path = "/DATA1/Benchmark/ETE_HOI/DenseNet/logs/vcoco_head_only_hicodet_bs12_d2_fixbn/results/test_bbox_results.pkl" 140 | Generate_HICO_detection(detection_path, hico_dir) -------------------------------------------------------------------------------- /efficientnet/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from .utils import ( 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | get_same_padding_conv2d, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | Swish, 14 | MemoryEfficientSwish, 15 | ) 16 | 17 | class MBConvBlock(nn.Module): 18 | """ 19 | Mobile Inverted Residual Bottleneck Block 20 | 21 | Args: 22 | block_args (namedtuple): BlockArgs, see above 23 | global_params (namedtuple): GlobalParam, see above 24 | 25 | Attributes: 26 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 27 | """ 28 | 29 | def __init__(self, block_args, global_params): 30 | super().__init__() 31 | self._block_args = block_args 32 | self._bn_mom = 1 - global_params.batch_norm_momentum 33 | self._bn_eps = global_params.batch_norm_epsilon 34 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 35 | self.id_skip = block_args.id_skip # skip connection and drop connect 36 | 37 | # Get static or dynamic convolution depending on image size 38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 39 | 40 | # Expansion phase 41 | inp = self._block_args.input_filters # number of input channels 42 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 43 | if self._block_args.expand_ratio != 1: 44 | self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 45 | self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 46 | 47 | # Depthwise convolution phase 48 | k = self._block_args.kernel_size 49 | s = self._block_args.stride 50 | self._depthwise_conv = Conv2d( 51 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 52 | kernel_size=k, stride=s, bias=False) 53 | self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 54 | 55 | # Squeeze and Excitation layer, if desired 56 | if self.has_se: 57 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 58 | self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 59 | self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 60 | 61 | # Output phase 62 | final_oup = self._block_args.output_filters 63 | self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 64 | self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 65 | self._swish = MemoryEfficientSwish() 66 | 67 | def forward(self, inputs, drop_connect_rate=None): 68 | """ 69 | :param inputs: input tensor 70 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 71 | :return: output of block 72 | """ 73 | 74 | # Expansion and Depthwise Convolution 75 | x = inputs 76 | if self._block_args.expand_ratio != 1: 77 | x = self._expand_conv(inputs) 78 | x = self._bn0(x) 79 | x = self._swish(x) 80 | 81 | x = self._depthwise_conv(x) 82 | x = self._bn1(x) 83 | x = self._swish(x) 84 | 85 | # Squeeze and 
Excitation 86 | if self.has_se: 87 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 88 | x_squeezed = self._se_reduce(x_squeezed) 89 | x_squeezed = self._swish(x_squeezed) 90 | x_squeezed = self._se_expand(x_squeezed) 91 | x = torch.sigmoid(x_squeezed) * x 92 | 93 | x = self._project_conv(x) 94 | x = self._bn2(x) 95 | 96 | # Skip connection and drop connect 97 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 98 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 99 | if drop_connect_rate: 100 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 101 | x = x + inputs # skip connection 102 | return x 103 | 104 | def set_swish(self, memory_efficient=True): 105 | """Sets swish function as memory efficient (for training) or standard (for export)""" 106 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 107 | 108 | 109 | class EfficientNet(nn.Module): 110 | """ 111 | An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods 112 | 113 | Args: 114 | blocks_args (list): A list of BlockArgs to construct blocks 115 | global_params (namedtuple): A set of GlobalParams shared between blocks 116 | 117 | Example: 118 | model = EfficientNet.from_pretrained('efficientnet-b0') 119 | 120 | """ 121 | 122 | def __init__(self, blocks_args=None, global_params=None): 123 | super().__init__() 124 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 125 | assert len(blocks_args) > 0, 'block args must be greater than 0' 126 | self._global_params = global_params 127 | self._blocks_args = blocks_args 128 | 129 | # Get static or dynamic convolution depending on image size 130 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 131 | 132 | # Batch norm parameters 133 | bn_mom = 1 - self._global_params.batch_norm_momentum 134 | bn_eps = self._global_params.batch_norm_epsilon 135 | 136 | # Stem 137 | in_channels = 3 # rgb 138 | out_channels = round_filters(32, self._global_params) # number of output channels 139 | self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 140 | self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 141 | 142 | # Build blocks 143 | self._blocks = nn.ModuleList([]) 144 | for block_args in self._blocks_args: 145 | 146 | # Update block input and output filters based on depth multiplier. 147 | block_args = block_args._replace( 148 | input_filters=round_filters(block_args.input_filters, self._global_params), 149 | output_filters=round_filters(block_args.output_filters, self._global_params), 150 | num_repeat=round_repeats(block_args.num_repeat, self._global_params) 151 | ) 152 | 153 | # The first block needs to take care of stride and filter size increase. 
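            # Only this first repeat uses the configured stride and filter change; the remaining
            # repeats appended below run at stride 1 with input_filters == output_filters, which is
            # the condition under which MBConvBlock applies its identity skip connection (and drop connect).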
154 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 155 | if block_args.num_repeat > 1: 156 | block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 157 | for _ in range(block_args.num_repeat - 1): 158 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 159 | 160 | # Head 161 | in_channels = block_args.output_filters # output of final block 162 | out_channels = round_filters(1280, self._global_params) 163 | self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) 164 | self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 165 | 166 | # Final linear layer 167 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 168 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 169 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 170 | self._swish = MemoryEfficientSwish() 171 | 172 | def set_swish(self, memory_efficient=True): 173 | """Sets swish function as memory efficient (for training) or standard (for export)""" 174 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 175 | for block in self._blocks: 176 | block.set_swish(memory_efficient) 177 | 178 | 179 | def extract_features(self, inputs): 180 | """ Returns output of the final convolution layer """ 181 | 182 | # Stem 183 | x = self._swish(self._bn0(self._conv_stem(inputs))) 184 | 185 | # Blocks 186 | for idx, block in enumerate(self._blocks): 187 | drop_connect_rate = self._global_params.drop_connect_rate 188 | if drop_connect_rate: 189 | drop_connect_rate *= float(idx) / len(self._blocks) 190 | x = block(x, drop_connect_rate=drop_connect_rate) 191 | # Head 192 | x = self._swish(self._bn1(self._conv_head(x))) 193 | 194 | return x 195 | 196 | def forward(self, inputs): 197 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. """ 198 | bs = inputs.size(0) 199 | # Convolution layers 200 | x = self.extract_features(inputs) 201 | 202 | # Pooling and final linear layer 203 | x = self._avg_pooling(x) 204 | x = x.view(bs, -1) 205 | x = self._dropout(x) 206 | x = self._fc(x) 207 | return x 208 | 209 | @classmethod 210 | def from_name(cls, model_name, override_params=None): 211 | cls._check_model_name_is_valid(model_name) 212 | blocks_args, global_params = get_model_params(model_name, override_params) 213 | return cls(blocks_args, global_params) 214 | 215 | @classmethod 216 | def from_pretrained(cls, model_name, load_weights=True, advprop=True, num_classes=1000, in_channels=3): 217 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 218 | if load_weights: 219 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000), advprop=advprop) 220 | if in_channels != 3: 221 | Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size) 222 | out_channels = round_filters(32, model._global_params) 223 | model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 224 | return model 225 | 226 | @classmethod 227 | def get_image_size(cls, model_name): 228 | cls._check_model_name_is_valid(model_name) 229 | _, _, res, _ = efficientnet_params(model_name) 230 | return res 231 | 232 | @classmethod 233 | def _check_model_name_is_valid(cls, model_name): 234 | """ Validates model name. 
""" 235 | valid_models = ['efficientnet-b'+str(i) for i in range(9)] 236 | if model_name not in valid_models: 237 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 238 | -------------------------------------------------------------------------------- /efficientnet/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for building the model and for loading model parameters. 3 | These helper functions are built to mirror those in the official TensorFlow implementation. 4 | """ 5 | 6 | import re 7 | import math 8 | import collections 9 | from functools import partial 10 | import torch 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils import model_zoo 14 | from .utils_extra import Conv2dStaticSamePadding 15 | 16 | ######################################################################## 17 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 18 | ######################################################################## 19 | 20 | 21 | # Parameters for the entire model (stem, all blocks, and head) 22 | 23 | GlobalParams = collections.namedtuple('GlobalParams', [ 24 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 25 | 'num_classes', 'width_coefficient', 'depth_coefficient', 26 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 27 | 28 | # Parameters for an individual model block 29 | BlockArgs = collections.namedtuple('BlockArgs', [ 30 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 31 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 32 | 33 | # Change namedtuple defaults 34 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 35 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 36 | 37 | 38 | class SwishImplementation(torch.autograd.Function): 39 | @staticmethod 40 | def forward(ctx, i): 41 | result = i * torch.sigmoid(i) 42 | ctx.save_for_backward(i) 43 | return result 44 | 45 | @staticmethod 46 | def backward(ctx, grad_output): 47 | i = ctx.saved_variables[0] 48 | sigmoid_i = torch.sigmoid(i) 49 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 50 | 51 | 52 | class MemoryEfficientSwish(nn.Module): 53 | def forward(self, x): 54 | return SwishImplementation.apply(x) 55 | 56 | 57 | class Swish(nn.Module): 58 | def forward(self, x): 59 | return x * torch.sigmoid(x) 60 | 61 | 62 | def round_filters(filters, global_params): 63 | """ Calculate and round number of filters based on depth multiplier. """ 64 | multiplier = global_params.width_coefficient 65 | if not multiplier: 66 | return filters 67 | divisor = global_params.depth_divisor 68 | min_depth = global_params.min_depth 69 | filters *= multiplier 70 | min_depth = min_depth or divisor 71 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 72 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 73 | new_filters += divisor 74 | return int(new_filters) 75 | 76 | 77 | def round_repeats(repeats, global_params): 78 | """ Round number of filters based on depth multiplier. """ 79 | multiplier = global_params.depth_coefficient 80 | if not multiplier: 81 | return repeats 82 | return int(math.ceil(multiplier * repeats)) 83 | 84 | 85 | def drop_connect(inputs, p, training): 86 | """ Drop connect. 
""" 87 | if not training: return inputs 88 | batch_size = inputs.shape[0] 89 | keep_prob = 1 - p 90 | random_tensor = keep_prob 91 | random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device) 92 | binary_tensor = torch.floor(random_tensor) 93 | output = inputs / keep_prob * binary_tensor 94 | return output 95 | 96 | 97 | def get_same_padding_conv2d(image_size=None): 98 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 99 | Static padding is necessary for ONNX exporting of models. """ 100 | if image_size is None: 101 | return Conv2dDynamicSamePadding 102 | else: 103 | return partial(Conv2dStaticSamePadding, image_size=image_size) 104 | 105 | 106 | class Conv2dDynamicSamePadding(nn.Conv2d): 107 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 108 | 109 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 110 | super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 111 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 112 | 113 | def forward(self, x): 114 | ih, iw = x.size()[-2:] 115 | kh, kw = self.weight.size()[-2:] 116 | sh, sw = self.stride 117 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 118 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 119 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 120 | if pad_h > 0 or pad_w > 0: 121 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 122 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 123 | 124 | 125 | class Identity(nn.Module): 126 | def __init__(self, ): 127 | super(Identity, self).__init__() 128 | 129 | def forward(self, input): 130 | return input 131 | 132 | 133 | ######################################################################## 134 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 135 | ######################################################################## 136 | 137 | 138 | def efficientnet_params(model_name): 139 | """ Map EfficientNet model name to parameter coefficients. """ 140 | params_dict = { 141 | # Coefficients: width,depth,res,dropout 142 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 143 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 144 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 145 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 146 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 147 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 148 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 149 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 150 | 'efficientnet-b8': (2.2, 3.6, 672, 0.5), 151 | 'efficientnet-l2': (4.3, 5.3, 800, 0.5), 152 | } 153 | return params_dict[model_name] 154 | 155 | 156 | class BlockDecoder(object): 157 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 158 | 159 | @staticmethod 160 | def _decode_block_string(block_string): 161 | """ Gets a block through a string notation of arguments. 
""" 162 | assert isinstance(block_string, str) 163 | 164 | ops = block_string.split('_') 165 | options = {} 166 | for op in ops: 167 | splits = re.split(r'(\d.*)', op) 168 | if len(splits) >= 2: 169 | key, value = splits[:2] 170 | options[key] = value 171 | 172 | # Check stride 173 | assert (('s' in options and len(options['s']) == 1) or 174 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 175 | 176 | return BlockArgs( 177 | kernel_size=int(options['k']), 178 | num_repeat=int(options['r']), 179 | input_filters=int(options['i']), 180 | output_filters=int(options['o']), 181 | expand_ratio=int(options['e']), 182 | id_skip=('noskip' not in block_string), 183 | se_ratio=float(options['se']) if 'se' in options else None, 184 | stride=[int(options['s'][0])]) 185 | 186 | @staticmethod 187 | def _encode_block_string(block): 188 | """Encodes a block to a string.""" 189 | args = [ 190 | 'r%d' % block.num_repeat, 191 | 'k%d' % block.kernel_size, 192 | 's%d%d' % (block.strides[0], block.strides[1]), 193 | 'e%s' % block.expand_ratio, 194 | 'i%d' % block.input_filters, 195 | 'o%d' % block.output_filters 196 | ] 197 | if 0 < block.se_ratio <= 1: 198 | args.append('se%s' % block.se_ratio) 199 | if block.id_skip is False: 200 | args.append('noskip') 201 | return '_'.join(args) 202 | 203 | @staticmethod 204 | def decode(string_list): 205 | """ 206 | Decodes a list of string notations to specify blocks inside the network. 207 | 208 | :param string_list: a list of strings, each string is a notation of block 209 | :return: a list of BlockArgs namedtuples of block args 210 | """ 211 | assert isinstance(string_list, list) 212 | blocks_args = [] 213 | for block_string in string_list: 214 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 215 | return blocks_args 216 | 217 | @staticmethod 218 | def encode(blocks_args): 219 | """ 220 | Encodes a list of BlockArgs to a list of strings. 221 | 222 | :param blocks_args: a list of BlockArgs namedtuples of block args 223 | :return: a list of strings, each string is a notation of block 224 | """ 225 | block_strings = [] 226 | for block in blocks_args: 227 | block_strings.append(BlockDecoder._encode_block_string(block)) 228 | return block_strings 229 | 230 | 231 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 232 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 233 | """ Creates a efficientnet model. 
""" 234 | 235 | blocks_args = [ 236 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 237 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 238 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 239 | 'r1_k3_s11_e6_i192_o320_se0.25', 240 | ] 241 | blocks_args = BlockDecoder.decode(blocks_args) 242 | 243 | global_params = GlobalParams( 244 | batch_norm_momentum=0.99, 245 | batch_norm_epsilon=1e-3, 246 | dropout_rate=dropout_rate, 247 | drop_connect_rate=drop_connect_rate, 248 | # data_format='channels_last', # removed, this is always true in PyTorch 249 | num_classes=num_classes, 250 | width_coefficient=width_coefficient, 251 | depth_coefficient=depth_coefficient, 252 | depth_divisor=8, 253 | min_depth=None, 254 | image_size=image_size, 255 | ) 256 | 257 | return blocks_args, global_params 258 | 259 | 260 | def get_model_params(model_name, override_params): 261 | """ Get the block args and global params for a given model """ 262 | if model_name.startswith('efficientnet'): 263 | w, d, s, p = efficientnet_params(model_name) 264 | # note: all models have drop connect rate = 0.2 265 | blocks_args, global_params = efficientnet( 266 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 267 | else: 268 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 269 | if override_params: 270 | # ValueError will be raised here if override_params has fields not included in global_params. 271 | global_params = global_params._replace(**override_params) 272 | return blocks_args, global_params 273 | 274 | 275 | url_map = { 276 | 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth', 277 | 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth', 278 | 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth', 279 | 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth', 280 | 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth', 281 | 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth', 282 | 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth', 283 | 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth', 284 | } 285 | 286 | url_map_advprop = { 287 | 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b0-b64d5a18.pth', 288 | 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b1-0f3ce85a.pth', 289 | 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b2-6e9d97e5.pth', 290 | 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b3-cdd7c0f4.pth', 291 | 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b4-44fb3a87.pth', 292 | 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b5-86493f6b.pth', 293 | 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b6-ac80338e.pth', 294 | 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b7-4652b6dd.pth', 295 | 'efficientnet-b8': 
'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b8-22a8fe65.pth', 296 | } 297 | 298 | 299 | def load_pretrained_weights(model, model_name, load_fc=True, advprop=False): 300 | """ Loads pretrained weights, and downloads if loading for the first time. """ 301 | # AutoAugment or Advprop (different preprocessing) 302 | url_map_ = url_map_advprop if advprop else url_map 303 | state_dict = model_zoo.load_url(url_map_[model_name], map_location=torch.device('cpu')) 304 | # state_dict = torch.load('../../weights/backbone_efficientnetb0.pth') 305 | if load_fc: 306 | ret = model.load_state_dict(state_dict, strict=False) 307 | print(ret) 308 | else: 309 | state_dict.pop('_fc.weight') 310 | state_dict.pop('_fc.bias') 311 | res = model.load_state_dict(state_dict, strict=False) 312 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 313 | print('Loaded pretrained weights for {}'.format(model_name)) 314 | -------------------------------------------------------------------------------- /efficientdet/hico_det_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import sys 6 | sys.path.append("/home/yichen/DenseNet") 7 | 8 | from torch.utils.data import Dataset, DataLoader 9 | from PIL import Image, ImageEnhance, ImageOps, ImageFile 10 | 11 | import cv2 12 | 13 | from tqdm.autonotebook import tqdm 14 | 15 | import datasets.vcoco.vsrl_utils as vu 16 | from efficientdet.vcoco_dataset import * 17 | from efficientdet.help_function import * 18 | 19 | 20 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 21 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 22 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 23 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 24 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 25 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 26 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 27 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 28 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier','toothbrush'] 29 | 30 | 31 | class HICO_DET_Dataset(Dataset): 32 | def __init__(self, root_dir, set='train', transform=None, color_prob=0): 33 | # self.root_dir = root_dir 34 | self.data_dir = root_dir 35 | self.processed_dir = os.path.join(self.data_dir, "hico_processed") 36 | self.setname = set 37 | self.transform = transform 38 | self.color_prob = color_prob 39 | 40 | self.load_object_category() 41 | self.load_verb_category() 42 | self.load_hoi_category() 43 | self.load_ann_list() 44 | self.load_ann_by_image() 45 | 46 | def load_object_category(self): 47 | self.obj_to_id = {} 48 | self.id_to_obj = {} 49 | for id, obj in enumerate(obj_list): 50 | if obj != "": 51 | self.obj_to_id[obj] = id 52 | self.id_to_obj[id] = obj 53 | assert len(self.obj_to_id) == 80 54 | assert len(self.id_to_obj) == 80 55 | 56 | def load_verb_category(self): 57 | self.id_to_verb = {} 58 | self.verb_to_id = {} 59 | verb_list_path = os.path.join(self.processed_dir, 
"verb_list.json") 60 | with open(verb_list_path, "r") as file: 61 | verb_list = json.load(file) 62 | for item in verb_list: 63 | id = int(item["id"]) 64 | name = item["name"] 65 | self.id_to_verb[id] = name 66 | self.verb_to_id[name] = id 67 | self.num_verbs = len(self.verb_to_id) 68 | 69 | def load_hoi_category(self): 70 | self.hoi_to_objid = {} 71 | self.hoi_to_verbid = {} 72 | hoi_list_path = os.path.join(self.processed_dir, "hoi_list.json") 73 | with open(hoi_list_path, "r") as file: 74 | hoi_list = json.load(file) 75 | for item in hoi_list: 76 | hoi_id = int(item["id"]) 77 | object = item["object"] 78 | object = object.replace("_", " ") 79 | verb = item["verb"] 80 | self.hoi_to_objid[hoi_id] = self.obj_to_id[object] 81 | self.hoi_to_verbid[hoi_id] = self.verb_to_id[verb] 82 | self.num_hois = len(self.hoi_to_verbid) 83 | 84 | def load_ann_list(self): 85 | ann_list_path = os.path.join(self.processed_dir, "anno_list.json") 86 | with open(ann_list_path, "r") as file: 87 | ann_list = json.load(file) 88 | split_ann_list = [] 89 | for item in ann_list: 90 | if self.setname in item["global_id"]: 91 | split_ann_list.append(item) 92 | self.split_ann_list = split_ann_list 93 | 94 | def load_ann_by_image(self): 95 | self.ann_by_image = [] 96 | self.hoi_count = np.zeros(self.num_hois).tolist() 97 | self.verb_count = np.zeros(self.num_verbs).tolist() 98 | 99 | for image_id, image_item in enumerate(self.split_ann_list): 100 | img_anns = {} 101 | 102 | image_path_postfix = image_item["image_path_postfix"] 103 | img_path = os.path.join(self.data_dir, "images", image_path_postfix) 104 | img_anns["img_path"] = img_path 105 | 106 | hois = image_item["hois"] 107 | 108 | inters = [] # (human_bbox, object_bbox, object_category, [action_category]) 109 | instances = [] # (instance_bbox, instance_category, [human_actions], [object_actions]) 110 | 111 | for idx, hoi in enumerate(hois): 112 | id_to_inter = {} # (human_id, object_id) : (human_bbox, object_bbox, object_category, [action_category]) 113 | id_to_human = {} # human_id: (instance_bbox, instance_category, [human_actions], []) 114 | id_to_object = {} # object_id: (instance_bbox, instance_category, [object_actions]) 115 | 116 | hoi_id = int(hoi["id"]) 117 | if hoi["invis"]: 118 | continue 119 | # print(len(hoi["connections"]), len(hoi["human_bboxes"]), len(hoi["object_bboxes"])) 120 | for i in range(len(hoi["connections"])): 121 | 122 | connection = hoi["connections"][i] 123 | human_bbox = hoi["human_bboxes"][connection[0]] 124 | object_bbox = hoi["object_bboxes"][connection[1]] 125 | 126 | inter_id = tuple([idx] + connection) 127 | human_id = tuple([idx] + [connection[0]]) 128 | object_id = tuple([idx] + [connection[1]]) 129 | 130 | self.hoi_count[hoi_id - 1] += 1 131 | self.verb_count[self.hoi_to_verbid[hoi_id]-1] += 1 132 | 133 | if inter_id in id_to_inter: 134 | # id_to_inter[inter_id][3].append(hoi_id) 135 | id_to_inter[inter_id][3].append(self.hoi_to_verbid[hoi_id]) 136 | 137 | else: 138 | item = [] 139 | item.append(human_bbox) 140 | item.append(object_bbox) 141 | item.append(self.hoi_to_objid[hoi_id]) 142 | item.append([self.hoi_to_verbid[hoi_id]]) 143 | # item.append([hoi_id]) 144 | id_to_inter[inter_id] = item 145 | 146 | if human_id in id_to_human: 147 | id_to_human[human_id][2].append(self.hoi_to_verbid[hoi_id]) 148 | else: 149 | id_to_human[human_id] = [human_bbox, 0, [self.hoi_to_verbid[hoi_id]], []] 150 | 151 | if object_id in id_to_object: 152 | id_to_object[object_id][3].append(self.hoi_to_verbid[hoi_id]) 153 | else: 154 | 
id_to_object[object_id] = [object_bbox, self.hoi_to_objid[hoi_id], [], [self.hoi_to_verbid[hoi_id]]] 155 | 156 | inters += list(id_to_inter.values()) 157 | instances = instances + list(id_to_human.values()) + list(id_to_object.values()) 158 | 159 | unique_instances = [] 160 | for inst in instances: 161 | m = 0.7 162 | minst = None 163 | for uinst in unique_instances: 164 | if inst[1] == uinst[1] and single_iou(inst[0], uinst[0]) > m: 165 | minst = uinst 166 | m = single_iou(inst[0], uinst[0]) 167 | if minst is None: 168 | unique_instances.append(inst) 169 | else: 170 | minst[2] += inst[2] 171 | minst[3] += inst[3] 172 | 173 | unique_inters = [] 174 | for inter in inters: 175 | m = 0.7 ** 2 176 | minter = None 177 | for uinter in unique_inters: 178 | hiou = single_iou(inter[0], uinter[0]) 179 | oiou = single_iou(inter[1], uinter[1]) 180 | if inter[2] == uinter[2] and hiou > 0.7 and oiou > 0.7 and hiou*oiou > m: 181 | minter = uinter 182 | m = hiou * oiou 183 | if minter is None: 184 | unique_inters.append(inter) 185 | else: 186 | minter[3] += inter[3] 187 | 188 | 189 | # human_instances = list(id_to_human.values()) 190 | # obj_instances = [] 191 | # for id, obj in id_to_object.items(): 192 | # if obj[1] == 0: # human, judge overlap with human instance 193 | # flag = False 194 | # for hinst in human_instances: 195 | # if single_iou(hinst[0], obj[0]) > 0.75: 196 | # hinst[3].extend(obj[3]) 197 | # flag = True 198 | # break 199 | # if not flag: 200 | # obj_instances.append(obj) 201 | # instances = human_instances + obj_instances 202 | 203 | # if len(unique_instances) > 0: 204 | # img_anns["interaction"] = unique_inters 205 | # img_anns["instance"] = unique_instances 206 | # self.ann_by_image.append(img_anns) 207 | # else: 208 | # no_inst += 1 209 | # print("%d images has no instances"%no_inst) 210 | img_anns["interaction"] = unique_inters 211 | img_anns["instance"] = unique_instances 212 | self.ann_by_image.append(img_anns) 213 | self.num_images = len(self.ann_by_image) 214 | # with open("hico-det_hoi_count.json", "w") as file: 215 | # json.dump(self.hoi_count, file) 216 | # with open("hico-det_verb_count.json", "w") as file: 217 | # json.dump(self.verb_count, file) 218 | 219 | def __len__(self): 220 | return self.num_images 221 | 222 | def __getitem__(self, index): 223 | img_item = self.ann_by_image[index] 224 | img = self.load_img(img_item["img_path"]) 225 | 226 | annot_bbox = {"instance": [], "interaction": []} 227 | for i, ann in enumerate(img_item["instance"]): 228 | tmp = np.zeros(4 + 1 + self.num_verbs * 2) # (bbox, obj_cat, human action, object action) 229 | tmp[0:4] = ann[0] # bbox 230 | tmp[4] = ann[1] # object category 231 | human_act = np.zeros(self.num_verbs) # human action 232 | obj_act = np.zeros(self.num_verbs) # object action 233 | 234 | h_acts = np.array(ann[2]) - 1 235 | o_acts = np.array(ann[3]) - 1 236 | 237 | if h_acts.shape[0] > 0: 238 | human_act[h_acts] = 1 239 | if o_acts.shape[0] > 0: 240 | obj_act[o_acts] = 1 241 | 242 | tmp[5:5+self.num_verbs] = human_act 243 | tmp[5+self.num_verbs:5+2*self.num_verbs] = obj_act 244 | annot_bbox["instance"].append(tmp) 245 | 246 | for i, ann in enumerate(img_item["interaction"]): 247 | # tmp = np.zeros(12 + 1 + self.num_hois) # (human bbox, object bbox, union bbox, obj category, union action) 248 | tmp = np.zeros(12 + 1 + self.num_verbs) # (human bbox, object bbox, union bbox, obj category, union action) 249 | tmp[0:4] = ann[0] 250 | tmp[4:8] = ann[1] 251 | tmp[8:12] = self.merge_bbox(ann[0], ann[1]) 252 | tmp[12] = ann[2] 253 | 254 
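# Each interaction row packs: [0:4] human bbox, [4:8] object bbox, [8:12] their union
# (merge_bbox below), [12] the object category id, and [13:13+num_verbs] a multi-hot
# verb vector, which the next few lines fill from the 1-based verb ids.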
| union_acts = np.zeros(self.num_verbs) 255 | 256 | u_acts = np.array(ann[3]) - 1 257 | union_acts[u_acts] = 1 258 | tmp[13:] = union_acts 259 | annot_bbox["interaction"].append(tmp) 260 | 261 | for key in annot_bbox: 262 | annot_bbox[key] = np.array(annot_bbox[key]) 263 | 264 | sample = {'img': img, 'annot': annot_bbox} 265 | if self.transform: 266 | sample = self.transform(sample) 267 | return sample 268 | 269 | def merge_bbox(self, b1, b2): 270 | if b1[0] < 0: 271 | return b2 272 | if b2[0] < 0: 273 | return b1 274 | return [min(b1[0], b2[0]), min(b1[1], b2[1]), 275 | max(b1[2], b2[2]), max(b1[3], b2[3])] 276 | 277 | def load_img(self, img_path): 278 | img = cv2.imread(img_path) 279 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 280 | 281 | if np.random.uniform(0, 1) < self.color_prob: 282 | pil_img = Image.fromarray(img) 283 | img = np.array(randomColor(pil_img)) 284 | return img.astype(np.float32) / 255. 285 | 286 | 287 | if __name__=="__main__": 288 | from torch.utils.data import DataLoader 289 | from torchvision import transforms 290 | training_set = HICO_DET_Dataset(root_dir="/home/yichen/DenseNet/datasets", set="train", 291 | transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 292 | 293 | training_params = {'batch_size': 4, 294 | 'shuffle': False, 295 | 'drop_last': True, 296 | 'collate_fn': collater, 297 | 'num_workers': 0} 298 | training_generator = DataLoader(training_set, **training_params) 299 | 300 | 301 | # print("len:", len(training_generator)) 302 | np.set_printoptions(precision=3, suppress=True, threshold=np.inf) 303 | 304 | for epoch in range(100): 305 | print("epoch:", epoch) 306 | progress_bar = tqdm(training_generator) 307 | for i, data in enumerate(training_generator): 308 | # if iter < step - last_epoch * num_iter_per_epoch: 309 | # progress_bar.update() 310 | # continue 311 | imgs = data['img'] 312 | annot = data['annot'] 313 | 314 | # for key in annot: 315 | # print(key, annot[key].numpy()) 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | 4 | import argparse 5 | import torch 6 | import yaml 7 | import pickle 8 | import numpy as np 9 | 10 | from backbone import EfficientDetBackbone 11 | from efficientdet.utils import BBoxTransform, ClipBoxes 12 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union, transform_action, label_to_class, sub_label_to_class 13 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 14 | from utils.apply_prior import apply_prior 15 | from utils.visual import visual_demo 16 | 17 | 18 | ap = argparse.ArgumentParser() 19 | ap.add_argument('-p', '--project', type=str, default='vcoco', help='project file that contains parameters') 20 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 21 | ap.add_argument('-w', '--weights', type=str, default='weights/vcoco_best.pth', help='/path/to/weights') 22 | ap.add_argument('--image_path', type=str, default='test/test.jpg', help='/path/to/image') 23 | ap.add_argument('--save_path', type=str, default='test/detection.jpg', help='/path/to/detection/result') 24 | ap.add_argument('--cuda', type=int, default=1) 25 | ap.add_argument('--device', type=int, default=0) 26 | ap.add_argument('--float16', type=int, default=0) 27 | 
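# When --flip_test is non-zero, img_detect below also feeds a horizontally flipped copy
# of the image through the network and merges both passes via the *_flip post-processing
# helpers (postprocess_hoi_flip / postprocess_dense_union_flip).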
ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 28 | 29 | args = ap.parse_args() 30 | 31 | image_path = args.image_path 32 | save_path = args.save_path 33 | compound_coef = args.compound_coef 34 | nms_threshold = 0.3 35 | use_cuda = args.cuda 36 | gpu = args.device 37 | use_float16 = args.float16 38 | need_visual = True 39 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 40 | project = args.project 41 | 42 | params = yaml.safe_load(open(f'projects/{project}.yml')) 43 | SET_NAME = params['val_set'] 44 | project_name = params["project_name"] 45 | 46 | 47 | print(f'running demonstration with weights {weights_path} on image {image_path}...') 48 | 49 | params = yaml.safe_load(open(f'projects/{project}.yml')) 50 | obj_list = params['obj_list'] 51 | union_action_list = eval(params['union_action_list']) 52 | 53 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 54 | input_size = input_sizes[compound_coef] 55 | output_dir = f"./logs/{project_name}/results" 56 | 57 | 58 | def calc_ioa(a, b): 59 | # a(anchor) [boxes, (x1, y1, x2, y2)] 60 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 61 | 62 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 63 | 64 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 65 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 66 | exp_y1 = np.expand_dims(a[:, 1], 1) 67 | exp_y2 = np.expand_dims(a[:, 3], 1) 68 | 69 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 70 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 71 | iw = np.where(iw > 0, iw, 0) 72 | ih = np.where(ih > 0, ih, 0) 73 | 74 | intersection = iw * ih 75 | area = np.where(area > 1e-6, area, 1e-6) 76 | IoA = intersection / area 77 | return IoA 78 | 79 | 80 | def calc_iou(a, b): 81 | # a(anchor) [boxes, (x1, y1, x2, y2)] 82 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 83 | 84 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 85 | 86 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 87 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 88 | exp_y1 = np.expand_dims(a[:, 1], 1) 89 | exp_y2 = np.expand_dims(a[:, 3], 1) 90 | 91 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 92 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 93 | 94 | iw = np.where(iw > 0, iw, 0) 95 | ih = np.where(ih > 0, ih, 0) 96 | 97 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 98 | ua = np.where(ua > 0, ua, 1e-8) 99 | 100 | intersection = iw * ih 101 | IoU = intersection / ua 102 | return IoU 103 | 104 | 105 | def xy_to_wh(bbox): 106 | ctr_x = (bbox[0] + bbox[2]) / 2 107 | ctr_y = (bbox[1] + bbox[3]) / 2 108 | width = bbox[2] - bbox[0] 109 | height = bbox[3] - bbox[1] 110 | return ctr_x, ctr_y, width, height 111 | 112 | 113 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 114 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 115 | xt, yt, wt, ht = xy_to_wh(target_bbox) 116 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 117 | dist = np.zeros(2, dtype=np.float) 118 | dist[0] = (xo - xt) / wa 119 | dist[1] = (yo - yt) / ha 120 | 121 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 122 | 123 | 124 | def target_object_dist(target_objects_pos, objects_pos, anchors): 125 | width = anchors[:, 2] - anchors[:, 0] 126 | height = anchors[:, 3] - anchors[:, 1] 127 | anchors_size = np.stack([width, height], axis=1) 
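# target_object_dist: squared distance between the object centre each union box predicts
# (in hoi_match this is the human centre shifted by the predicted sp_vector) and every
# detected instance centre, normalised by the union box size so the score is scale
# invariant. Shape sketch (illustrative only):
#   target_objects_pos (num_union, 2) -> (num_union, 1, 2) after the expand_dims below
#   objects_pos        (num_inst, 2)
#   anchors_size       (num_union, 1, 2)
#   dist               (num_union, num_inst)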
128 | anchors_size = np.expand_dims(anchors_size, axis=1) 129 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 130 | diff = target_objects_pos - objects_pos 131 | diff = diff / anchors_size 132 | dist = np.sum(diff**2, axis=2) 133 | return dist 134 | 135 | 136 | def hoi_match(preds_inst, preds_union, prior_mask, thre=0.05, human_thre=0.6, anchor_thre=0.25, loc_thre=0.1): 137 | num_inst = len(preds_inst["rois"]) 138 | humans = [] 139 | objects = [] 140 | human_bboxes = [] 141 | human_inst_ids = [] 142 | human_role_scores = [] 143 | 144 | while len(humans)==0: 145 | if human_thre < 0.5: 146 | break 147 | for inst_id in range(num_inst): 148 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 149 | continue 150 | item = {} 151 | item["bbox"] = preds_inst["rois"][inst_id] 152 | item["agent_scores"] = preds_inst["act_scores"][inst_id] 153 | item["role_scores"] = transform_action(preds_inst["act_scores"][inst_id], "subject") 154 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 155 | item["inst_id"] = inst_id 156 | humans.append(item) 157 | human_bboxes.append(item["bbox"]) 158 | human_inst_ids.append(item["inst_id"]) 159 | human_role_scores.append(item["role_scores"]) 160 | human_thre -= 0.1 161 | human_bboxes = np.array(human_bboxes) 162 | human_inst_ids = np.array(human_inst_ids) 163 | human_role_scores = np.array(human_role_scores) 164 | 165 | obj_role_scores = [] 166 | for obj_id in range(len(preds_inst["rois"])): 167 | item = {} 168 | obj_role_score = transform_action(preds_inst["act_scores"][obj_id], "object") 169 | obj_role_score = apply_prior(obj_role_score, preds_inst["obj_class_ids"][obj_id]) 170 | item["obj_role_scores"] = obj_role_score 171 | 172 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 173 | 174 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 175 | item["inst_id"] = obj_id 176 | obj_bbox = preds_inst["rois"][obj_id] 177 | item["bbox"] = obj_bbox 178 | objects.append(item) 179 | obj_role_scores.append(obj_role_score) 180 | object_bboxes = np.array(preds_inst["rois"]) 181 | obj_role_scores = np.array(obj_role_scores) 182 | 183 | hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), len(union_action_list)), dtype=np.float) 184 | 185 | if len(human_bboxes) > 0: 186 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 187 | 188 | IoA_max = np.max(IoA, axis=1) 189 | human_foreground = IoA_max > 0.25 190 | human_IoA = IoA[human_foreground] 191 | for key in preds_union: 192 | preds_union[key] = preds_union[key][human_foreground] 193 | 194 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 195 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 196 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 197 | new_IoA_sec_max = np.max(new_IoA, axis=1) 198 | obj_foreground = new_IoA_sec_max > 0.25 199 | for key in preds_union: 200 | preds_union[key] = preds_union[key][obj_foreground] 201 | 202 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 203 | human_IoA = human_IoA[obj_foreground] 204 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.25), axis=1) 205 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 206 | 207 | num_union = len(preds_union["rois"]) 208 | num_human = len(human_bboxes) 209 | 210 | sp_vectors = preds_union["sp_vector"] 211 | inter_human_regions = human_bboxes[human_IoU_argmax] 212 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 213 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 
3]) / 2 214 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 215 | inter_objects_pos = humans_pos + sp_vectors 216 | 217 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 218 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 219 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 220 | 221 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 222 | inter_human_instids = human_inst_ids[human_IoU_argmax] 223 | obj_dists[np.arange(num_union), inter_human_instids] = 100 224 | obj_dists[obj_IoA < 0.25] = 100 225 | inter_obj_ids = np.argmin(obj_dists, 1) 226 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 227 | 228 | sigma = 0.9 229 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 230 | location_scores = np.where(location_scores thre or prior_mask[id, 0] < 0.1: 275 | det[action + "_" + role] = (obj_bbox[0], obj_bbox[1], obj_bbox[2], obj_bbox[3], max_score[id]) 276 | agent_score = max_score[id] 277 | else: 278 | if human["role_scores"][id] > 0.0 and prior_mask[id, 0] > 0.1: 279 | det[action + "_" + role] = (0, 0, 0, 0, human["role_scores"][id] * human["obj_scores"] * prior_mask[id, 0]) 280 | agent_score = human["role_scores"][id] * human["obj_scores"] 281 | 282 | else: 283 | det[action + "_" + role] = (0, 0, 0, 0, 0) 284 | agent_score = 0 285 | 286 | if action + "_agent" not in det: 287 | det[action + "_agent"] = agent_score 288 | else: 289 | det[action + "_agent"] = max(agent_score, det[action + "_agent"]) 290 | for i in range(len(sub_label_to_class)): 291 | action = sub_label_to_class[i] 292 | if action + "_agent" not in det: 293 | det[action+"_agent"] = human["agent_scores"][i] 294 | dets.append(det) 295 | 296 | return dets 297 | 298 | 299 | def img_detect(file, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold): 300 | img_path = file 301 | 302 | ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size) 303 | if use_cuda: 304 | x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0) 305 | else: 306 | x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0) 307 | 308 | x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2) 309 | 310 | if args.flip_test: 311 | ids = torch.arange(x.shape[-1]-1, -1, -1).long().cuda() 312 | x_flip = x[..., ids] 313 | x_cat = torch.cat([x, x_flip], 0) 314 | 315 | with torch.no_grad(): 316 | if args.flip_test: 317 | 318 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 319 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x_cat) 320 | 321 | anchors = torch.cat([anchors, anchors], 0) 322 | preds_union = postprocess_dense_union_flip(x_cat, anchors, union_act_cls, union_sub_reg, union_obj_reg, 323 | regressBoxes, clipBoxes, 0.5, 1) 324 | preds_inst = postprocess_hoi_flip(x_cat, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 325 | regressBoxes, clipBoxes, threshold, nms_threshold, 326 | mode="object", classwise=True) 327 | else: 328 | 329 | 330 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 331 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x) 332 | 333 | preds_union = postprocess_dense_union(x, anchors, union_act_cls, union_sub_reg, union_obj_reg, 334 | regressBoxes, clipBoxes, 0.5, 1, classwise=True) 335 | preds_inst = postprocess_hoi(x, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 336 | regressBoxes, clipBoxes, threshold, nms_threshold, 337 | mode="object", classwise=True) 338 | 339 | preds_inst = 
invert_affine(framed_metas, preds_inst)[0] 340 | preds_union = invert_affine(framed_metas, preds_union)[0] 341 | 342 | dets = hoi_match(preds_inst, preds_union, prior_mask) 343 | 344 | return dets 345 | 346 | 347 | def test(threshold=0.2): 348 | with open("datasets/vcoco/new_prior_mask.pkl", "rb") as file: 349 | prior_mask = pickle.load(file, encoding="bytes") 350 | 351 | model = EfficientDetBackbone(num_classes=len(eval(params["obj_list"])), num_union_classes=25, 352 | num_inst_classes=51, compound_coef=args.compound_coef, 353 | ratios=eval(params["anchors_ratios"]), scales=eval(params["anchors_scales"])) 354 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 355 | model.requires_grad_(False) 356 | model.eval() 357 | 358 | if args.cuda: 359 | model = model.cuda() 360 | if args.float16: 361 | model = model.half() 362 | 363 | regressBoxes = BBoxTransform() 364 | clipBoxes = ClipBoxes() 365 | 366 | img_detection = img_detect(image_path, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold=threshold) 367 | visual_demo(img_detection, image_path, save_path) 368 | 369 | 370 | if __name__ == '__main__': 371 | test() 372 | 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/batchnorm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : batchnorm.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import collections 12 | import contextlib 13 | 14 | import torch 15 | import torch.nn.functional as F 16 | 17 | from torch.nn.modules.batchnorm import _BatchNorm 18 | 19 | try: 20 | from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast 21 | except ImportError: 22 | ReduceAddCoalesced = Broadcast = None 23 | 24 | try: 25 | from jactorch.parallel.comm import SyncMaster 26 | from jactorch.parallel.data_parallel import JacDataParallel as DataParallelWithCallback 27 | except ImportError: 28 | from .comm import SyncMaster 29 | from .replicate import DataParallelWithCallback 30 | 31 | __all__ = [ 32 | 'SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d', 33 | 'patch_sync_batchnorm', 'convert_model' 34 | ] 35 | 36 | 37 | def _sum_ft(tensor): 38 | """sum over the first and last dimention""" 39 | return tensor.sum(dim=0).sum(dim=-1) 40 | 41 | 42 | def _unsqueeze_ft(tensor): 43 | """add new dimensions at the front and the tail""" 44 | return tensor.unsqueeze(0).unsqueeze(-1) 45 | 46 | 47 | _ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size']) 48 | _MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std']) 49 | 50 | 51 | class _SynchronizedBatchNorm(_BatchNorm): 52 | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True): 53 | assert ReduceAddCoalesced is not None, 'Can not use Synchronized Batch Normalization without CUDA support.' 
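# ReduceAddCoalesced / Broadcast come from torch.nn.parallel._functions (see the guarded
# import at the top of this file); the cross-GPU reduction in _data_parallel_master cannot
# run without them, hence the hard assert above.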
54 | 55 | super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine) 56 | 57 | self._sync_master = SyncMaster(self._data_parallel_master) 58 | 59 | self._is_parallel = False 60 | self._parallel_id = None 61 | self._slave_pipe = None 62 | 63 | def forward(self, input): 64 | # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation. 65 | if not (self._is_parallel and self.training): 66 | return F.batch_norm( 67 | input, self.running_mean, self.running_var, self.weight, self.bias, 68 | self.training, self.momentum, self.eps) 69 | 70 | # Resize the input to (B, C, -1). 71 | input_shape = input.size() 72 | input = input.view(input.size(0), self.num_features, -1) 73 | 74 | # Compute the sum and square-sum. 75 | sum_size = input.size(0) * input.size(2) 76 | input_sum = _sum_ft(input) 77 | input_ssum = _sum_ft(input ** 2) 78 | 79 | # Reduce-and-broadcast the statistics. 80 | if self._parallel_id == 0: 81 | mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size)) 82 | else: 83 | mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size)) 84 | 85 | # Compute the output. 86 | if self.affine: 87 | # MJY:: Fuse the multiplication for speed. 88 | output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias) 89 | else: 90 | output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std) 91 | 92 | # Reshape it. 93 | return output.view(input_shape) 94 | 95 | def __data_parallel_replicate__(self, ctx, copy_id): 96 | self._is_parallel = True 97 | self._parallel_id = copy_id 98 | 99 | # parallel_id == 0 means master device. 100 | if self._parallel_id == 0: 101 | ctx.sync_master = self._sync_master 102 | else: 103 | self._slave_pipe = ctx.sync_master.register_slave(copy_id) 104 | 105 | def _data_parallel_master(self, intermediates): 106 | """Reduce the sum and square-sum, compute the statistics, and broadcast it.""" 107 | 108 | # Always using same "device order" makes the ReduceAdd operation faster. 109 | # Thanks to:: Tete Xiao (http://tetexiao.com/) 110 | intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device()) 111 | 112 | to_reduce = [i[1][:2] for i in intermediates] 113 | to_reduce = [j for i in to_reduce for j in i] # flatten 114 | target_gpus = [i[1].sum.get_device() for i in intermediates] 115 | 116 | sum_size = sum([i[1].sum_size for i in intermediates]) 117 | sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce) 118 | mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size) 119 | 120 | broadcasted = Broadcast.apply(target_gpus, mean, inv_std) 121 | 122 | outputs = [] 123 | for i, rec in enumerate(intermediates): 124 | outputs.append((rec[0], _MasterMessage(*broadcasted[i*2:i*2+2]))) 125 | 126 | return outputs 127 | 128 | def _compute_mean_std(self, sum_, ssum, size): 129 | """Compute the mean and standard-deviation with sum and square-sum. This method 130 | also maintains the moving average on the master device.""" 131 | assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.' 
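# Worked example (hand computation, not from any dataset): for per-channel values
# [1, 2, 3] -> sum_ = 6, ssum = 14, size = 3, so mean = 2, sumvar = 14 - 6*2 = 2,
# unbias_var = 2 / (3 - 1) = 1 (used to update running_var) and bias_var = 2 / 3
# (used for normalisation); the method returns mean and bias_var.clamp(eps) ** -0.5,
# i.e. 1 / sqrt(2/3) ~= 1.22.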
132 | mean = sum_ / size 133 | sumvar = ssum - sum_ * mean 134 | unbias_var = sumvar / (size - 1) 135 | bias_var = sumvar / size 136 | 137 | if hasattr(torch, 'no_grad'): 138 | with torch.no_grad(): 139 | self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data 140 | self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data 141 | else: 142 | self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data 143 | self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data 144 | 145 | return mean, bias_var.clamp(self.eps) ** -0.5 146 | 147 | 148 | class SynchronizedBatchNorm1d(_SynchronizedBatchNorm): 149 | r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a 150 | mini-batch. 151 | 152 | .. math:: 153 | 154 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 155 | 156 | This module differs from the built-in PyTorch BatchNorm1d as the mean and 157 | standard-deviation are reduced across all devices during training. 158 | 159 | For example, when one uses `nn.DataParallel` to wrap the network during 160 | training, PyTorch's implementation normalize the tensor on each device using 161 | the statistics only on that device, which accelerated the computation and 162 | is also easy to implement, but the statistics might be inaccurate. 163 | Instead, in this synchronized version, the statistics will be computed 164 | over all training samples distributed on multiple devices. 165 | 166 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 167 | as the built-in PyTorch implementation. 168 | 169 | The mean and standard-deviation are calculated per-dimension over 170 | the mini-batches and gamma and beta are learnable parameter vectors 171 | of size C (where C is the input size). 172 | 173 | During training, this layer keeps a running estimate of its computed mean 174 | and variance. The running sum is kept with a default momentum of 0.1. 175 | 176 | During evaluation, this running mean/variance is used for normalization. 177 | 178 | Because the BatchNorm is done over the `C` dimension, computing statistics 179 | on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm 180 | 181 | Args: 182 | num_features: num_features from an expected input of size 183 | `batch_size x num_features [x width]` 184 | eps: a value added to the denominator for numerical stability. 185 | Default: 1e-5 186 | momentum: the value used for the running_mean and running_var 187 | computation. Default: 0.1 188 | affine: a boolean value that when set to ``True``, gives the layer learnable 189 | affine parameters. 
Default: ``True`` 190 | 191 | Shape:: 192 | - Input: :math:`(N, C)` or :math:`(N, C, L)` 193 | - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) 194 | 195 | Examples: 196 | >>> # With Learnable Parameters 197 | >>> m = SynchronizedBatchNorm1d(100) 198 | >>> # Without Learnable Parameters 199 | >>> m = SynchronizedBatchNorm1d(100, affine=False) 200 | >>> input = torch.autograd.Variable(torch.randn(20, 100)) 201 | >>> output = m(input) 202 | """ 203 | 204 | def _check_input_dim(self, input): 205 | if input.dim() != 2 and input.dim() != 3: 206 | raise ValueError('expected 2D or 3D input (got {}D input)' 207 | .format(input.dim())) 208 | super(SynchronizedBatchNorm1d, self)._check_input_dim(input) 209 | 210 | 211 | class SynchronizedBatchNorm2d(_SynchronizedBatchNorm): 212 | r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch 213 | of 3d inputs 214 | 215 | .. math:: 216 | 217 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 218 | 219 | This module differs from the built-in PyTorch BatchNorm2d as the mean and 220 | standard-deviation are reduced across all devices during training. 221 | 222 | For example, when one uses `nn.DataParallel` to wrap the network during 223 | training, PyTorch's implementation normalize the tensor on each device using 224 | the statistics only on that device, which accelerated the computation and 225 | is also easy to implement, but the statistics might be inaccurate. 226 | Instead, in this synchronized version, the statistics will be computed 227 | over all training samples distributed on multiple devices. 228 | 229 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 230 | as the built-in PyTorch implementation. 231 | 232 | The mean and standard-deviation are calculated per-dimension over 233 | the mini-batches and gamma and beta are learnable parameter vectors 234 | of size C (where C is the input size). 235 | 236 | During training, this layer keeps a running estimate of its computed mean 237 | and variance. The running sum is kept with a default momentum of 0.1. 238 | 239 | During evaluation, this running mean/variance is used for normalization. 240 | 241 | Because the BatchNorm is done over the `C` dimension, computing statistics 242 | on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm 243 | 244 | Args: 245 | num_features: num_features from an expected input of 246 | size batch_size x num_features x height x width 247 | eps: a value added to the denominator for numerical stability. 248 | Default: 1e-5 249 | momentum: the value used for the running_mean and running_var 250 | computation. Default: 0.1 251 | affine: a boolean value that when set to ``True``, gives the layer learnable 252 | affine parameters. 
Default: ``True`` 253 | 254 | Shape:: 255 | - Input: :math:`(N, C, H, W)` 256 | - Output: :math:`(N, C, H, W)` (same shape as input) 257 | 258 | Examples: 259 | >>> # With Learnable Parameters 260 | >>> m = SynchronizedBatchNorm2d(100) 261 | >>> # Without Learnable Parameters 262 | >>> m = SynchronizedBatchNorm2d(100, affine=False) 263 | >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45)) 264 | >>> output = m(input) 265 | """ 266 | 267 | def _check_input_dim(self, input): 268 | if input.dim() != 4: 269 | raise ValueError('expected 4D input (got {}D input)' 270 | .format(input.dim())) 271 | super(SynchronizedBatchNorm2d, self)._check_input_dim(input) 272 | 273 | 274 | class SynchronizedBatchNorm3d(_SynchronizedBatchNorm): 275 | r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch 276 | of 4d inputs 277 | 278 | .. math:: 279 | 280 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 281 | 282 | This module differs from the built-in PyTorch BatchNorm3d as the mean and 283 | standard-deviation are reduced across all devices during training. 284 | 285 | For example, when one uses `nn.DataParallel` to wrap the network during 286 | training, PyTorch's implementation normalize the tensor on each device using 287 | the statistics only on that device, which accelerated the computation and 288 | is also easy to implement, but the statistics might be inaccurate. 289 | Instead, in this synchronized version, the statistics will be computed 290 | over all training samples distributed on multiple devices. 291 | 292 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 293 | as the built-in PyTorch implementation. 294 | 295 | The mean and standard-deviation are calculated per-dimension over 296 | the mini-batches and gamma and beta are learnable parameter vectors 297 | of size C (where C is the input size). 298 | 299 | During training, this layer keeps a running estimate of its computed mean 300 | and variance. The running sum is kept with a default momentum of 0.1. 301 | 302 | During evaluation, this running mean/variance is used for normalization. 303 | 304 | Because the BatchNorm is done over the `C` dimension, computing statistics 305 | on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm 306 | or Spatio-temporal BatchNorm 307 | 308 | Args: 309 | num_features: num_features from an expected input of 310 | size batch_size x num_features x depth x height x width 311 | eps: a value added to the denominator for numerical stability. 312 | Default: 1e-5 313 | momentum: the value used for the running_mean and running_var 314 | computation. Default: 0.1 315 | affine: a boolean value that when set to ``True``, gives the layer learnable 316 | affine parameters. 
Default: ``True`` 317 | 318 | Shape:: 319 | - Input: :math:`(N, C, D, H, W)` 320 | - Output: :math:`(N, C, D, H, W)` (same shape as input) 321 | 322 | Examples: 323 | >>> # With Learnable Parameters 324 | >>> m = SynchronizedBatchNorm3d(100) 325 | >>> # Without Learnable Parameters 326 | >>> m = SynchronizedBatchNorm3d(100, affine=False) 327 | >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10)) 328 | >>> output = m(input) 329 | """ 330 | 331 | def _check_input_dim(self, input): 332 | if input.dim() != 5: 333 | raise ValueError('expected 5D input (got {}D input)' 334 | .format(input.dim())) 335 | super(SynchronizedBatchNorm3d, self)._check_input_dim(input) 336 | 337 | 338 | @contextlib.contextmanager 339 | def patch_sync_batchnorm(): 340 | import torch.nn as nn 341 | 342 | backup = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d 343 | 344 | nn.BatchNorm1d = SynchronizedBatchNorm1d 345 | nn.BatchNorm2d = SynchronizedBatchNorm2d 346 | nn.BatchNorm3d = SynchronizedBatchNorm3d 347 | 348 | yield 349 | 350 | nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d = backup 351 | 352 | 353 | def convert_model(module): 354 | """Traverse the input module and its child recursively 355 | and replace all instance of torch.nn.modules.batchnorm.BatchNorm*N*d 356 | to SynchronizedBatchNorm*N*d 357 | 358 | Args: 359 | module: the input module needs to be convert to SyncBN model 360 | 361 | Examples: 362 | >>> import torch.nn as nn 363 | >>> import torchvision 364 | >>> # m is a standard pytorch model 365 | >>> m = torchvision.models.resnet18(True) 366 | >>> m = nn.DataParallel(m) 367 | >>> # after convert, m is using SyncBN 368 | >>> m = convert_model(m) 369 | """ 370 | if isinstance(module, torch.nn.DataParallel): 371 | mod = module.module 372 | mod = convert_model(mod) 373 | mod = DataParallelWithCallback(mod, device_ids=module.device_ids) 374 | return mod 375 | 376 | mod = module 377 | for pth_module, sync_module in zip([torch.nn.modules.batchnorm.BatchNorm1d, 378 | torch.nn.modules.batchnorm.BatchNorm2d, 379 | torch.nn.modules.batchnorm.BatchNorm3d], 380 | [SynchronizedBatchNorm1d, 381 | SynchronizedBatchNorm2d, 382 | SynchronizedBatchNorm3d]): 383 | if isinstance(module, pth_module): 384 | mod = sync_module(module.num_features, module.eps, module.momentum, module.affine) 385 | mod.running_mean = module.running_mean 386 | mod.running_var = module.running_var 387 | if module.affine: 388 | mod.weight.data = module.weight.data.clone().detach() 389 | mod.bias.data = module.bias.data.clone().detach() 390 | 391 | for name, child in module.named_children(): 392 | mod.add_module(name, convert_model(child)) 393 | 394 | return mod 395 | -------------------------------------------------------------------------------- /test_vcoco.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import cv2 4 | import time 5 | import threading 6 | 7 | import argparse 8 | import torch 9 | import yaml 10 | import pickle 11 | import numpy as np 12 | 13 | from utils.vsrl_eval import VCOCOeval 14 | from backbone import EfficientDetBackbone 15 | from efficientdet.utils import BBoxTransform, ClipBoxes 16 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union, transform_action, label_to_class, sub_label_to_class 17 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 18 | from utils.apply_prior import apply_prior 19 | from 
utils.timer import Timer 20 | from utils.visual import visual 21 | 22 | 23 | ap = argparse.ArgumentParser() 24 | ap.add_argument('-p', '--project', type=str, default='vcoco', help='project file that contains parameters') 25 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 26 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 27 | ap.add_argument('--nms_threshold', type=float, default=0.3, help='nms threshold, don\'t change it if not for testing purposes') 28 | ap.add_argument('--cuda', type=int, default=1) 29 | ap.add_argument('--device', type=int, default=0) 30 | ap.add_argument('--float16', type=int, default=0) 31 | ap.add_argument('--override', type=int, default=0, help='override previous bbox results file if exists') 32 | ap.add_argument('--data_dir', type=str, default='./datasets', help='the root folder of dataset') 33 | ap.add_argument('--need_visual', type=int, default=0, help='whether need to visualize the results') 34 | ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 35 | 36 | args = ap.parse_args() 37 | 38 | compound_coef = args.compound_coef 39 | nms_threshold = args.nms_threshold 40 | use_cuda = args.cuda 41 | gpu = args.device 42 | use_float16 = args.float16 43 | override_prev_results = args.override 44 | need_visual = args.need_visual 45 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 46 | data_dir = args.data_dir 47 | project = args.project 48 | 49 | params = yaml.safe_load(open(f'projects/{project}.yml')) 50 | SET_NAME = params['val_set'] 51 | project_name = params["project_name"] 52 | 53 | 54 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 55 | 56 | params = yaml.safe_load(open(f'projects/{project}.yml')) 57 | obj_list = params['obj_list'] 58 | union_action_list = eval(params['union_action_list']) 59 | 60 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 61 | input_size = input_sizes[compound_coef] 62 | output_dir = f"./logs/{project_name}/results" 63 | 64 | if not os.path.exists(output_dir): 65 | os.mkdir(output_dir) 66 | 67 | if args.flip_test: 68 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_flip_final.pkl') 69 | else: 70 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_final.pkl') 71 | 72 | def calc_ioa(a, b): 73 | # a(anchor) [boxes, (x1, y1, x2, y2)] 74 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 75 | 76 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 77 | 78 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 79 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 80 | exp_y1 = np.expand_dims(a[:, 1], 1) 81 | exp_y2 = np.expand_dims(a[:, 3], 1) 82 | 83 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 84 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 85 | iw = np.where(iw > 0, iw, 0) 86 | ih = np.where(ih > 0, ih, 0) 87 | 88 | intersection = iw * ih 89 | area = np.where(area > 1e-6, area, 1e-6) 90 | IoA = intersection / area 91 | return IoA 92 | 93 | 94 | def calc_iou(a, b): 95 | # a(anchor) [boxes, (x1, y1, x2, y2)] 96 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 97 | 98 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 99 | 100 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 101 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 102 | exp_y1 = np.expand_dims(a[:, 1], 1) 103 | exp_y2 = 
np.expand_dims(a[:, 3], 1) 104 | 105 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 106 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 107 | 108 | iw = np.where(iw > 0, iw, 0) 109 | ih = np.where(ih > 0, ih, 0) 110 | 111 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 112 | ua = np.where(ua > 0, ua, 1e-8) 113 | 114 | intersection = iw * ih 115 | IoU = intersection / ua 116 | return IoU 117 | 118 | 119 | def xy_to_wh(bbox): 120 | ctr_x = (bbox[0] + bbox[2]) / 2 121 | ctr_y = (bbox[1] + bbox[3]) / 2 122 | width = bbox[2] - bbox[0] 123 | height = bbox[3] - bbox[1] 124 | return ctr_x, ctr_y, width, height 125 | 126 | 127 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 128 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 129 | xt, yt, wt, ht = xy_to_wh(target_bbox) 130 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 131 | dist = np.zeros(2, dtype=np.float) 132 | dist[0] = (xo - xt) / wa 133 | dist[1] = (yo - yt) / ha 134 | 135 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 136 | 137 | 138 | def target_object_dist(target_objects_pos, objects_pos, anchors): 139 | width = anchors[:, 2] - anchors[:, 0] 140 | height = anchors[:, 3] - anchors[:, 1] 141 | anchors_size = np.stack([width, height], axis=1) 142 | anchors_size = np.expand_dims(anchors_size, axis=1) 143 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 144 | diff = target_objects_pos - objects_pos 145 | diff = diff / anchors_size 146 | dist = np.sum(diff**2, axis=2) 147 | return dist 148 | 149 | 150 | def hoi_match(image_id, preds_inst, preds_union, prior_mask, thre=0.05, human_thre=0.6, anchor_thre=0.25, loc_thre=0.1): 151 | num_inst = len(preds_inst["rois"]) 152 | humans = [] 153 | objects = [] 154 | human_bboxes = [] 155 | human_inst_ids = [] 156 | human_role_scores = [] 157 | 158 | while len(humans)==0: 159 | if human_thre < 0.5: 160 | break 161 | for inst_id in range(num_inst): 162 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 163 | continue 164 | item = {} 165 | item["bbox"] = preds_inst["rois"][inst_id] 166 | item["agent_scores"] = preds_inst["act_scores"][inst_id] 167 | item["role_scores"] = transform_action(preds_inst["act_scores"][inst_id], "subject") 168 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 169 | item["inst_id"] = inst_id 170 | humans.append(item) 171 | human_bboxes.append(item["bbox"]) 172 | human_inst_ids.append(item["inst_id"]) 173 | human_role_scores.append(item["role_scores"]) 174 | human_thre -= 0.1 175 | human_bboxes = np.array(human_bboxes) 176 | human_inst_ids = np.array(human_inst_ids) 177 | human_role_scores = np.array(human_role_scores) 178 | 179 | obj_role_scores = [] 180 | for obj_id in range(len(preds_inst["rois"])): 181 | item = {} 182 | obj_role_score = transform_action(preds_inst["act_scores"][obj_id], "object") 183 | obj_role_score = apply_prior(obj_role_score, preds_inst["obj_class_ids"][obj_id]) 184 | item["obj_role_scores"] = obj_role_score 185 | 186 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 187 | 188 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 189 | item["inst_id"] = obj_id 190 | obj_bbox = preds_inst["rois"][obj_id] 191 | item["bbox"] = obj_bbox 192 | objects.append(item) 193 | obj_role_scores.append(obj_role_score) 194 | object_bboxes = np.array(preds_inst["rois"]) 195 | obj_role_scores = np.array(obj_role_scores) 196 | 197 | 
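# hoi_pair_score holds one score per (detected human, detected instance, union action)
# triple, shape (len(humans), num detected instances, len(union_action_list)); the
# matching logic below populates it from the union-branch predictions.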
hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), len(union_action_list)), dtype=np.float) 198 | 199 | if len(human_bboxes) > 0: 200 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 201 | 202 | IoA_max = np.max(IoA, axis=1) 203 | human_foreground = IoA_max > 0.25 204 | human_IoA = IoA[human_foreground] 205 | for key in preds_union: 206 | preds_union[key] = preds_union[key][human_foreground] 207 | 208 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 209 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 210 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 211 | new_IoA_sec_max = np.max(new_IoA, axis=1) 212 | obj_foreground = new_IoA_sec_max > 0.25 213 | for key in preds_union: 214 | preds_union[key] = preds_union[key][obj_foreground] 215 | 216 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 217 | human_IoA = human_IoA[obj_foreground] 218 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.25), axis=1) 219 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 220 | 221 | num_union = len(preds_union["rois"]) 222 | num_human = len(human_bboxes) 223 | 224 | sp_vectors = preds_union["sp_vector"] 225 | inter_human_regions = human_bboxes[human_IoU_argmax] 226 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 227 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 3]) / 2 228 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 229 | inter_objects_pos = humans_pos + sp_vectors 230 | 231 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 232 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 233 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 234 | 235 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 236 | inter_human_instids = human_inst_ids[human_IoU_argmax] 237 | obj_dists[np.arange(num_union), inter_human_instids] = 100 238 | obj_dists[obj_IoA < 0.25] = 100 239 | inter_obj_ids = np.argmin(obj_dists, 1) 240 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 241 | 242 | sigma = 0.9 243 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 244 | location_scores = np.where(location_scores thre or prior_mask[id, 0] < 0.1: 290 | det[action + "_" + role] = (obj_bbox[0], obj_bbox[1], obj_bbox[2], obj_bbox[3], max_score[id]) 291 | agent_score = max_score[id] 292 | else: 293 | if human["role_scores"][id] > 0.0 and prior_mask[id, 0] > 0.1: 294 | det[action + "_" + role] = (0, 0, 0, 0, human["role_scores"][id] * human["obj_scores"] * prior_mask[id, 0]) 295 | agent_score = human["role_scores"][id] * human["obj_scores"] 296 | 297 | else: 298 | det[action + "_" + role] = (0, 0, 0, 0, 0) 299 | agent_score = 0 300 | 301 | if action + "_agent" not in det: 302 | det[action + "_agent"] = agent_score 303 | else: 304 | det[action + "_agent"] = max(agent_score, det[action + "_agent"]) 305 | for i in range(len(sub_label_to_class)): 306 | action = sub_label_to_class[i] 307 | if action + "_agent" not in det: 308 | det[action+"_agent"] = human["agent_scores"][i] 309 | dets.append(det) 310 | 311 | return dets 312 | 313 | 314 | def img_detect(file, img_dir, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold): 315 | fname, ext = os.path.splitext(file) 316 | image_id = int(fname.split("_")[-1]) 317 | 318 | img_path = os.path.join(img_dir, file) 319 | ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size) 320 | if use_cuda: 321 | x = 
torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0) 322 | else: 323 | x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0) 324 | 325 | x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2) 326 | 327 | if args.flip_test: 328 | ids = torch.arange(x.shape[-1]-1, -1, -1).long().cuda() 329 | x_flip = x[..., ids] 330 | x_cat = torch.cat([x, x_flip], 0) 331 | 332 | with torch.no_grad(): 333 | if args.flip_test: 334 | 335 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 336 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x_cat) 337 | 338 | anchors = torch.cat([anchors, anchors], 0) 339 | preds_union = postprocess_dense_union_flip(x_cat, anchors, union_act_cls, union_sub_reg, union_obj_reg, 340 | regressBoxes, clipBoxes, 0.5, 1) 341 | preds_inst = postprocess_hoi_flip(x_cat, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 342 | regressBoxes, clipBoxes, threshold, nms_threshold, 343 | mode="object", classwise=True) 344 | else: 345 | 346 | 347 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 348 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x) 349 | 350 | preds_union = postprocess_dense_union(x, anchors, union_act_cls, union_sub_reg, union_obj_reg, 351 | regressBoxes, clipBoxes, 0.5, 1, classwise=True) 352 | preds_inst = postprocess_hoi(x, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 353 | regressBoxes, clipBoxes, threshold, nms_threshold, 354 | mode="object", classwise=True) 355 | 356 | preds_inst = invert_affine(framed_metas, preds_inst)[0] 357 | preds_union = invert_affine(framed_metas, preds_union)[0] 358 | 359 | dets = hoi_match(image_id, preds_inst, preds_union, prior_mask) 360 | 361 | return dets 362 | 363 | 364 | def test(threshold=0.2): 365 | with open("datasets/vcoco/new_prior_mask.pkl", "rb") as file: 366 | prior_mask = pickle.load(file, encoding="bytes") 367 | 368 | model = EfficientDetBackbone(num_classes=len(eval(params["obj_list"])), num_union_classes=25, 369 | num_inst_classes=51, compound_coef=args.compound_coef, 370 | ratios=eval(params["anchors_ratios"]), scales=eval(params["anchors_scales"])) 371 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 372 | model.requires_grad_(False) 373 | model.eval() 374 | 375 | if args.cuda: 376 | model = model.cuda() 377 | if args.float16: 378 | model = model.half() 379 | 380 | regressBoxes = BBoxTransform() 381 | clipBoxes = ClipBoxes() 382 | 383 | img_dir = os.path.join(data_dir, "vcoco/coco/images/%s" % "val2014") 384 | 385 | with open(os.path.join(data_dir, 'vcoco/data/splits/vcoco_test.ids'), 'r') as f: 386 | image_ids = f.readlines() 387 | image_ids = [int(id) for id in image_ids] 388 | 389 | _t = {'im_detect': Timer(), 'misc': Timer()} 390 | detection = [] 391 | 392 | for i, image_id in enumerate(image_ids): 393 | 394 | _t['im_detect'].tic() 395 | 396 | file = "COCO_val2014_" + (str(image_id)).zfill(12) + '.jpg' 397 | 398 | img_detection = img_detect(file, img_dir, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold=threshold) 399 | detection.extend(img_detection) 400 | if need_visual: 401 | visual(img_detection, image_id) 402 | _t['im_detect'].toc() 403 | 404 | print('im_detect: {:d}/{:d}, average time: {:.3f}s'.format(i + 1, len(image_ids), _t['im_detect'].average_time)) 405 | 406 | with open(detection_path, "wb") as file: 407 | pickle.dump(detection, file) 408 | 409 | 410 | if __name__ == '__main__': 411 | vsrl_annot_file = "./datasets/vcoco/data/vcoco/vcoco_test.json" 412 | 
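# V-COCO evaluation inputs: vsrl_annot_file holds the verb/role annotations for the test
# split, coco_file the COCO val2014 instance annotations, and split_file the list of test
# image ids; VCOCOeval then scores the pickled detections at an IoU threshold of 0.5
# (ovr_thresh=0.5).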
coco_file = "./datasets/vcoco/coco/annotations/instances_val2014.json" 413 | split_file = "./datasets/vcoco/data/splits/vcoco_test.ids" 414 | if override_prev_results or not os.path.exists(detection_path): 415 | test() 416 | vcocoeval = VCOCOeval(vsrl_annot_file, coco_file, split_file) 417 | vcocoeval._do_eval(detection_path, ovr_thresh=0.5) 418 | 419 | 420 | 421 | 422 | -------------------------------------------------------------------------------- /efficientdet/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops.boxes import nms as nms_torch 4 | 5 | from efficientnet import EfficientNet as EffNet 6 | from efficientnet.utils import MemoryEfficientSwish, Swish 7 | from efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding 8 | 9 | 10 | def nms(dets, thresh): 11 | return nms_torch(dets[:, :4], dets[:, 4], thresh) 12 | 13 | 14 | class SeparableConvBlock(nn.Module): 15 | """ 16 | created by Zylo117 17 | """ 18 | 19 | def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False): 20 | super(SeparableConvBlock, self).__init__() 21 | if out_channels is None: 22 | out_channels = in_channels 23 | 24 | # Q: whether separate conv 25 | # share bias between depthwise_conv and pointwise_conv 26 | # or just pointwise_conv apply bias. 27 | # A: Confirmed, just pointwise_conv applies bias, depthwise_conv has no bias. 28 | 29 | self.depthwise_conv = Conv2dStaticSamePadding(in_channels, in_channels, 30 | kernel_size=3, stride=1, groups=in_channels, bias=False) 31 | self.pointwise_conv = Conv2dStaticSamePadding(in_channels, out_channels, kernel_size=1, stride=1) 32 | 33 | self.norm = norm 34 | if self.norm: 35 | # Warning: pytorch momentum is different from tensorflow's, momentum_pytorch = 1 - momentum_tensorflow 36 | self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3) 37 | 38 | self.activation = activation 39 | if self.activation: 40 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 41 | 42 | def forward(self, x): 43 | x = self.depthwise_conv(x) 44 | x = self.pointwise_conv(x) 45 | 46 | if self.norm: 47 | x = self.bn(x) 48 | 49 | if self.activation: 50 | x = self.swish(x) 51 | 52 | return x 53 | 54 | 55 | class BiFPN(nn.Module): 56 | """ 57 | modified by Zylo117 58 | """ 59 | 60 | def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True): 61 | """ 62 | 63 | Args: 64 | num_channels: 65 | conv_channels: 66 | first_time: whether the input comes directly from the efficientnet, 67 | if True, downchannel it first, and downsample P5 to generate P6 then P7 68 | epsilon: epsilon of fast weighted attention sum of BiFPN, not the BN's epsilon 69 | onnx_export: if True, use Swish instead of MemoryEfficientSwish 70 | """ 71 | super(BiFPN, self).__init__() 72 | self.epsilon = epsilon 73 | # Conv layers 74 | self.conv6_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 75 | self.conv5_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 76 | self.conv4_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 77 | self.conv3_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 78 | self.conv4_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 79 | self.conv5_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 80 | self.conv6_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 
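# The *_up blocks refine the top-down pathway (P7 -> P3) and the *_down blocks the
# bottom-up pathway (P4 -> P7); every fusion node uses a depthwise-separable 3x3 conv
# (SeparableConvBlock above), which keeps the extra cost per BiFPN layer small.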
81 | self.conv7_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 82 | 83 | # Feature scaling layers 84 | self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest') 85 | self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest') 86 | self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest') 87 | self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest') 88 | 89 | self.p4_downsample = MaxPool2dStaticSamePadding(3, 2) 90 | self.p5_downsample = MaxPool2dStaticSamePadding(3, 2) 91 | self.p6_downsample = MaxPool2dStaticSamePadding(3, 2) 92 | self.p7_downsample = MaxPool2dStaticSamePadding(3, 2) 93 | 94 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 95 | 96 | self.first_time = first_time 97 | if self.first_time: 98 | self.p5_down_channel = nn.Sequential( 99 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 100 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 101 | ) 102 | self.p4_down_channel = nn.Sequential( 103 | Conv2dStaticSamePadding(conv_channels[1], num_channels, 1), 104 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 105 | ) 106 | self.p3_down_channel = nn.Sequential( 107 | Conv2dStaticSamePadding(conv_channels[0], num_channels, 1), 108 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 109 | ) 110 | 111 | self.p5_to_p6 = nn.Sequential( 112 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 113 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 114 | MaxPool2dStaticSamePadding(3, 2) 115 | ) 116 | self.p6_to_p7 = nn.Sequential( 117 | MaxPool2dStaticSamePadding(3, 2) 118 | ) 119 | 120 | self.p4_down_channel_2 = nn.Sequential( 121 | Conv2dStaticSamePadding(conv_channels[1], num_channels, 1), 122 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 123 | ) 124 | self.p5_down_channel_2 = nn.Sequential( 125 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 126 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 127 | ) 128 | 129 | # Weight 130 | self.p6_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 131 | self.p6_w1_relu = nn.ReLU() 132 | self.p5_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 133 | self.p5_w1_relu = nn.ReLU() 134 | self.p4_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 135 | self.p4_w1_relu = nn.ReLU() 136 | self.p3_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 137 | self.p3_w1_relu = nn.ReLU() 138 | 139 | self.p4_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 140 | self.p4_w2_relu = nn.ReLU() 141 | self.p5_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 142 | self.p5_w2_relu = nn.ReLU() 143 | self.p6_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 144 | self.p6_w2_relu = nn.ReLU() 145 | self.p7_w2 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 146 | self.p7_w2_relu = nn.ReLU() 147 | 148 | self.attention = attention 149 | 150 | def forward(self, inputs): 151 | """ 152 | illustration of a minimal bifpn unit 153 | P7_0 -------------------------> P7_2 --------> 154 | |-------------| ↑ 155 | ↓ | 156 | P6_0 ---------> P6_1 ---------> P6_2 --------> 157 | |-------------|--------------↑ ↑ 158 | ↓ | 159 | P5_0 ---------> P5_1 ---------> P5_2 --------> 160 | |-------------|--------------↑ ↑ 161 | ↓ | 162 | P4_0 ---------> P4_1 ---------> P4_2 --------> 163 | |-------------|--------------↑ ↑ 164 | |--------------↓ | 165 | P3_0 -------------------------> 
P3_2 --------> 166 | """ 167 | 168 | # downsample channels using same-padding conv2d to target phase's if not the same 169 | # judge: same phase as target, 170 | # if same, pass; 171 | # elif earlier phase, downsample to target phase's by pooling 172 | # elif later phase, upsample to target phase's by nearest interpolation 173 | 174 | if self.attention: 175 | p3_out, p4_out, p5_out, p6_out, p7_out = self._forward_fast_attention(inputs) 176 | else: 177 | p3_out, p4_out, p5_out, p6_out, p7_out = self._forward(inputs) 178 | 179 | return p3_out, p4_out, p5_out, p6_out, p7_out 180 | 181 | def _forward_fast_attention(self, inputs): 182 | if self.first_time: 183 | p3, p4, p5 = inputs 184 | 185 | p6_in = self.p5_to_p6(p5) 186 | p7_in = self.p6_to_p7(p6_in) 187 | 188 | p3_in = self.p3_down_channel(p3) 189 | p4_in = self.p4_down_channel(p4) 190 | p5_in = self.p5_down_channel(p5) 191 | 192 | else: 193 | # P3_0, P4_0, P5_0, P6_0 and P7_0 194 | p3_in, p4_in, p5_in, p6_in, p7_in = inputs 195 | 196 | # P7_0 to P7_2 197 | 198 | # Weights for P6_0 and P7_0 to P6_1 199 | p6_w1 = self.p6_w1_relu(self.p6_w1) 200 | weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon) 201 | # Connections for P6_0 and P7_0 to P6_1 respectively 202 | p6_up = self.conv6_up(self.swish(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in))) 203 | 204 | # Weights for P5_0 and P6_0 to P5_1 205 | p5_w1 = self.p5_w1_relu(self.p5_w1) 206 | weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon) 207 | # Connections for P5_0 and P6_0 to P5_1 respectively 208 | p5_up = self.conv5_up(self.swish(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up))) 209 | 210 | # Weights for P4_0 and P5_0 to P4_1 211 | p4_w1 = self.p4_w1_relu(self.p4_w1) 212 | weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon) 213 | # Connections for P4_0 and P5_0 to P4_1 respectively 214 | p4_up = self.conv4_up(self.swish(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up))) 215 | 216 | # Weights for P3_0 and P4_1 to P3_2 217 | p3_w1 = self.p3_w1_relu(self.p3_w1) 218 | weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon) 219 | # Connections for P3_0 and P4_1 to P3_2 respectively 220 | p3_out = self.conv3_up(self.swish(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up))) 221 | 222 | if self.first_time: 223 | p4_in = self.p4_down_channel_2(p4) 224 | p5_in = self.p5_down_channel_2(p5) 225 | 226 | # Weights for P4_0, P4_1 and P3_2 to P4_2 227 | p4_w2 = self.p4_w2_relu(self.p4_w2) 228 | weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon) 229 | # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively 230 | p4_out = self.conv4_down( 231 | self.swish(weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out))) 232 | 233 | # Weights for P5_0, P5_1 and P4_2 to P5_2 234 | p5_w2 = self.p5_w2_relu(self.p5_w2) 235 | weight = p5_w2 / (torch.sum(p5_w2, dim=0) + self.epsilon) 236 | # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively 237 | p5_out = self.conv5_down( 238 | self.swish(weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out))) 239 | 240 | # Weights for P6_0, P6_1 and P5_2 to P6_2 241 | p6_w2 = self.p6_w2_relu(self.p6_w2) 242 | weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon) 243 | # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively 244 | p6_out = self.conv6_down( 245 | self.swish(weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out))) 246 | 247 | # Weights for P7_0 and P6_2 to P7_2 248 | p7_w2 = self.p7_w2_relu(self.p7_w2) 249 | weight = p7_w2 / 
(torch.sum(p7_w2, dim=0) + self.epsilon) 250 | # Connections for P7_0 and P6_2 to P7_2 251 | p7_out = self.conv7_down(self.swish(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out))) 252 | 253 | return p3_out, p4_out, p5_out, p6_out, p7_out 254 | 255 | def _forward(self, inputs): 256 | if self.first_time: 257 | p3, p4, p5 = inputs 258 | 259 | p6_in = self.p5_to_p6(p5) 260 | p7_in = self.p6_to_p7(p6_in) 261 | 262 | p3_in = self.p3_down_channel(p3) 263 | p4_in = self.p4_down_channel(p4) 264 | p5_in = self.p5_down_channel(p5) 265 | 266 | else: 267 | # P3_0, P4_0, P5_0, P6_0 and P7_0 268 | p3_in, p4_in, p5_in, p6_in, p7_in = inputs 269 | 270 | # P7_0 to P7_2 271 | 272 | # Connections for P6_0 and P7_0 to P6_1 respectively 273 | p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_in))) 274 | 275 | # Connections for P5_0 and P6_0 to P5_1 respectively 276 | p5_up = self.conv5_up(self.swish(p5_in + self.p5_upsample(p6_up))) 277 | 278 | # Connections for P4_0 and P5_0 to P4_1 respectively 279 | p4_up = self.conv4_up(self.swish(p4_in + self.p4_upsample(p5_up))) 280 | 281 | # Connections for P3_0 and P4_1 to P3_2 respectively 282 | p3_out = self.conv3_up(self.swish(p3_in + self.p3_upsample(p4_up))) 283 | 284 | if self.first_time: 285 | p4_in = self.p4_down_channel_2(p4) 286 | p5_in = self.p5_down_channel_2(p5) 287 | 288 | # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively 289 | p4_out = self.conv4_down( 290 | self.swish(p4_in + p4_up + self.p4_downsample(p3_out))) 291 | 292 | # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively 293 | p5_out = self.conv5_down( 294 | self.swish(p5_in + p5_up + self.p5_downsample(p4_out))) 295 | 296 | # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively 297 | p6_out = self.conv6_down( 298 | self.swish(p6_in + p6_up + self.p6_downsample(p5_out))) 299 | 300 | # Connections for P7_0 and P6_2 to P7_2 301 | p7_out = self.conv7_down(self.swish(p7_in + self.p7_downsample(p6_out))) 302 | 303 | return p3_out, p4_out, p5_out, p6_out, p7_out 304 | 305 | 306 | class Regressor(nn.Module): 307 | """ 308 | modified by Zylo117 309 | """ 310 | 311 | def __init__(self, in_channels, num_anchors, num_layers, onnx_export=False): 312 | super(Regressor, self).__init__() 313 | self.num_layers = num_layers 314 | 315 | self.conv_list = nn.ModuleList( 316 | [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)]) 317 | self.bn_list = nn.ModuleList( 318 | [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in 319 | range(5)]) 320 | self.header = SeparableConvBlock(in_channels, num_anchors * 4, norm=False, activation=False) 321 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 322 | 323 | def forward(self, inputs): 324 | feats = [] 325 | for feat, bn_list in zip(inputs, self.bn_list): 326 | for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list): 327 | feat = conv(feat) 328 | feat = bn(feat) 329 | feat = self.swish(feat) 330 | feat = self.header(feat) 331 | 332 | feat = feat.permute(0, 2, 3, 1) # (batch_size, height, width, num_anchor*4) 333 | feat = feat.contiguous().view(feat.shape[0], -1, 4) # (batch_size, h*w*num_anchor, 4) 334 | 335 | feats.append(feat) 336 | 337 | feats = torch.cat(feats, dim=1) # (batch_size, h*w*feat_num*num_anchor, 4) 338 | 339 | return feats 340 | 341 | 342 | class Classifier(nn.Module): 343 | """ 344 | modified by Zylo117 345 | """ 346 | 347 | def __init__(self, in_channels, num_anchors, num_classes, 
num_layers, onnx_export=False): 348 | super(Classifier, self).__init__() 349 | self.num_anchors = num_anchors 350 | self.num_classes = num_classes 351 | self.num_layers = num_layers 352 | self.conv_list = nn.ModuleList( 353 | [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)]) 354 | self.bn_list = nn.ModuleList( 355 | [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in 356 | range(5)]) 357 | self.header = SeparableConvBlock(in_channels, num_anchors * num_classes, norm=False, activation=False) 358 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 359 | 360 | def forward(self, inputs): 361 | feats = [] 362 | for feat, bn_list in zip(inputs, self.bn_list): 363 | for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list): 364 | feat = conv(feat) 365 | feat = bn(feat) 366 | feat = self.swish(feat) 367 | feat = self.header(feat) 368 | 369 | feat = feat.permute(0, 2, 3, 1) # (batch_size, height, width, num_anchors*num_classes) 370 | feat = feat.contiguous().view(feat.shape[0], feat.shape[1], feat.shape[2], self.num_anchors, 371 | self.num_classes) # (batch_size, height, width, num_anchors, num_classes) 372 | feat = feat.contiguous().view(feat.shape[0], -1, self.num_classes) # (batch_size, h*w*num_anchor, num_classes) 373 | 374 | feats.append(feat) 375 | 376 | feats = torch.cat(feats, dim=1) # (batch_size, h*w*num_anchor*feat_num, num_classes) 377 | feats = feats.sigmoid() 378 | 379 | return feats 380 | 381 | 382 | class EfficientNet(nn.Module): 383 | """ 384 | modified by Zylo117 385 | """ 386 | 387 | def __init__(self, compound_coef, load_weights=False): 388 | super(EfficientNet, self).__init__() 389 | model = EffNet.from_pretrained(f'efficientnet-b{compound_coef}', load_weights) 390 | del model._conv_head 391 | del model._bn1 392 | del model._avg_pooling 393 | del model._dropout 394 | del model._fc 395 | self.model = model 396 | 397 | def forward(self, x): 398 | x = self.model._conv_stem(x) 399 | x = self.model._bn0(x) 400 | x = self.model._swish(x) 401 | feature_maps = [] 402 | 403 | # TODO: temporarily storing extra tensor last_x and del it later might not be a good idea, 404 | # try recording stride changing when creating efficientnet, 405 | # and then apply it here. 
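# The loop below tracks the previous block's output in `last_x`. Whenever a block downsamples
# (its depthwise conv has stride 2), `last_x` is the last feature map of the previous resolution
# stage and gets recorded; the final block's output is recorded as well. `feature_maps[1:]` then
# drops the earliest, highest-resolution map before returning.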
406 | last_x = None 407 | for idx, block in enumerate(self.model._blocks): 408 | drop_connect_rate = self.model._global_params.drop_connect_rate 409 | if drop_connect_rate: 410 | drop_connect_rate *= float(idx) / len(self.model._blocks) 411 | x = block(x, drop_connect_rate=drop_connect_rate) 412 | 413 | if block._depthwise_conv.stride == [2, 2]: 414 | feature_maps.append(last_x) 415 | elif idx == len(self.model._blocks) - 1: 416 | feature_maps.append(x) 417 | last_x = x 418 | del last_x 419 | return feature_maps[1:] 420 | 421 | 422 | if __name__ == '__main__': 423 | from tensorboardX import SummaryWriter 424 | 425 | 426 | def count_parameters(model): 427 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 428 | -------------------------------------------------------------------------------- /test_hico-det.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | """ 4 | COCO-Style Evaluations 5 | 6 | put images here datasets/your_project_name/annotations/val_set_name/*.jpg 7 | put annotations here datasets/your_project_name/annotations/instances_{val_set_name}.json 8 | put weights here /path/to/your/weights/*.pth 9 | change compound_coef 10 | 11 | """ 12 | 13 | import json 14 | import os 15 | import cv2 16 | import time 17 | import glob 18 | 19 | import argparse 20 | import torch 21 | import yaml 22 | import pickle 23 | import numpy as np 24 | # from pycocotools.cocoeval import COCOeval 25 | 26 | # from utils.vsrl_eval import VCOCOeval 27 | from backbone import EfficientDetBackbone 28 | from efficientdet.utils import BBoxTransform, ClipBoxes 29 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union 30 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 31 | # from utils.apply_prior import apply_prior 32 | from utils.timer import Timer 33 | from utils.visual_hico import visual_hico 34 | from Generate_HICO_detection import Generate_HICO_detection 35 | 36 | 37 | ap = argparse.ArgumentParser() 38 | ap.add_argument('-p', '--project', type=str, default='hico-det', help='project file that contains parameters') 39 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 40 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 41 | ap.add_argument('--nms_threshold', type=float, default=0.3, help='nms threshold, don\'t change it if not for testing purposes') 42 | ap.add_argument('--cuda', type=int, default=1) 43 | ap.add_argument('--device', type=int, default=0) 44 | ap.add_argument('--float16', type=int, default=0) 45 | ap.add_argument('--override', type=int, default=0, help='override previous bbox results file if exists') 46 | ap.add_argument('--data_dir', type=str, default='./datasets', help='the root folder of dataset') 47 | ap.add_argument('--need_visual', type=int, default=0, help='whether need to visualize the results') 48 | ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 49 | 50 | 51 | args = ap.parse_args() 52 | 53 | compound_coef = args.compound_coef 54 | nms_threshold = args.nms_threshold 55 | use_cuda = args.cuda 56 | gpu = args.device 57 | use_float16 = args.float16 58 | override_prev_results = args.override 59 | need_visual = args.need_visual 60 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else 
args.weights 61 | data_dir = args.data_dir 62 | project = args.project 63 | 64 | params = yaml.safe_load(open(f'projects/{project}.yml')) 65 | SET_NAME = params['val_set'] 66 | project_name = params["project_name"] 67 | 68 | 69 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 70 | 71 | params = yaml.safe_load(open(f'projects/{project}.yml')) 72 | num_objects = 90 73 | num_union_actions = 117 74 | num_union_hois = 600 75 | num_inst_actions = 234 76 | 77 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 78 | input_size = input_sizes[compound_coef] 79 | output_dir = f"./logs/{project_name}/results" 80 | 81 | if not os.path.exists(output_dir): 82 | os.mkdir(output_dir) 83 | 84 | if args.flip_test: 85 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_flip_final.pkl') 86 | else: 87 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_final.pkl') 88 | 89 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 90 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 91 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 92 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 93 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 94 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 95 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 96 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 97 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 98 | 'toothbrush'] 99 | 100 | obj_dict = {} 101 | cid = 0 102 | for obj in obj_list: 103 | if obj != "": 104 | cid += 1 105 | obj_dict[obj] = cid 106 | 107 | with open(args.data_dir + "/hico_20160224_det/hico_processed/verb_list.json", "r") as file: 108 | verbs_hico = json.load(file) 109 | verbs_dict = {} 110 | for id, item in enumerate(verbs_hico): 111 | verb_name = item["name"] 112 | verbs_dict[verb_name] = id 113 | 114 | with open(args.data_dir + "/hico_20160224_det/hico_processed/hoi_list.json", "r") as file: 115 | hois_hico = json.load(file) 116 | verb_to_hoi = {} 117 | for hoi_id, item in enumerate(hois_hico): 118 | verb_id = verbs_dict[item["verb"]] 119 | if verb_id in verb_to_hoi: 120 | verb_to_hoi[verb_id].append(hoi_id) 121 | else: 122 | verb_to_hoi[verb_id] = [hoi_id] 123 | 124 | n = 0 125 | for verb_id in verb_to_hoi: 126 | n += len(verb_to_hoi[verb_id]) 127 | verb_to_hoi[verb_id] = np.array(verb_to_hoi[verb_id]) 128 | assert n == num_union_hois 129 | 130 | 131 | def calc_ioa(a, b): 132 | # a(anchor) [boxes, (x1, y1, x2, y2)] 133 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 134 | 135 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 136 | 137 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 138 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 139 | exp_y1 = np.expand_dims(a[:, 1], 1) 140 | exp_y2 = np.expand_dims(a[:, 3], 1) 141 | 142 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 143 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 144 | # iw = torch.clamp(iw, min=0) 145 | # ih = torch.clamp(ih, min=0) 146 | iw = np.where(iw > 0, 
iw, 0) 147 | ih = np.where(ih > 0, ih, 0) 148 | 149 | intersection = iw * ih 150 | area = np.where(area > 1e-6, area, 1e-6) 151 | IoA = intersection / area 152 | # IoA[torch.isnan(IoA)] = 1 153 | return IoA 154 | 155 | 156 | def calc_iou(a, b): 157 | # a(anchor) [boxes, (x1, y1, x2, y2)] 158 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 159 | 160 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 161 | 162 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 163 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 164 | exp_y1 = np.expand_dims(a[:, 1], 1) 165 | exp_y2 = np.expand_dims(a[:, 3], 1) 166 | 167 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 168 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 169 | # iw = torch.clamp(iw, min=0) 170 | # ih = torch.clamp(ih, min=0) 171 | iw = np.where(iw > 0, iw, 0) 172 | ih = np.where(ih > 0, ih, 0) 173 | 174 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 175 | ua = np.where(ua > 0, ua, 1e-8) 176 | 177 | intersection = iw * ih 178 | IoU = intersection / ua 179 | return IoU 180 | 181 | 182 | def transform_class_id(id): 183 | class_name = obj_list[id] 184 | hico_obj_id = obj_dict[class_name] 185 | return hico_obj_id 186 | 187 | 188 | def transform_action_hico(act_scores, mode): 189 | union_scores = np.zeros(num_union_actions) 190 | for i in range(num_inst_actions//2): 191 | if mode == "subject": 192 | union_scores[verb_to_hoi[i]] = act_scores[i] 193 | else: 194 | union_scores[verb_to_hoi[i]] = act_scores[i + num_inst_actions//2] 195 | return union_scores 196 | 197 | 198 | def xy_to_wh(bbox): 199 | ctr_x = (bbox[0] + bbox[2]) / 2 200 | ctr_y = (bbox[1] + bbox[3]) / 2 201 | width = bbox[2] - bbox[0] 202 | height = bbox[3] - bbox[1] 203 | return ctr_x, ctr_y, width, height 204 | 205 | 206 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 207 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 208 | xt, yt, wt, ht = xy_to_wh(target_bbox) 209 | # xh, yh, wh, hh = xy_to_wh(human_bbox) 210 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 211 | dist = np.zeros(4, dtype=np.float) 212 | dist[0] = (xo - xt) / wa 213 | dist[1] = (yo - yt) / ha 214 | # dist[0] = (xo - xt) / wh 215 | # dist[1] = (yo - yt) / hh 216 | # dist[2] = np.log(wo/wt) 217 | # dist[3] = np.log(ho/ht) 218 | 219 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 220 | 221 | 222 | def target_object_dist(target_objects_pos, objects_pos, anchors): 223 | width = anchors[:, 2] - anchors[:, 0] 224 | height = anchors[:, 3] - anchors[:, 1] 225 | anchors_size = np.stack([width, height], axis=1) 226 | anchors_size = np.expand_dims(anchors_size, axis=1) 227 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 228 | diff = target_objects_pos - objects_pos 229 | diff = diff / anchors_size 230 | dist = np.sum(diff**2, axis=2) 231 | return dist 232 | 233 | 234 | def hoi_match(image_id, preds_inst, preds_union, human_thre=0.3, anchor_thre=0.1, loc_thre=0.05): 235 | num_inst = len(preds_inst["rois"]) 236 | humans = [] 237 | objects = [] 238 | human_bboxes = [] 239 | human_inst_ids = [] 240 | human_role_scores = [] 241 | human_obj_scores = [] 242 | 243 | while len(humans) == 0: 244 | if human_thre < 0.2: 245 | break 246 | for inst_id in range(num_inst): 247 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 248 | continue 249 | item = {} 250 | item["bbox"] = preds_inst["rois"][inst_id] 251 | item["role_scores"] = 
preds_inst["act_scores"][inst_id][:len(verb_to_hoi)] 252 | # item["role_scores"] = transform_action_hico(preds_inst["act_scores"][inst_id], "subject") 253 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 254 | item["inst_id"] = inst_id 255 | humans.append(item) 256 | human_bboxes.append(item["bbox"]) 257 | human_inst_ids.append(item["inst_id"]) 258 | human_role_scores.append(item["role_scores"]) 259 | human_obj_scores.append(item["obj_scores"] ) 260 | human_thre -= 0.1 261 | human_bboxes = np.array(human_bboxes) 262 | human_inst_ids = np.array(human_inst_ids) 263 | human_role_scores = np.array(human_role_scores) 264 | human_obj_scores = np.array(human_obj_scores) 265 | 266 | obj_role_scores = [] 267 | obj_obj_scores = [] 268 | for obj_id in range(len(preds_inst["rois"])): 269 | item = {} 270 | # obj_role_score = transform_action_hico(preds_inst["act_scores"][obj_id], "object") 271 | obj_role_score = preds_inst["act_scores"][obj_id][len(verb_to_hoi):] 272 | item["obj_role_scores"] = obj_role_score 273 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 274 | 275 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 276 | 277 | obj_bbox = preds_inst["rois"][obj_id] 278 | item["bbox"] = obj_bbox 279 | objects.append(item) 280 | obj_role_scores.append(obj_role_score) 281 | obj_obj_scores.append(item["obj_scores"]) 282 | object_bboxes = np.array(preds_inst["rois"]) 283 | obj_role_scores = np.array(obj_role_scores) 284 | obj_obj_scores = np.array(obj_obj_scores) 285 | 286 | hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), num_union_actions), dtype=np.float) 287 | 288 | if len(human_bboxes) > 0: 289 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 290 | 291 | IoA_max = np.max(IoA, axis=1) 292 | human_foreground = IoA_max > 0.1 # 0.25 293 | human_IoA = IoA[human_foreground] 294 | for key in preds_union: 295 | preds_union[key] = preds_union[key][human_foreground] 296 | 297 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 298 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 299 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 300 | new_IoA_sec_max = np.max(new_IoA, axis=1) 301 | obj_foreground = new_IoA_sec_max > 0.1 # 0.25 302 | for key in preds_union: 303 | preds_union[key] = preds_union[key][obj_foreground] 304 | 305 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 306 | human_IoA = human_IoA[obj_foreground] 307 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.1), axis=1) # 0.25 308 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 309 | 310 | num_union = len(preds_union["rois"]) 311 | num_human = len(human_bboxes) 312 | 313 | sp_vectors = preds_union["sp_vector"] 314 | inter_human_regions = human_bboxes[human_IoU_argmax] 315 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 316 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 3]) / 2 317 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 318 | inter_objects_pos = humans_pos + sp_vectors 319 | 320 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 321 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 322 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 323 | 324 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 325 | inter_human_instids = human_inst_ids[human_IoU_argmax] 326 | obj_dists[np.arange(num_union), inter_human_instids] = 100 327 | obj_dists[obj_IoA < 0.1] = 100 # 0.25 328 | inter_obj_ids = 
np.argmin(obj_dists, 1) 329 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 330 | 331 | sigma = 0.6 332 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 333 | location_scores = np.where(location_scores