├── .gitignore
├── compare.png
├── test
│   ├── test.jpg
│   └── detection.jpg
├── __pycache__
│   └── backbone.cpython-36.pyc
├── utils
│   ├── __pycache__
│   │   ├── timer.cpython-36.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── visual.cpython-36.pyc
│   │   ├── apply_prior.cpython-36.pyc
│   │   ├── visual_hico.cpython-36.pyc
│   │   ├── vsrl_eval.cpython-36.pyc
│   │   └── vsrl_eval.cpython-37.pyc
│   ├── sync_batchnorm
│   │   ├── __pycache__
│   │   │   ├── comm.cpython-36.pyc
│   │   │   ├── comm.cpython-37.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── batchnorm.cpython-36.pyc
│   │   │   ├── batchnorm.cpython-37.pyc
│   │   │   ├── replicate.cpython-36.pyc
│   │   │   └── replicate.cpython-37.pyc
│   │   ├── __init__.py
│   │   ├── unittest.py
│   │   ├── batchnorm_reimpl.py
│   │   ├── replicate.py
│   │   ├── comm.py
│   │   └── batchnorm.py
│   ├── timer.py
│   ├── apply_prior.py
│   ├── visual_hico.py
│   └── visual.py
├── efficientdet
│   ├── __pycache__
│   │   ├── loss.cpython-36.pyc
│   │   ├── model.cpython-36.pyc
│   │   ├── model.cpython-37.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── dataset.cpython-36.pyc
│   │   ├── hoi_model.cpython-36.pyc
│   │   ├── help_function.cpython-36.pyc
│   │   ├── vcoco_dataset.cpython-36.pyc
│   │   ├── vcoco_dataset.cpython-37.pyc
│   │   └── hico_det_dataset.cpython-36.pyc
│   ├── config.py
│   ├── hoi_model.py
│   ├── help_function.py
│   ├── utils.py
│   ├── dataset.py
│   ├── hico_det_dataset.py
│   └── model.py
├── efficientnet
│   ├── __pycache__
│   │   ├── model.cpython-36.pyc
│   │   ├── model.cpython-37.pyc
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── __init__.cpython-37.pyc
│   │   ├── utils_extra.cpython-36.pyc
│   │   └── utils_extra.cpython-37.pyc
│   ├── __init__.py
│   ├── utils_extra.py
│   ├── model.py
│   └── utils.py
├── projects
│   ├── hico-det.yml
│   └── vcoco.yml
├── backbone.py
├── README.md
├── coco_eval.py
├── Generate_HICO_detection.py
├── demo.py
├── test_vcoco.py
└── test_hico-det.py
/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | weights/ 4 | 5 | -------------------------------------------------------------------------------- /compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/compare.png -------------------------------------------------------------------------------- /test/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/test/test.jpg -------------------------------------------------------------------------------- /test/detection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/DIRV/HEAD/test/detection.jpg
-------------------------------------------------------------------------------- /efficientnet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.1" 2 | from .model import EfficientNet 3 | from .utils import ( 4 | GlobalParams, 5 | BlockArgs, 6 | BlockDecoder, 7 | efficientnet, 8 | get_model_params, 9 | ) 10 | 11 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : __init__.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d 12 | from .batchnorm import patch_sync_batchnorm, convert_model 13 | from .replicate import DataParallelWithCallback, patch_replication_callback 14 | -------------------------------------------------------------------------------- /projects/hico-det.yml: -------------------------------------------------------------------------------- 1 | project_name: hico-det_final # also the folder name of the dataset under the data_path folder 2 | train_set: trainval 3 | val_set: test 4 | num_gpus: 8 5 | 6 | # mean and std in RGB order; this part should remain unchanged as long as your dataset is similar to coco. 7 | mean: [0.485, 0.456, 0.406] 8 | std: [0.229, 0.224, 0.225] 9 | 10 | # these are coco anchors, change them if necessary 11 | anchors_scales: '[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]' 12 | anchors_ratios: '[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]' 13 | 14 | # must match your dataset's category_id. 15 | # category_id is one-indexed, 16 | # for example, the index of 'car' here is 2, while its category_id is 3 17 | 18 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/unittest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : unittest.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License.
10 | 11 | import unittest 12 | import torch 13 | 14 | 15 | class TorchTestCase(unittest.TestCase): 16 | def assertTensorClose(self, x, y): 17 | adiff = float((x - y).abs().max()) 18 | if (y == 0).all(): 19 | rdiff = 'NaN' 20 | else: 21 | rdiff = float((adiff / y).abs().max()) 22 | 23 | message = ( 24 | 'Tensor close check failed\n' 25 | 'adiff={}\n' 26 | 'rdiff={}\n' 27 | ).format(adiff, rdiff) 28 | self.assertTrue(torch.allclose(x, y), message) 29 | 30 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /projects/vcoco.yml: -------------------------------------------------------------------------------- 1 | project_name: vcoco_new # also the folder name of the dataset under the data_path folder 2 | train_set: trainval 3 | val_set: test 4 | num_gpus: 4 5 | 6 | # mean and std in RGB order; this part should remain unchanged as long as your dataset is similar to coco. 7 | mean: [0.485, 0.456, 0.406] 8 | std: [0.229, 0.224, 0.225] 9 | 10 | # these are coco anchors, change them if necessary 11 | anchors_scales: '[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]' 12 | anchors_ratios: '[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]' 13 | 14 | # must match your dataset's category_id.
15 | # category_id is one_indexed, 16 | # for example, index of 'car' here is 2, while category_id of is 3 17 | obj_list: "['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 18 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 19 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 20 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 21 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 22 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 23 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 24 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 25 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier','toothbrush']" 26 | 27 | union_action_list: "[('hold', 'obj'), ('sit', 'instr'), ('ride', 'instr'), ('look', 'obj'), 28 | ('hit', 'instr'), ('hit', 'obj'), ('eat', 'obj'), ('eat', 'instr'), 29 | ('jump', 'instr'), ('lay', 'instr'), ('talk_on_phone', 'instr'), 30 | ('carry', 'obj'), ('throw', 'obj'), ('catch', 'obj'), ('cut', 'instr'), 31 | ('cut', 'obj'), ('work_on_computer', 'instr'), ('ski', 'instr'), 32 | ('surf', 'instr'), ('skateboard', 'instr'), ('drink', 'instr'), 33 | ('kick', 'obj'), ('point', 'instr'), ('read', 'obj'), ('snowboard', 'instr')]" 34 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/batchnorm_reimpl.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # File : batchnorm_reimpl.py 4 | # Author : acgtyrant 5 | # Date : 11/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.init as init 14 | 15 | __all__ = ['BatchNorm2dReimpl'] 16 | 17 | 18 | class BatchNorm2dReimpl(nn.Module): 19 | """ 20 | A re-implementation of batch normalization, used for testing the numerical 21 | stability. 
22 | 23 | Author: acgtyrant 24 | See also: 25 | https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14 26 | """ 27 | def __init__(self, num_features, eps=1e-5, momentum=0.1): 28 | super().__init__() 29 | 30 | self.num_features = num_features 31 | self.eps = eps 32 | self.momentum = momentum 33 | self.weight = nn.Parameter(torch.empty(num_features)) 34 | self.bias = nn.Parameter(torch.empty(num_features)) 35 | self.register_buffer('running_mean', torch.zeros(num_features)) 36 | self.register_buffer('running_var', torch.ones(num_features)) 37 | self.reset_parameters() 38 | 39 | def reset_running_stats(self): 40 | self.running_mean.zero_() 41 | self.running_var.fill_(1) 42 | 43 | def reset_parameters(self): 44 | self.reset_running_stats() 45 | init.uniform_(self.weight) 46 | init.zeros_(self.bias) 47 | 48 | def forward(self, input_): 49 | batchsize, channels, height, width = input_.size() 50 | numel = batchsize * height * width 51 | input_ = input_.permute(1, 0, 2, 3).contiguous().view(channels, numel) 52 | sum_ = input_.sum(1) 53 | sum_of_square = input_.pow(2).sum(1) 54 | mean = sum_ / numel 55 | sumvar = sum_of_square - sum_ * mean 56 | 57 | self.running_mean = ( 58 | (1 - self.momentum) * self.running_mean 59 | + self.momentum * mean.detach() 60 | ) 61 | unbias_var = sumvar / (numel - 1) 62 | self.running_var = ( 63 | (1 - self.momentum) * self.running_var 64 | + self.momentum * unbias_var.detach() 65 | ) 66 | 67 | bias_var = sumvar / numel 68 | inv_std = 1 / (bias_var + self.eps).pow(0.5) 69 | output = ( 70 | (input_ - mean.unsqueeze(1)) * inv_std.unsqueeze(1) * 71 | self.weight.unsqueeze(1) + self.bias.unsqueeze(1)) 72 | 73 | return output.view(channels, batchsize, height, width).permute(1, 0, 2, 3).contiguous() 74 | 75 | -------------------------------------------------------------------------------- /efficientdet/config.py: -------------------------------------------------------------------------------- 1 | COCO_CLASSES = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", 2 | "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", 3 | "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", 4 | "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", 5 | "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", 6 | "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", 7 | "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", 8 | "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", 9 | "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", 10 | "teddy bear", "hair drier", "toothbrush"] 11 | 12 | colors = [(39, 129, 113), (164, 80, 133), (83, 122, 114), (99, 81, 172), (95, 56, 104), (37, 84, 86), (14, 89, 122), 13 | (80, 7, 65), (10, 102, 25), (90, 185, 109), (106, 110, 132), (169, 158, 85), (188, 185, 26), (103, 1, 17), 14 | (82, 144, 81), (92, 7, 184), (49, 81, 155), (179, 177, 69), (93, 187, 158), (13, 39, 73), (12, 50, 60), 15 | (16, 179, 33), (112, 69, 165), (15, 139, 63), (33, 191, 159), (182, 173, 32), (34, 113, 133), (90, 135, 34), 16 | (53, 34, 86), (141, 35, 190), (6, 171, 8), (118, 76, 112), (89, 60, 55), (15, 54, 88), (112, 75, 181), 17 | (42, 147, 38), (138, 52, 63), (128, 65, 149), (106, 103, 24), (168, 33, 45), (28, 136, 135), (86, 91, 
108), 18 | (52, 11, 76), (142, 6, 189), (57, 81, 168), (55, 19, 148), (182, 101, 89), (44, 65, 179), (1, 33, 26), 19 | (122, 164, 26), (70, 63, 134), (137, 106, 82), (120, 118, 52), (129, 74, 42), (182, 147, 112), (22, 157, 50), 20 | (56, 50, 20), (2, 22, 177), (156, 100, 106), (21, 35, 42), (13, 8, 121), (142, 92, 28), (45, 118, 33), 21 | (105, 118, 30), (7, 185, 124), (46, 34, 146), (105, 184, 169), (22, 18, 5), (147, 71, 73), (181, 64, 91), 22 | (31, 39, 184), (164, 179, 33), (96, 50, 18), (95, 15, 106), (113, 68, 54), (136, 116, 112), (119, 139, 130), 23 | (31, 139, 34), (66, 6, 127), (62, 39, 2), (49, 99, 180), (49, 119, 155), (153, 50, 183), (125, 38, 3), 24 | (129, 87, 143), (49, 87, 40), (128, 62, 120), (73, 85, 148), (28, 144, 118), (29, 9, 24), (175, 45, 108), 25 | (81, 175, 64), (178, 19, 157), (74, 188, 190), (18, 114, 2), (62, 128, 96), (21, 3, 150), (0, 6, 95), 26 | (2, 20, 184), (122, 37, 185)] 27 | -------------------------------------------------------------------------------- /efficientdet/hoi_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops.boxes import nms as nms_torch 4 | 5 | from efficientnet import EfficientNet as EffNet 6 | from efficientnet.utils import MemoryEfficientSwish, Swish 7 | from efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding 8 | 9 | from efficientdet.model import Regressor, Classifier, SeparableConvBlock 10 | 11 | 12 | class Union_Branch(nn.Module): 13 | def __init__(self, in_channels, num_anchors, num_layers, num_union_classes, num_obj_classes): 14 | super(Union_Branch, self).__init__() 15 | self.num_layers = num_layers 16 | self.num_anchors = num_anchors 17 | self.in_channels = in_channels 18 | 19 | self.num_union_classes = num_union_classes 20 | self.num_obj_classes = num_obj_classes 21 | 22 | self.action_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 23 | num_classes=self.num_union_classes, num_layers=self.num_layers) 24 | 25 | self.union_sub_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 26 | self.union_obj_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 27 | 28 | 29 | def forward(self, inputs): 30 | union_act_cls = self.action_classifier(inputs) 31 | union_sub_reg = self.union_sub_regressor(inputs) 32 | union_obj_reg = self.union_obj_regressor(inputs) 33 | 34 | return union_act_cls, union_sub_reg, union_obj_reg 35 | 36 | 37 | class Instance_Branch(nn.Module): 38 | def __init__(self, in_channels, num_anchors, num_layers, num_inst_classes, num_obj_classes): 39 | super(Instance_Branch, self).__init__() 40 | self.num_layers = num_layers 41 | self.num_anchors = num_anchors 42 | self.in_channels = in_channels 43 | 44 | self.num_inst_classes = num_inst_classes 45 | self.num_obj_classes = num_obj_classes 46 | 47 | self.action_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 48 | num_classes=self.num_inst_classes, num_layers=self.num_layers) 49 | 50 | self.object_classifier = Classifier(in_channels=self.in_channels, num_anchors=self.num_anchors, 51 | num_classes=self.num_obj_classes, num_layers=self.num_layers) 52 | 53 | self.object_regressor = Regressor(in_channels=self.in_channels, num_anchors=self.num_anchors, num_layers=self.num_layers) 54 | 55 | def forward(self, inputs): 56 | inst_act_cls = 
self.action_classifier(inputs) 57 | inst_obj_cls = self.object_classifier(inputs) 58 | inst_bbox_reg = self.object_regressor(inputs) 59 | 60 | return inst_act_cls, inst_obj_cls, inst_bbox_reg 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /efficientnet/utils_extra.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | import math 4 | 5 | from torch import nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Conv2dStaticSamePadding(nn.Module): 10 | """ 11 | created by Zylo117 12 | The real keras/tensorflow conv2d with same padding 13 | """ 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, groups=1, dilation=1, **kwargs): 16 | super().__init__() 17 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 18 | bias=bias, groups=groups) 19 | self.stride = self.conv.stride 20 | self.kernel_size = self.conv.kernel_size 21 | self.dilation = self.conv.dilation 22 | 23 | if isinstance(self.stride, int): 24 | self.stride = [self.stride] * 2 25 | elif len(self.stride) == 1: 26 | self.stride = [self.stride[0]] * 2 27 | 28 | if isinstance(self.kernel_size, int): 29 | self.kernel_size = [self.kernel_size] * 2 30 | elif len(self.kernel_size) == 1: 31 | self.kernel_size = [self.kernel_size[0]] * 2 32 | 33 | def forward(self, x): 34 | h, w = x.shape[-2:] 35 | 36 | h_step = math.ceil(w / self.stride[1]) 37 | v_step = math.ceil(h / self.stride[0]) 38 | 39 | h_cover_len = self.stride[1] * (h_step - 1) + 1 + (self.kernel_size[1] - 1) 40 | v_cover_len = self.stride[0] * (v_step - 1) + 1 + (self.kernel_size[0] - 1) 41 | 42 | extra_h = h_cover_len - w 43 | extra_v = v_cover_len - h 44 | 45 | left = extra_h // 2 46 | right = extra_h - left 47 | top = extra_v // 2 48 | bottom = extra_v - top 49 | 50 | x = F.pad(x, [left, right, top, bottom]) 51 | 52 | x = self.conv(x) 53 | return x 54 | 55 | 56 | class MaxPool2dStaticSamePadding(nn.Module): 57 | """ 58 | created by Zylo117 59 | The real keras/tensorflow MaxPool2d with same padding 60 | """ 61 | 62 | def __init__(self, *args, **kwargs): 63 | super().__init__() 64 | self.pool = nn.MaxPool2d(*args, **kwargs) 65 | self.stride = self.pool.stride 66 | self.kernel_size = self.pool.kernel_size 67 | 68 | if isinstance(self.stride, int): 69 | self.stride = [self.stride] * 2 70 | elif len(self.stride) == 1: 71 | self.stride = [self.stride[0]] * 2 72 | 73 | if isinstance(self.kernel_size, int): 74 | self.kernel_size = [self.kernel_size] * 2 75 | elif len(self.kernel_size) == 1: 76 | self.kernel_size = [self.kernel_size[0]] * 2 77 | 78 | def forward(self, x): 79 | h, w = x.shape[-2:] 80 | 81 | h_step = math.ceil(w / self.stride[1]) 82 | v_step = math.ceil(h / self.stride[0]) 83 | h_cover_len = self.stride[1] * (h_step - 1) + 1 + (self.kernel_size[1] - 1) 84 | v_cover_len = self.stride[0] * (v_step - 1) + 1 + (self.kernel_size[0] - 1) 85 | 86 | extra_h = h_cover_len - w 87 | extra_v = v_cover_len - h 88 | 89 | left = extra_h // 2 90 | right = extra_h - left 91 | top = extra_v // 2 92 | bottom = extra_v - top 93 | 94 | x = F.pad(x, [left, right, top, bottom]) 95 | 96 | x = self.pool(x) 97 | return x 98 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/replicate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : replicate.py 3 | # Author : Jiayuan Mao 4 | # 
Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import functools 12 | 13 | from torch.nn.parallel.data_parallel import DataParallel 14 | 15 | __all__ = [ 16 | 'CallbackContext', 17 | 'execute_replication_callbacks', 18 | 'DataParallelWithCallback', 19 | 'patch_replication_callback' 20 | ] 21 | 22 | 23 | class CallbackContext(object): 24 | pass 25 | 26 | 27 | def execute_replication_callbacks(modules): 28 | """ 29 | Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. 30 | 31 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 32 | 33 | Note that, as all modules are isomorphism, we assign each sub-module with a context 34 | (shared among multiple copies of this module on different devices). 35 | Through this context, different copies can share some information. 36 | 37 | We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback 38 | of any slave copies. 39 | """ 40 | master_copy = modules[0] 41 | nr_modules = len(list(master_copy.modules())) 42 | ctxs = [CallbackContext() for _ in range(nr_modules)] 43 | 44 | for i, module in enumerate(modules): 45 | for j, m in enumerate(module.modules()): 46 | if hasattr(m, '__data_parallel_replicate__'): 47 | m.__data_parallel_replicate__(ctxs[j], i) 48 | 49 | 50 | class DataParallelWithCallback(DataParallel): 51 | """ 52 | Data Parallel with a replication callback. 53 | 54 | An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by 55 | original `replicate` function. 56 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 57 | 58 | Examples: 59 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 60 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 61 | # sync_bn.__data_parallel_replicate__ will be invoked. 62 | """ 63 | 64 | def replicate(self, module, device_ids): 65 | modules = super(DataParallelWithCallback, self).replicate(module, device_ids) 66 | execute_replication_callbacks(modules) 67 | return modules 68 | 69 | 70 | def patch_replication_callback(data_parallel): 71 | """ 72 | Monkey-patch an existing `DataParallel` object. Add the replication callback. 73 | Useful when you have customized `DataParallel` implementation. 
74 | 75 | Examples: 76 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 77 | > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) 78 | > patch_replication_callback(sync_bn) 79 | # this is equivalent to 80 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 81 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 82 | """ 83 | 84 | assert isinstance(data_parallel, DataParallel) 85 | 86 | old_replicate = data_parallel.replicate 87 | 88 | @functools.wraps(old_replicate) 89 | def new_replicate(module, device_ids): 90 | modules = old_replicate(module, device_ids) 91 | execute_replication_callbacks(modules) 92 | return modules 93 | 94 | data_parallel.replicate = new_replicate 95 | -------------------------------------------------------------------------------- /utils/apply_prior.py: -------------------------------------------------------------------------------- 1 | def apply_prior(scores, obj_cls): 2 | assert len(scores) == 25 3 | if obj_cls != 35: # not a snowboard, then the action is impossible to be snowboard 4 | scores[24] = 0 5 | 6 | if obj_cls != 83: # not a book, then the action is impossible to be read 7 | scores[23] = 0 8 | 9 | if obj_cls != 36: # not a sports ball, then the action is impossible to be kick 10 | scores[21] = 0 11 | 12 | if (obj_cls != 45) and (obj_cls != 43) and (obj_cls != 46) and (obj_cls != 50): # not 'wine glass', 'bottle', 'cup', 'bowl', then the action is impossible to be drink 13 | scores[20] = 0 14 | 15 | if obj_cls != 40: # not a skateboard, then the action is impossible to be skateboard 16 | scores[19] = 0 17 | 18 | if obj_cls != 41: # not a surfboard, then the action is impossible to be surfboard 19 | scores[18] = 0 20 | 21 | if obj_cls != 34: # not a ski, then the action is impossible to be ski 22 | scores[17] = 0 23 | 24 | if obj_cls != 72: # not a laptop, then the action is impossible to be work on computer 25 | scores[16] = 0 26 | 27 | if (obj_cls != 86) and (obj_cls != 47) and (obj_cls != 48): # not 'scissors', 'fork', 'knife', then the action is impossible to be cur instr 28 | scores[14] = 0 29 | 30 | if (obj_cls != 36) and (obj_cls != 33): # not 'sports ball', 'frisbee', then the action is impossible to be throw and catch 31 | scores[12] = 0 32 | scores[13] = 0 33 | 34 | if obj_cls != 76: # not a cellphone, then the action is impossible to be talk_on_phone 35 | scores[10] = 0 36 | 37 | if (obj_cls != 14) and (obj_cls != 66) and (obj_cls != 69) and (obj_cls != 64) and (obj_cls != 62) and (obj_cls != 61): # not 'bench', 'dining table', 'toilet', 'bed', 'couch', 'chair', then the action is impossible to be lay 38 | scores[9] = 0 39 | 40 | if (obj_cls != 35) and (obj_cls != 34) and (obj_cls != 40) and (obj_cls != 41): # not 'snowboard', 'skis', 'skateboard', 'surfboard', then the action is impossible to be jump 41 | scores[8] = 0 42 | 43 | if (obj_cls != 51) and (obj_cls != 52) and (obj_cls != 53) and (obj_cls != 54) and (obj_cls != 55) and (obj_cls != 56) and (obj_cls != 57) and (obj_cls != 58) and (obj_cls != 59) and (obj_cls != 60): # not ''banana', 'apple', 'sandwich', 'orange', 'carrot', 'broccoli', 'hot dog', 'pizza', 'cake', 'donut', then the action is impossible to be eat_obj 44 | scores[6] = 0 45 | 46 | if (obj_cls != 47) and (obj_cls != 48) and (obj_cls != 49): # not 'fork', 'knife', 'spoon', then the action is impossible to be eat_instr 47 | scores[7] = 0 48 | 49 | if (obj_cls != 42) and (obj_cls != 38): # not 'tennis racket', 'baseball bat', then the action is impossible to be hit_instr 50 | scores[4] 
= 0 51 | 52 | if (obj_cls != 36): # not 'sports ball, then the action is impossible to be hit_obj 53 | scores[5] = 0 54 | 55 | if (obj_cls != 1) and (obj_cls != 3) and (obj_cls != 5) and (obj_cls != 7) and (obj_cls != 8) and (obj_cls != 6) and (obj_cls != 4) and (obj_cls != 2) and (obj_cls != 18) and (obj_cls != 21): # not 'bicycle', 'motorcycle', 'bus', 'truck', 'boat', 'train', 'airplane', 'car', 'horse', 'elephant', then the action is impossible to be ride 56 | scores[2] = 0 57 | 58 | if (obj_cls != 1) and (obj_cls != 3) and (obj_cls != 18) and (obj_cls != 21) and (obj_cls != 14) and (obj_cls != 61) and (obj_cls != 62) and (obj_cls != 64) and (obj_cls != 69) and (obj_cls != 66) and (obj_cls != 32) and (obj_cls != 30) and (obj_cls != 26): # not 'bicycle', 'motorcycle', 'horse', 'elephant', 'bench', 'chair', 'couch', 'bed', 'toilet', 'dining table', 'suitcase', 'handbag', 'backpack', then the action is impossible to be sit 59 | scores[1] = 0 60 | 61 | if (obj_cls == 0): # "person", then the action is impossible to be cut_obj 62 | scores[15] = 0 63 | 64 | return scores 65 | -------------------------------------------------------------------------------- /backbone.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from efficientdet.model import BiFPN, Regressor, Classifier, EfficientNet 7 | from efficientdet.hoi_model import Union_Branch, Instance_Branch 8 | from efficientdet.utils import Anchors 9 | 10 | 11 | class EfficientDetBackbone(nn.Module): 12 | def __init__(self, num_classes=80, num_union_classes=25, num_inst_classes=51, compound_coef=0, load_weights=False, **kwargs): 13 | super(EfficientDetBackbone, self).__init__() 14 | self.compound_coef = compound_coef 15 | 16 | self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6] 17 | self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384] 18 | self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8] 19 | self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 20 | self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5] 21 | self.anchor_scale = [4., 4., 4., 4., 4., 4., 4., 5.] 22 | self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]) 23 | self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])) 24 | conv_channel_coef = { 25 | # the channels of P3/P4/P5. 
26 | 0: [40, 112, 320], 27 | 1: [40, 112, 320], 28 | 2: [48, 120, 352], 29 | 3: [48, 136, 384], 30 | 4: [56, 160, 448], 31 | 5: [64, 176, 512], 32 | 6: [72, 200, 576], 33 | 7: [72, 200, 576], 34 | } 35 | 36 | num_anchors = len(self.aspect_ratios) * self.num_scales 37 | 38 | self.bifpn = nn.Sequential( 39 | *[BiFPN(self.fpn_num_filters[self.compound_coef], 40 | conv_channel_coef[compound_coef], 41 | True if _ == 0 else False, 42 | attention=True if compound_coef < 6 else False) 43 | for _ in range(self.fpn_cell_repeats[compound_coef])]) 44 | 45 | self.num_classes = num_classes 46 | self.num_union_classes = num_union_classes 47 | self.num_inst_classes = num_inst_classes 48 | 49 | self.union_branch = Union_Branch(in_channels = self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors, 50 | num_layers=self.box_class_repeats[self.compound_coef], 51 | num_union_classes=num_union_classes, num_obj_classes=num_classes) 52 | self.instance_branch = Instance_Branch(in_channels = self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors, 53 | num_layers=self.box_class_repeats[self.compound_coef], 54 | num_inst_classes=num_inst_classes, num_obj_classes=num_classes) 55 | 56 | self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef], **kwargs) 57 | 58 | self.backbone_net = EfficientNet(self.backbone_compound_coef[compound_coef], load_weights) 59 | 60 | def freeze_bn(self): 61 | for m in self.modules(): 62 | if isinstance(m, nn.BatchNorm2d): 63 | m.eval() 64 | 65 | def forward(self, inputs): 66 | max_size = inputs.shape[-1] 67 | 68 | _, p3, p4, p5 = self.backbone_net(inputs) 69 | 70 | features = (p3, p4, p5) 71 | features = self.bifpn(features) 72 | 73 | union_act_cls, union_sub_reg, union_obj_reg = self.union_branch(features) 74 | inst_act_cls, inst_obj_cls, inst_bbox_reg = self.instance_branch(features) 75 | 76 | anchors = self.anchors(inputs, inputs.dtype) 77 | 78 | return features, union_act_cls, union_sub_reg, union_obj_reg, inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors 79 | 80 | 81 | def init_backbone(self, path): 82 | state_dict = torch.load(path) 83 | try: 84 | ret = self.load_state_dict(state_dict, strict=False) 85 | print(ret) 86 | except RuntimeError as e: 87 | print('Ignoring ' + str(e) + '"') 88 | 89 | 90 | -------------------------------------------------------------------------------- /utils/visual_hico.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | 8 | import pickle 9 | import json 10 | import numpy as np 11 | import cv2 12 | import os 13 | import sys 14 | import argparse 15 | 16 | import matplotlib as mpl 17 | mpl.use('Agg') 18 | 19 | 20 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 21 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 22 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 23 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 24 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 25 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 26 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 
'dining table', '', '', 'toilet', '', 'tv', 27 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 28 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 29 | 'toothbrush'] 30 | 31 | with open("/DATA1/Benchmark/hico_20160224_det/hico_processed/hoi_list.json", "r") as file: 32 | hois = json.load(file) 33 | num_hois = len(hois) 34 | union_action_list = {} 35 | for i, item in enumerate(hois): 36 | union_action_list[i] = item["verb"] + "_" + item["object"] 37 | 38 | 39 | def visual_hico(preds_inst, detection, image_id): 40 | output_dir = "vis/%d" % image_id 41 | if not os.path.exists(output_dir): 42 | os.mkdir(output_dir) 43 | 44 | dpi = 80 45 | 46 | im_file = "./datasets/hico_20160224_det/images/test2015/HICO_test2015_" + (str(image_id)).zfill(8) + ".jpg" 47 | 48 | im_data = plt.imread(im_file) 49 | height, width, nbands = im_data.shape 50 | figsize = width / float(dpi), height / float(dpi) 51 | 52 | fig = plt.figure(figsize=figsize) 53 | ax = fig.add_axes([0, 0, 1, 1]) 54 | ax.axis('off') 55 | ax.imshow(im_data, interpolation='nearest') 56 | 57 | for inst_id in range(len(preds_inst["rois"])): 58 | box = preds_inst["rois"][inst_id] 59 | ax.add_patch( 60 | plt.Rectangle((box[0], box[1]), 61 | box[2] - box[0], 62 | box[3] - box[1], fill=False, 63 | edgecolor="orange", linewidth=3) 64 | ) 65 | text = obj_list[preds_inst["obj_class_ids"][inst_id]] + " ," + "%.3f"%preds_inst["obj_scores"][inst_id] 66 | ax.text(box[0] + 10, box[1] + 25, 67 | text, fontsize=20, color='blue') 68 | fig.savefig(os.path.join(output_dir, "instances.jpg")) 69 | plt.close() 70 | 71 | for ele_id, ele in enumerate(detection): 72 | role_scores = ele[3] 73 | role_scores_idx_sort = np.argsort(role_scores)[::-1] 74 | 75 | if role_scores.max() < 1: 76 | continue 77 | 78 | fig = plt.figure(figsize=figsize) 79 | ax = fig.add_axes([0, 0, 1, 1]) 80 | ax.axis('off') 81 | ax.imshow(im_data, interpolation='nearest') 82 | 83 | H_box = ele[0] 84 | O_box = ele[1] 85 | 86 | ax.add_patch( 87 | plt.Rectangle((H_box[0], H_box[1]), 88 | H_box[2] - H_box[0], 89 | H_box[3] - H_box[1], fill=False, 90 | edgecolor="red", linewidth=3) 91 | ) 92 | 93 | ax.add_patch( 94 | plt.Rectangle((O_box[0], O_box[1]), 95 | O_box[2] - O_box[0], 96 | O_box[3] - O_box[1], fill=False, 97 | edgecolor="blue", linewidth=3) 98 | ) 99 | 100 | for action_count in range(5): 101 | text = union_action_list[role_scores_idx_sort[action_count]] + ", %.2f" % role_scores[role_scores_idx_sort[action_count]] 102 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 103 | text, fontsize=16, color='green') 104 | 105 | fig.savefig(os.path.join(output_dir, "%d.jpg"%ele_id)) 106 | 107 | plt.close() 108 | 109 | 110 | 111 | if __name__=="__main__": 112 | arg = argparse.ArgumentParser() 113 | arg.add_argument('--det_file', type=str, default=None) 114 | args = ap.parse_args() 115 | 116 | detection = pickle.load(open(args.det_file, "rb")) 117 | visualize(detection) 118 | -------------------------------------------------------------------------------- /efficientdet/help_function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | label_to_class = {0: ('hold', 'obj'), 1: ('sit', 'instr'), 2: ('ride', 'instr'), 3: ('look', 'obj'), 4 | 4: ('hit', 'instr'), 5: ('hit', 'obj'), 6: ('eat', 'obj'), 7: ('eat', 'instr'), 5 | 8: ('jump', 'instr'), 9: ('lay', 'instr'), 10: ('talk_on_phone', 'instr'), 6 | 11: ('carry', 'obj'), 12: ('throw', 'obj'), 13: 
('catch', 'obj'), 14: ('cut', 'instr'), 7 | 15: ('cut', 'obj'), 16: ('work_on_computer', 'instr'), 17: ('ski', 'instr'), 8 | 18: ('surf', 'instr'), 19: ('skateboard', 'instr'), 20: ('drink', 'instr'), 9 | 21: ('kick', 'obj'), 22: ('point', 'instr'), 23: ('read', 'obj'), 24: ('snowboard', 'instr')} 10 | 11 | sub_label_to_class = {0: 'hold', 1: 'stand', 2: 'sit', 3: 'ride', 4: 'walk', 5: 'look', 6: 'hit', 12 | 7: 'eat', 8: 'jump', 9: 'lay', 10: 'talk_on_phone', 11: 'carry', 12: 'throw', 13 | 13: 'catch', 14: 'cut', 15: 'run', 16: 'work_on_computer', 17: 'ski', 18: 'surf', 14 | 19: 'skateboard', 20: 'smile', 21: 'drink', 22: 'kick', 23: 'point', 24: 'read', 15 | 25: 'snowboard'} 16 | 17 | obj_label_to_class = {26: ('hold', 'obj'), 27: ('sit', 'instr'), 28: ('ride', 'instr'), 29: ('look', 'obj'), 18 | 30: ('hit', 'instr'), 31: ('hit', 'obj'), 32: ('eat', 'obj'), 33: ('eat', 'instr'), 19 | 34: ('jump', 'instr'), 35: ('lay', 'instr'), 36: ('talk_on_phone', 'instr'), 20 | 37: ('carry', 'obj'), 38: ('throw', 'obj'), 39: ('catch', 'obj'), 40: ('cut', 'instr'), 21 | 41: ('cut', 'obj'), 42: ('work_on_computer', 'instr'), 43: ('ski', 'instr'), 22 | 44: ('surf', 'instr'), 45: ('skateboard', 'instr'), 46: ('drink', 'instr'), 23 | 47: ('kick', 'obj'), 48: ('point', 'instr'), 49: ('read', 'obj'), 50: ('snowboard', 'instr')} 24 | 25 | sub_union_map = np.zeros(len(label_to_class), dtype=np.uint8) 26 | for uid in label_to_class: 27 | for sid in sub_label_to_class: 28 | if sub_label_to_class[sid] == label_to_class[uid][0]: 29 | sub_union_map[uid] = sid 30 | break 31 | 32 | 33 | def to_onehot(label, label_num): 34 | if isinstance(label, int) or isinstance(id, np.int32) or isinstance(id, np.int64): 35 | tmp = np.zeros(label_num) 36 | tmp[label] = 1 37 | elif isinstance(label, list) or isinstance(id, np.ndarray): 38 | tmp = np.zeros(label_num) 39 | label = np.array(label) 40 | assert len(label.shape) == 1 41 | if label.shape[0] > 0: 42 | tmp[label] = 1 43 | else: 44 | raise (Exception, "Only int or list is allowed to transform to one hot") 45 | return tmp 46 | 47 | 48 | def single_iou(a, b, need_area = False): 49 | # a(x1, y1, x2, y2) 50 | # b(x1, y1, x2, y2) 51 | 52 | area = (b[2] - b[0]) * (b[3] - b[1]) 53 | iw = min(a[2], b[2]) - max(a[0], b[0]) 54 | ih = min(a[3], b[3]) - max(a[1], b[1]) 55 | iw = max(iw, 0) 56 | ih = max(ih, 0) 57 | ua = (a[2] - a[0]) * (a[3] - a[1]) + area - iw * ih 58 | ua = max(ua, 1e-8) 59 | 60 | intersection = iw * ih 61 | IoU = intersection / ua 62 | if need_area: 63 | return IoU, intersection, ua 64 | else: 65 | return IoU 66 | 67 | 68 | def single_ioa(a, b, need_area = False): 69 | # a(x1, y1, x2, y2) 70 | # b(x1, y1, x2, y2) 71 | 72 | area = (b[2] - b[0]) * (b[3] - b[1]) 73 | iw = min(a[2], b[2]) - max(a[0], b[0]) 74 | ih = min(a[3], b[3]) - max(a[1], b[1]) 75 | 76 | iw = max(iw, 0) 77 | ih = max(ih, 0) 78 | 79 | area = max(area, 1e-8) 80 | intersection = iw * ih 81 | IoA = intersection / area 82 | 83 | if need_area: 84 | return IoA, intersection, area 85 | else: 86 | return IoA 87 | 88 | 89 | def single_inter(a, b): 90 | inter = [max(a[0], b[0]), max(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])] 91 | if inter[0] > inter[2] or inter[1] > inter[3]: 92 | inter = [0.0, 0.0, 0.0, 0.0] 93 | return np.array(inter) 94 | 95 | 96 | def single_union(a, b): 97 | inter = [min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])] 98 | if inter[0] > inter[2] or inter[1] > inter[3]: 99 | inter = [0.0, 0.0, 0.0, 0.0] 100 | return np.array(inter) 101 | 102 | 103 | def 
transform_action(inst_score, mode): 104 | assert mode in {"subject", "object"} 105 | 106 | num_union_act = len(label_to_class) 107 | num_sub_act = len(sub_label_to_class) 108 | num_obj_act = len(obj_label_to_class) 109 | 110 | res = np.zeros(num_union_act) 111 | ids = np.arange(num_union_act) 112 | 113 | if mode == "object": 114 | res = inst_score[num_sub_act:] 115 | return res 116 | else: 117 | res[ids] = inst_score[sub_union_map[ids]] 118 | return res -------------------------------------------------------------------------------- /utils/sync_batchnorm/comm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : comm.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import queue 12 | import collections 13 | import threading 14 | 15 | __all__ = ['FutureResult', 'SlavePipe', 'SyncMaster'] 16 | 17 | 18 | class FutureResult(object): 19 | """A thread-safe future implementation. Used only as one-to-one pipe.""" 20 | 21 | def __init__(self): 22 | self._result = None 23 | self._lock = threading.Lock() 24 | self._cond = threading.Condition(self._lock) 25 | 26 | def put(self, result): 27 | with self._lock: 28 | assert self._result is None, 'Previous result has\'t been fetched.' 29 | self._result = result 30 | self._cond.notify() 31 | 32 | def get(self): 33 | with self._lock: 34 | if self._result is None: 35 | self._cond.wait() 36 | 37 | res = self._result 38 | self._result = None 39 | return res 40 | 41 | 42 | _MasterRegistry = collections.namedtuple('MasterRegistry', ['result']) 43 | _SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result']) 44 | 45 | 46 | class SlavePipe(_SlavePipeBase): 47 | """Pipe for master-slave communication.""" 48 | 49 | def run_slave(self, msg): 50 | self.queue.put((self.identifier, msg)) 51 | ret = self.result.get() 52 | self.queue.put(True) 53 | return ret 54 | 55 | 56 | class SyncMaster(object): 57 | """An abstract `SyncMaster` object. 58 | 59 | - During the replication, as the data parallel will trigger an callback of each module, all slave devices should 60 | call `register(id)` and obtain an `SlavePipe` to communicate with the master. 61 | - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected, 62 | and passed to a registered callback. 63 | - After receiving the messages, the master device should gather the information and determine to message passed 64 | back to each slave devices. 65 | """ 66 | 67 | def __init__(self, master_callback): 68 | """ 69 | 70 | Args: 71 | master_callback: a callback to be invoked after having collected messages from slave devices. 72 | """ 73 | self._master_callback = master_callback 74 | self._queue = queue.Queue() 75 | self._registry = collections.OrderedDict() 76 | self._activated = False 77 | 78 | def __getstate__(self): 79 | return {'master_callback': self._master_callback} 80 | 81 | def __setstate__(self, state): 82 | self.__init__(state['master_callback']) 83 | 84 | def register_slave(self, identifier): 85 | """ 86 | Register an slave device. 87 | 88 | Args: 89 | identifier: an identifier, usually is the device id. 90 | 91 | Returns: a `SlavePipe` object which can be used to communicate with the master device. 
92 | 93 | """ 94 | if self._activated: 95 | assert self._queue.empty(), 'Queue is not clean before next initialization.' 96 | self._activated = False 97 | self._registry.clear() 98 | future = FutureResult() 99 | self._registry[identifier] = _MasterRegistry(future) 100 | return SlavePipe(identifier, self._queue, future) 101 | 102 | def run_master(self, master_msg): 103 | """ 104 | Main entry for the master device in each forward pass. 105 | The messages were first collected from each devices (including the master device), and then 106 | an callback will be invoked to compute the message to be sent back to each devices 107 | (including the master device). 108 | 109 | Args: 110 | master_msg: the message that the master want to send to itself. This will be placed as the first 111 | message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example. 112 | 113 | Returns: the message to be sent back to the master device. 114 | 115 | """ 116 | self._activated = True 117 | 118 | intermediates = [(0, master_msg)] 119 | for i in range(self.nr_slaves): 120 | intermediates.append(self._queue.get()) 121 | 122 | results = self._master_callback(intermediates) 123 | assert results[0][0] == 0, 'The first result should belongs to the master.' 124 | 125 | for i, res in results: 126 | if i == 0: 127 | continue 128 | self._registry[i].result.put(res) 129 | 130 | for i in range(self.nr_slaves): 131 | assert self._queue.get() is True 132 | 133 | return results[0][1] 134 | 135 | @property 136 | def nr_slaves(self): 137 | return len(self._registry) 138 | -------------------------------------------------------------------------------- /efficientdet/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | class BBoxTransform(nn.Module): 8 | def forward(self, anchors, regression): 9 | """ 10 | decode_box_outputs adapted from https://github.com/google/automl/blob/master/efficientdet/anchors.py 11 | 12 | Args: 13 | anchors: [batchsize, boxes, (y1, x1, y2, x2)] 14 | regression: [batchsize, boxes, (dy, dx, dh, dw)] 15 | 16 | Returns: 17 | 18 | """ 19 | y_centers_a = (anchors[..., 0] + anchors[..., 2]) / 2 20 | x_centers_a = (anchors[..., 1] + anchors[..., 3]) / 2 21 | ha = anchors[..., 2] - anchors[..., 0] 22 | wa = anchors[..., 3] - anchors[..., 1] 23 | 24 | w = regression[..., 3].exp() * wa 25 | h = regression[..., 2].exp() * ha 26 | 27 | y_centers = regression[..., 0] * ha + y_centers_a 28 | x_centers = regression[..., 1] * wa + x_centers_a 29 | 30 | ymin = y_centers - h / 2. 31 | xmin = x_centers - w / 2. 32 | ymax = y_centers + h / 2. 33 | xmax = x_centers + w / 2. 
34 | 35 | return torch.stack([xmin, ymin, xmax, ymax], dim=2) 36 | 37 | 38 | class ClipBoxes(nn.Module): 39 | 40 | def __init__(self): 41 | super(ClipBoxes, self).__init__() 42 | 43 | def forward(self, boxes, img): 44 | batch_size, num_channels, height, width = img.shape 45 | 46 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 47 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 48 | 49 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width - 1) 50 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height - 1) 51 | 52 | return boxes 53 | 54 | 55 | class Anchors(nn.Module): 56 | """ 57 | adapted and modified from https://github.com/google/automl/blob/master/efficientdet/anchors.py by Zylo117 58 | """ 59 | 60 | def __init__(self, anchor_scale=4., pyramid_levels=None, **kwargs): 61 | super().__init__() 62 | self.anchor_scale = anchor_scale 63 | 64 | if pyramid_levels is None: 65 | self.pyramid_levels = [3, 4, 5, 6, 7] 66 | 67 | self.strides = kwargs.get('strides', [2 ** x for x in self.pyramid_levels]) 68 | self.scales = np.array(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])) 69 | self.ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]) 70 | 71 | self.last_anchors = {} 72 | self.last_shape = None 73 | 74 | def forward(self, image, dtype=torch.float32): 75 | """Generates multiscale anchor boxes. 76 | 77 | Args: 78 | image_size: integer number of input image size. The input image has the 79 | same dimension for width and height. The image_size should be divided by 80 | the largest feature stride 2^max_level. 81 | anchor_scale: float number representing the scale of size of the base 82 | anchor to the feature stride 2^level. 83 | anchor_configs: a dictionary with keys as the levels of anchors and 84 | values as a list of anchor configuration. 85 | 86 | Returns: 87 | anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all 88 | feature levels. 89 | Raises: 90 | ValueError: input size must be the multiple of largest feature stride. 
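        Note: with the default pyramid_levels [3, 4, 5, 6, 7] and 3 scales x 3 ratios = 9 anchors
        per location, a 512x512 input yields (64^2 + 32^2 + 16^2 + 8^2 + 4^2) * 9 = 49104 anchors,
        returned as a tensor of shape (1, 49104, 4) in (y1, x1, y2, x2) order.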
91 | """ 92 | image_shape = image.shape[2:] 93 | 94 | if image_shape == self.last_shape and image.device in self.last_anchors: 95 | return self.last_anchors[image.device] 96 | 97 | if self.last_shape is None or self.last_shape != image_shape: 98 | self.last_shape = image_shape # (h, w) 99 | 100 | if dtype == torch.float16: 101 | dtype = np.float16 102 | else: 103 | dtype = np.float32 104 | 105 | boxes_all = [] 106 | for stride in self.strides: 107 | boxes_level = [] 108 | for scale, ratio in itertools.product(self.scales, self.ratios): # scales中每个元素依次与ratios中每个元素组合 109 | if image_shape[1] % stride != 0: 110 | raise ValueError('input size must be divided by the stride.') 111 | base_anchor_size = self.anchor_scale * stride * scale 112 | anchor_size_x_2 = base_anchor_size * ratio[0] / 2.0 113 | anchor_size_y_2 = base_anchor_size * ratio[1] / 2.0 114 | 115 | x = np.arange(stride / 2, image_shape[1], stride) 116 | y = np.arange(stride / 2, image_shape[0], stride) 117 | xv, yv = np.meshgrid(x, y) # 原图anchor box的(x, y)坐标中点 118 | xv = xv.reshape(-1) 119 | yv = yv.reshape(-1) 120 | 121 | # y1,x1,y2,x2 122 | boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2, 123 | yv + anchor_size_y_2, xv + anchor_size_x_2)) # (4, n_boxes) 124 | boxes = np.swapaxes(boxes, 0, 1) # (n_boxes, 4) 125 | boxes_level.append(np.expand_dims(boxes, axis=1)) 126 | # concat anchors on the same level to the reshape NxAx4 127 | boxes_level = np.concatenate(boxes_level, axis=1) 128 | boxes_all.append(boxes_level.reshape([-1, 4])) 129 | 130 | anchor_boxes = np.vstack(boxes_all) # (batch_size, n_boxes*A, 4) 131 | 132 | anchor_boxes = torch.from_numpy(anchor_boxes.astype(dtype)).to(image.device) 133 | anchor_boxes = anchor_boxes.unsqueeze(0) 134 | 135 | # save it for later use to reduce overhead 136 | self.last_anchors[image.device] = anchor_boxes 137 | return anchor_boxes 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection 2 | 3 |
4 | 5 |
6 | 7 | Official code implementation for the paper "DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection" (AAAI 2021) [paper](https://arxiv.org/abs/2010.01005). 8 | 9 | The code is developed based on the architecture of [zylo117/Yet-Another-EfficientDet-Pytorch](https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch). We also follow some data pre-processing and model evaluation methods in [BigRedT/no_frills_hoi_det](https://github.com/BigRedT/no_frills_hoi_det) and [vt-vl-lab/iCAN](https://github.com/vt-vl-lab/iCAN). We sincerely thank the authors for their excellent work. 10 | 11 | 12 | 13 | ## Checklist 14 | 15 | + [x] Training and Test for V-COCO dataset 16 | + [x] Training and Test for HICO-DET dataset 17 | + [x] Demonstration on images 18 | + [ ] Demonstration on videos 19 | + [ ] More efficient voting strategy for inference using GPU 20 | 21 | ## Prerequisites 22 | 23 | The code was tested with Python 3.6, PyTorch 1.5.1, torchvision 0.6.1, CUDA 10.2, and Ubuntu 18.04. 24 | 25 | ## Installation 26 | 27 | 1. Clone this repository: 28 | 29 | ``` 30 | git clone https://github.com/MVIG-SJTU/DIRV.git 31 | ``` 32 | 33 | 2. Install PyTorch and torchvision: 34 | 35 | ``` 36 | pip install torch==1.5.1 torchvision==0.6.1 37 | ``` 38 | 39 | 3. Install other necessary packages: 40 | 41 | ``` 42 | pip install pycocotools numpy opencv-python tqdm tensorboard tensorboardX pyyaml webcolors 43 | ``` 44 | 45 | ## Data Preparation 46 | 47 | ### V-COCO Dataset: 48 | 49 | Download the [V-COCO](https://github.com/s-gupta/v-coco) dataset following the official instructions. 50 | 51 | You can find the file new_prior_mask.pkl [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). Each element in it is the prior probability that a verb (e.g. eat) is associated with an object category (e.g. apple). You should also download the combined training and validation set annotations instances_trainval2014.json [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing), and put it in datasets/vcoco/coco/annotations. 52 | 53 | ### HICO-DET Dataset: 54 | 55 | Download the [HICO-DET](http://www-personal.umich.edu/~ywchao/hico/) dataset from the official website. 56 | 57 | We transform the annotations of the HICO-DET dataset to JSON format following [BigRedT/no_frills_hoi_det](https://github.com/BigRedT/no_frills_hoi_det). You can directly download the processed annotations from [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). 58 | 59 | We count the number of training samples for each category in [hico_processed/hico-det_verb_count.json](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). It serves as a weight when calculating the loss. 60 | 61 | ### Dataset Structure: 62 | 63 | Make sure to put the files in the following structure: 64 | 65 | ``` 66 | |-- datasets 67 | | |-- vcoco 68 | | | |-- data 69 | | | | |-- splits 70 | | | | |-- vcoco 71 | | | | 72 | | | |-- coco 73 | | | | |-- images 74 | | | | |-- annotations 75 | | | |-- new_prior_mask.pkl 76 | | |-- hico_20160224_det 77 | | | |-- images 78 | | | |-- hico_processed 79 | ``` 80 | 81 | ## Demonstration 82 | ### Demonstration on Images 83 | 84 | ``` 85 | CUDA_VISIBLE_DEVICES=0 python demo.py --image_path /path/to/a/single/image 86 | ``` 87 | 88 | ### Demonstration on Videos 89 | 90 | Coming soon.
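The training and test commands below assume the dataset layout from the Data Preparation section. A minimal sketch to sanity-check that layout beforehand (not part of the original scripts; the paths simply mirror the structure listed above):

```
import os

expected = [
    "datasets/vcoco/data/splits",
    "datasets/vcoco/data/vcoco",
    "datasets/vcoco/coco/images",
    "datasets/vcoco/coco/annotations/instances_trainval2014.json",
    "datasets/vcoco/new_prior_mask.pkl",
    "datasets/hico_20160224_det/images",
    "datasets/hico_20160224_det/hico_processed",
]
missing = [p for p in expected if not os.path.exists(p)]
print("dataset layout OK" if not missing else "missing: " + ", ".join(missing))
```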
91 | 92 | ## Pre-trained Weights 93 | 94 | You can download the pre-trained weights for V-COCO dataset (vcoco_best.pth) and HICO-DET dataset (hico-det_best.pth) [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing). 95 | 96 | ## Training 97 | 98 | Download the pre-trained weight of our backbone (efficientdet-d3_vcoco.pth and efficientdet-d3_hico-det.pth) [here](https://drive.google.com/drive/folders/14xXUb5l_SugfWiRXX3o8jgKXMNac1c7_?usp=sharing), and save it in `weights/` directory. 99 | 100 | ### Training on V-COCO Dataset 101 | 102 | ``` 103 | CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py -p vcoco --batch_size 32 --load_weights weights/efficientdet-d3_vcoco.pth 104 | ``` 105 | 106 | ### Training on HICO-DET Dataset 107 | 108 | ``` 109 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py -p hico-det --batch_size 48 --load_weights weights/efficientdet-d3_hico-det.pth 110 | ``` 111 | 112 | You may also adjust the saving directory and GPU number in `projects/vcoco.yaml` and `projects/hico-det.yaml` or create your own projects in `projects/`. 113 | 114 | ## Test 115 | 116 | ### Test on V-COCO Dataset 117 | 118 | ``` 119 | CUDA_VISIBLE_DEVICES=0 python test_vcoco.py -w $path to the checkpoint$ 120 | ``` 121 | 122 | ### Test on HICO-DET Dataset 123 | 124 | ``` 125 | CUDA_VISIBLE_DEVICES=0 python test_hico-det.py -w $path to the checkpoint$ 126 | ``` 127 | 128 | Then please follow the same procedures in [vt-vl-lab/iCAN](https://github.com/vt-vl-lab/iCAN) to evaluate the result on HICO-DET dataset. 129 | 130 | ## Citation 131 | 132 | If you found our paper or code useful for your research, please cite the following paper: 133 | ``` 134 | @inproceedings{fang2020dirv, 135 | title={DIRV: Dense Interaction Region Voting for End-to-End Human-Object Interaction Detection}, 136 | author={Fang, Hao-Shu and Xie, Yichen and Shao, Dian and Lu, Cewu}, 137 | year={2021}, 138 | booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)} 139 | } 140 | ``` 141 | 142 | -------------------------------------------------------------------------------- /utils/visual.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | 8 | import pickle 9 | import json 10 | import numpy as np 11 | import cv2 12 | import os 13 | import sys 14 | import argparse 15 | 16 | import matplotlib as mpl 17 | mpl.use('Agg') 18 | 19 | 20 | def visual(detection, image_id=None): 21 | if image_id is None: 22 | image_id = detection[0]["image_id"] 23 | 24 | cc = plt.get_cmap('hsv', lut=6) 25 | dpi = 80 26 | 27 | im_file = './datasets/vcoco/coco/images/val2014/COCO_val2014_' + (str(image_id)).zfill(12) + '.jpg' 28 | im_data = plt.imread(im_file) 29 | height, width, nbands = im_data.shape 30 | figsize = width / float(dpi), height / float(dpi) 31 | fig = plt.figure(figsize=figsize) 32 | ax = fig.add_axes([0, 0, 1, 1]) 33 | ax.axis('off') 34 | ax.imshow(im_data, interpolation='nearest') 35 | 36 | HO_dic = {} 37 | HO_set = set() 38 | count = 0 39 | 40 | for ele in detection: 41 | if (ele['image_id'] == image_id): 42 | action_count = -1 43 | 44 | for action_key, action_value in ele.items(): 45 | if (action_key.split('_')[-1] != 'agent') and action_key != 'image_id' and action_key != 'person_box': 46 | if (not np.isnan(action_value[0])) and (action_value[4] > 0.01): 47 | O_box = 
action_value[:4] 48 | H_box = ele['person_box'] 49 | 50 | action_count += 1 51 | 52 | if tuple(O_box) not in HO_set: 53 | HO_dic[tuple(O_box)] = count 54 | HO_set.add(tuple(O_box)) 55 | count += 1 56 | if tuple(H_box) not in HO_set: 57 | HO_dic[tuple(H_box)] = count 58 | HO_set.add(tuple(H_box)) 59 | count += 1 60 | 61 | ax.add_patch( 62 | plt.Rectangle((H_box[0], H_box[1]), 63 | H_box[2] - H_box[0], 64 | H_box[3] - H_box[1], fill=False, 65 | edgecolor=cc(HO_dic[tuple(H_box)])[:3], linewidth=3) 66 | ) 67 | text = action_key.split('_')[0] + ', ' + "%.2f" % action_value[4] 68 | 69 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 70 | text, 71 | bbox=dict(facecolor=cc(HO_dic[tuple(O_box)])[:3], alpha=0.5), 72 | fontsize=16, color='white') 73 | 74 | ax.add_patch( 75 | plt.Rectangle((O_box[0], O_box[1]), 76 | O_box[2] - O_box[0], 77 | O_box[3] - O_box[1], fill=False, 78 | edgecolor=cc(HO_dic[tuple(O_box)])[:3], linewidth=3) 79 | ) 80 | ax.set(xlim=[0, width], ylim=[height, 0], aspect=1) 81 | fig.savefig("vis/%d.jpg"%image_id) 82 | 83 | 84 | def visual_demo(detection, im_path, save_path): 85 | cc = plt.get_cmap('hsv', lut=6) 86 | dpi = 80 87 | im_data = plt.imread(im_path) 88 | height, width, nbands = im_data.shape 89 | figsize = width / float(dpi), height / float(dpi) 90 | fig = plt.figure(figsize=figsize) 91 | ax = fig.add_axes([0, 0, 1, 1]) 92 | ax.axis('off') 93 | ax.imshow(im_data, interpolation='nearest') 94 | 95 | HO_dic = {} 96 | HO_set = set() 97 | count = 0 98 | 99 | for ele in detection: 100 | action_count = -1 101 | for action_key, action_value in ele.items(): 102 | if (action_key.split('_')[-1] != 'agent') and action_key != 'image_id' and action_key != 'person_box': 103 | if (not np.isnan(action_value[0])) and (action_value[4] > 0.01): 104 | O_box = action_value[:4] 105 | H_box = ele['person_box'] 106 | action_count += 1 107 | if tuple(O_box) not in HO_set: 108 | HO_dic[tuple(O_box)] = count 109 | HO_set.add(tuple(O_box)) 110 | count += 1 111 | if tuple(H_box) not in HO_set: 112 | HO_dic[tuple(H_box)] = count 113 | HO_set.add(tuple(H_box)) 114 | count += 1 115 | ax.add_patch( 116 | plt.Rectangle((H_box[0], H_box[1]), 117 | H_box[2] - H_box[0], 118 | H_box[3] - H_box[1], fill=False, 119 | edgecolor=cc(HO_dic[tuple(H_box)])[:3], linewidth=3) 120 | ) 121 | text = action_key.split('_')[0] + ', ' + "%.2f" % action_value[4] 122 | ax.text(H_box[0] + 10, H_box[1] + 25 + action_count * 35, 123 | text, 124 | bbox=dict(facecolor=cc(HO_dic[tuple(O_box)])[:3], alpha=0.5), 125 | fontsize=16, color='white') 126 | ax.add_patch( 127 | plt.Rectangle((O_box[0], O_box[1]), 128 | O_box[2] - O_box[0], 129 | O_box[3] - O_box[1], fill=False, 130 | edgecolor=cc(HO_dic[tuple(O_box)])[:3], linewidth=3) 131 | ) 132 | ax.set(xlim=[0, width], ylim=[height, 0], aspect=1) 133 | fig.savefig(save_path) 134 | 135 | 136 | if __name__=="__main__": 137 | arg = argparse.ArgumentParser() 138 | arg.add_argument('--det_file', type=str, default=None) 139 | args = ap.parse_args() 140 | 141 | detection = pickle.load(open(args.det_file, "rb")) 142 | visualize(detection) 143 | -------------------------------------------------------------------------------- /coco_eval.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | """ 4 | COCO-Style Evaluations 5 | 6 | put images here datasets/your_project_name/annotations/val_set_name/*.jpg 7 | put annotations here datasets/your_project_name/annotations/instances_{val_set_name}.json 8 | put weights here 
/path/to/your/weights/*.pth 9 | change compound_coef 10 | 11 | """ 12 | 13 | import json 14 | import os 15 | 16 | import argparse 17 | import torch 18 | import yaml 19 | from tqdm import tqdm 20 | from pycocotools.coco import COCO 21 | from pycocotools.cocoeval import COCOeval 22 | 23 | from backbone import EfficientDetBackbone 24 | from efficientdet.utils import BBoxTransform, ClipBoxes 25 | from utils.utils import preprocess, invert_affine, postprocess 26 | 27 | from efficientdet.vcoco_dataset import VCOCO_Dataset, Resizer, Normalizer, Augmenter, collater 28 | 29 | 30 | ap = argparse.ArgumentParser() 31 | ap.add_argument('-p', '--project', type=str, default='coco', help='project file that contains parameters') 32 | ap.add_argument('-c', '--compound_coef', type=int, default=0, help='coefficients of efficientdet') 33 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 34 | ap.add_argument('--nms_threshold', type=float, default=0.5, help='nms threshold, don\'t change it if not for testing purposes') 35 | ap.add_argument('--cuda', type=bool, default=True) 36 | ap.add_argument('--device', type=int, default=0) 37 | ap.add_argument('--float16', type=bool, default=False) 38 | ap.add_argument('--override', type=bool, default=True, help='override previous bbox results file if exists') 39 | args = ap.parse_args() 40 | 41 | compound_coef = args.compound_coef 42 | nms_threshold = args.nms_threshold 43 | use_cuda = args.cuda 44 | gpu = args.device 45 | use_float16 = args.float16 46 | override_prev_results = args.override 47 | project_name = args.project 48 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 49 | 50 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 51 | 52 | params = yaml.safe_load(open(f'projects/{project_name}.yml')) 53 | obj_list = params['obj_list'] 54 | 55 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 56 | 57 | 58 | def evaluate_coco(img_path, set_name, image_ids, coco, model, threshold=0.05): 59 | results = [] 60 | processed_image_ids = [] 61 | 62 | regressBoxes = BBoxTransform() 63 | clipBoxes = ClipBoxes() 64 | 65 | for image_id in tqdm(image_ids): 66 | image_info = coco.loadImgs(image_id)[0] 67 | image_path = img_path + image_info['file_name'] 68 | 69 | ori_imgs, framed_imgs, framed_metas = preprocess(image_path, max_size=input_sizes[compound_coef]) 70 | x = torch.from_numpy(framed_imgs[0]) 71 | 72 | if use_cuda: 73 | x = x.cuda(gpu) 74 | if use_float16: 75 | x = x.half() 76 | else: 77 | x = x.float() 78 | else: 79 | x = x.float() 80 | 81 | x = x.unsqueeze(0).permute(0, 3, 1, 2) 82 | features, regression, classification, anchors = model(x) 83 | 84 | preds = postprocess(x, 85 | anchors, regression, classification, 86 | regressBoxes, clipBoxes, 87 | threshold, nms_threshold) 88 | 89 | processed_image_ids.append(image_id) 90 | 91 | if not preds: 92 | continue 93 | 94 | preds = invert_affine(framed_metas, preds)[0] 95 | 96 | scores = preds['scores'] 97 | class_ids = preds['class_ids'] 98 | rois = preds['rois'] 99 | 100 | if rois.shape[0] > 0: 101 | # x1,y1,x2,y2 -> x1,y1,w,h 102 | rois[:, 2] -= rois[:, 0] 103 | rois[:, 3] -= rois[:, 1] 104 | 105 | bbox_score = scores 106 | 107 | for roi_id in range(rois.shape[0]): 108 | score = float(bbox_score[roi_id]) 109 | label = int(class_ids[roi_id]) 110 | box = rois[roi_id, :] 111 | 112 | if score < threshold: 113 | break 114 | image_result = { 115 | 'image_id': image_id, 116 | 'category_id': 
label + 1, 117 | 'score': float(score), 118 | 'bbox': box.tolist(), 119 | } 120 | 121 | results.append(image_result) 122 | 123 | if not len(results): 124 | raise Exception('the model does not provide any valid output, check model architecture and the data input') 125 | 126 | # write output 127 | filepath = f'{set_name}_bbox_results.json' 128 | if os.path.exists(filepath): 129 | os.remove(filepath) 130 | json.dump(results, open(filepath, 'w'), indent=4) 131 | 132 | return processed_image_ids 133 | 134 | 135 | def _eval(coco_gt, image_ids, pred_json_path): 136 | # load results in COCO evaluation tool 137 | coco_pred = coco_gt.loadRes(pred_json_path) 138 | 139 | # run COCO evaluation 140 | print('BBox') 141 | coco_eval = COCOeval(coco_gt, coco_pred, 'bbox') 142 | coco_eval.params.imgIds = image_ids 143 | coco_eval.evaluate() 144 | coco_eval.accumulate() 145 | coco_eval.summarize() 146 | 147 | 148 | if __name__ == '__main__': 149 | SET_NAME = params['val_set'] 150 | VAL_GT = f'datasets/{params["project_name"]}/annotations/instances_{SET_NAME}.json' 151 | VAL_IMGS = f'datasets/{params["project_name"]}/{SET_NAME}/' 152 | MAX_IMAGES = 10000 153 | coco_gt = COCO(VAL_GT) 154 | image_ids = coco_gt.getImgIds()[:MAX_IMAGES] 155 | 156 | if override_prev_results or not os.path.exists(f'{SET_NAME}_bbox_results.json'): 157 | model = EfficientDetBackbone(compound_coef=compound_coef, num_classes=len(obj_list), 158 | ratios=eval(params['anchors_ratios']), scales=eval(params['anchors_scales'])) 159 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 160 | model.requires_grad_(False) 161 | model.eval() 162 | 163 | if use_cuda: 164 | model.cuda(gpu) 165 | 166 | if use_float16: 167 | model.half() 168 | 169 | image_ids = evaluate_coco(VAL_IMGS, SET_NAME, image_ids, coco_gt, model) 170 | 171 | _eval(coco_gt, image_ids, f'{SET_NAME}_bbox_results.json') 172 | -------------------------------------------------------------------------------- /efficientdet/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | 5 | from torch.utils.data import Dataset, DataLoader 6 | from pycocotools.coco import COCO 7 | import cv2 8 | 9 | 10 | class CocoDataset(Dataset): 11 | def __init__(self, root_dir, set='train2017', transform=None): 12 | 13 | self.root_dir = root_dir 14 | self.set_name = set 15 | self.transform = transform 16 | 17 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json')) 18 | self.image_ids = self.coco.getImgIds() 19 | 20 | self.load_classes() 21 | 22 | def load_classes(self): 23 | 24 | # load class names (name -> label) 25 | categories = self.coco.loadCats(self.coco.getCatIds()) 26 | categories.sort(key=lambda x: x['id']) 27 | 28 | self.classes = {} 29 | self.coco_labels = {} 30 | self.coco_labels_inverse = {} 31 | for c in categories: 32 | self.coco_labels[len(self.classes)] = c['id'] 33 | self.coco_labels_inverse[c['id']] = len(self.classes) 34 | self.classes[c['name']] = len(self.classes) 35 | 36 | # also load the reverse (label -> name) 37 | self.labels = {} 38 | for key, value in self.classes.items(): 39 | self.labels[value] = key 40 | 41 | def __len__(self): 42 | return len(self.image_ids) 43 | 44 | def __getitem__(self, idx): 45 | 46 | img = self.load_image(idx) 47 | annot = self.load_annotations(idx) 48 | sample = {'img': img, 'annot': annot} 49 | if self.transform: 50 | sample = self.transform(sample) 51 | return sample 52 | 53 | def 
load_image(self, image_index): 54 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 55 | path = os.path.join(self.root_dir, self.set_name, image_info['file_name']) 56 | img = cv2.imread(path) 57 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 58 | 59 | return img.astype(np.float32) / 255. 60 | 61 | def load_annotations(self, image_index): 62 | # get ground truth annotations 63 | annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) 64 | annotations = np.zeros((0, 5)) 65 | 66 | # some images appear to miss annotations 67 | if len(annotations_ids) == 0: 68 | return annotations 69 | 70 | # parse annotations 71 | coco_annotations = self.coco.loadAnns(annotations_ids) 72 | for idx, a in enumerate(coco_annotations): 73 | 74 | # some annotations have basically no width / height, skip them 75 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 76 | continue 77 | 78 | annotation = np.zeros((1, 5)) 79 | annotation[0, :4] = a['bbox'] 80 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 81 | annotations = np.append(annotations, annotation, axis=0) 82 | 83 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 84 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 85 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 86 | 87 | return annotations 88 | 89 | def coco_label_to_label(self, coco_label): 90 | return self.coco_labels_inverse[coco_label] 91 | 92 | def label_to_coco_label(self, label): 93 | return self.coco_labels[label] 94 | 95 | 96 | def collater(data): 97 | imgs = [s['img'] for s in data] 98 | annots = [s['annot'] for s in data] 99 | scales = [s['scale'] for s in data] 100 | 101 | imgs = torch.from_numpy(np.stack(imgs, axis=0)) 102 | 103 | max_num_annots = max(annot.shape[0] for annot in annots) 104 | 105 | if max_num_annots > 0: 106 | 107 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 108 | 109 | if max_num_annots > 0: 110 | for idx, annot in enumerate(annots): 111 | if annot.shape[0] > 0: 112 | annot_padded[idx, :annot.shape[0], :] = annot 113 | else: 114 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 115 | 116 | imgs = imgs.permute(0, 3, 1, 2) 117 | 118 | return {'img': imgs, 'annot': annot_padded, 'scale': scales} 119 | 120 | 121 | class Resizer(object): 122 | """Convert ndarrays in sample to Tensors.""" 123 | 124 | def __init__(self, img_size=512): 125 | self.img_size = img_size 126 | 127 | def __call__(self, sample): 128 | image, annots = sample['img'], sample['annot'] 129 | height, width, _ = image.shape 130 | if height > width: 131 | scale = self.img_size / height 132 | resized_height = self.img_size 133 | resized_width = int(width * scale) 134 | else: 135 | scale = self.img_size / width 136 | resized_height = int(height * scale) 137 | resized_width = self.img_size 138 | 139 | image = cv2.resize(image, (resized_width, resized_height), interpolation=cv2.INTER_LINEAR) 140 | 141 | new_image = np.zeros((self.img_size, self.img_size, 3)) 142 | new_image[0:resized_height, 0:resized_width] = image 143 | 144 | annots[:, :4] *= scale 145 | 146 | return {'img': torch.from_numpy(new_image).to(torch.float32), 'annot': torch.from_numpy(annots), 'scale': scale} 147 | 148 | 149 | class Augmenter(object): 150 | """Convert ndarrays in sample to Tensors.""" 151 | 152 | def __call__(self, sample, flip_x=0.5): 153 | if np.random.rand() < flip_x: 154 | image, annots = sample['img'], sample['annot'] 155 | image = image[:, ::-1, :] 156 | 157 | rows, cols, channels = image.shape 158 | 159 | x1 = annots[:, 0].copy() 
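            # A horizontal flip maps x -> cols - x, so the flipped box becomes x1' = cols - x2 and
            # x2' = cols - x1; the temporary copy below keeps the original x1 around while both
            # columns are overwritten.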
160 | x2 = annots[:, 2].copy() 161 | 162 | x_tmp = x1.copy() 163 | 164 | annots[:, 0] = cols - x2 165 | annots[:, 2] = cols - x_tmp 166 | 167 | sample = {'img': image, 'annot': annots} 168 | 169 | return sample 170 | 171 | 172 | class Normalizer(object): 173 | 174 | def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): 175 | self.mean = np.array([[mean]]) 176 | self.std = np.array([[std]]) 177 | 178 | def __call__(self, sample): 179 | image, annots = sample['img'], sample['annot'] 180 | 181 | return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots} 182 | -------------------------------------------------------------------------------- /Generate_HICO_detection.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow iCAN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Chen Gao 5 | # -------------------------------------------------------- 6 | 7 | """ 8 | Change the HICO-DET detection results to the right format. 9 | """ 10 | 11 | import pickle 12 | import numpy as np 13 | import scipy.io as sio 14 | import os 15 | 16 | def save_HICO(HICO, HICO_dir, classid, begin, finish): 17 | print("class id:", classid) 18 | 19 | all_boxes = [] 20 | for i in range(finish - begin + 1): 21 | total = [] 22 | score = [] 23 | for key, value in HICO.items(): 24 | for element in value: 25 | if element[2] == classid: 26 | temp = [] 27 | temp.append(element[0].tolist()) # Human box 28 | temp.append(element[1].tolist()) # Object box 29 | temp.append(int(key)) # image id 30 | temp.append(int(i)) # action id (0-599) 31 | temp.append(element[3][begin - 1 + i] * element[4] * element[5]) 32 | 33 | total.append(temp) 34 | score.append(element[3][begin - 1 + i] * element[4] * element[5]) 35 | 36 | 37 | idx = np.argsort(score, axis=0)[::-1] 38 | for i_idx in range(min(len(idx),19999)): 39 | all_boxes.append(total[idx[i_idx]]) 40 | savefile = HICO_dir + 'detections_' + str(classid).zfill(2) + '.mat' 41 | sio.savemat(savefile, {'all_boxes':all_boxes}) 42 | 43 | def Generate_HICO_detection(output_file, HICO_dir): 44 | 45 | if not os.path.exists(HICO_dir): 46 | os.makedirs(HICO_dir) 47 | 48 | # Remove previous results 49 | filelist = [ f for f in os.listdir(HICO_dir)] 50 | for f in filelist: 51 | os.remove(os.path.join(HICO_dir, f)) 52 | 53 | HICO = pickle.load( open( output_file, "rb" ) ) 54 | 55 | save_HICO(HICO, HICO_dir, 1 ,161, 170) # 1 person 56 | save_HICO(HICO, HICO_dir, 2 ,11, 24 ) # 2 bicycle 57 | save_HICO(HICO, HICO_dir, 3 ,66, 76 ) # 3 car 58 | save_HICO(HICO, HICO_dir, 4 ,147, 160) # 4 motorcycle 59 | save_HICO(HICO, HICO_dir, 5 ,1, 10 ) # 5 airplane 60 | save_HICO(HICO, HICO_dir, 6 ,55, 65 ) # 6 bus 61 | save_HICO(HICO, HICO_dir, 7 ,187, 194) # 7 train 62 | save_HICO(HICO, HICO_dir, 8 ,568, 576) # 8 truck 63 | save_HICO(HICO, HICO_dir, 9 ,32, 46 ) # 9 boat 64 | save_HICO(HICO, HICO_dir, 10,563, 567) # 10 traffic light 65 | save_HICO(HICO, HICO_dir, 11,326,330) # 11 fire_hydrant 66 | save_HICO(HICO, HICO_dir, 12,503,506) # 12 stop_sign 67 | save_HICO(HICO, HICO_dir, 13,415,418) # 13 parking_meter 68 | save_HICO(HICO, HICO_dir, 14,244,247) # 14 bench 69 | save_HICO(HICO, HICO_dir, 15,25, 31) # 15 bird 70 | save_HICO(HICO, HICO_dir, 16,77, 86) # 16 cat 71 | save_HICO(HICO, HICO_dir, 17,112,129) # 17 dog 72 | save_HICO(HICO, HICO_dir, 18,130,146) # 18 horse 73 | save_HICO(HICO, HICO_dir, 19,175,186) # 19 sheep 74 | save_HICO(HICO, 
HICO_dir, 20,97,107) # 20 cow 75 | save_HICO(HICO, HICO_dir, 21,314,325) # 21 elephant 76 | save_HICO(HICO, HICO_dir, 22,236,239) # 22 bear 77 | save_HICO(HICO, HICO_dir, 23,596,600) # 23 zebra 78 | save_HICO(HICO, HICO_dir, 24,343,348) # 24 giraffe 79 | save_HICO(HICO, HICO_dir, 25,209,214) # 25 backpack 80 | save_HICO(HICO, HICO_dir, 26,577,584) # 26 umbrella 81 | save_HICO(HICO, HICO_dir, 27,353,356) # 27 handbag 82 | save_HICO(HICO, HICO_dir, 28,539,546) # 28 tie 83 | save_HICO(HICO, HICO_dir, 29,507,516) # 29 suitcase 84 | save_HICO(HICO, HICO_dir, 30,337,342) # 30 Frisbee 85 | save_HICO(HICO, HICO_dir, 31,464,474) # 31 skis 86 | save_HICO(HICO, HICO_dir, 32,475,483) # 32 snowboard 87 | save_HICO(HICO, HICO_dir, 33,489,502) # 33 sports_ball 88 | save_HICO(HICO, HICO_dir, 34,369,376) # 34 kite 89 | save_HICO(HICO, HICO_dir, 35,225,232) # 35 baseball_bat 90 | save_HICO(HICO, HICO_dir, 36,233,235) # 36 baseball_glove 91 | save_HICO(HICO, HICO_dir, 37,454,463) # 37 skateboard 92 | save_HICO(HICO, HICO_dir, 38,517,528) # 38 surfboard 93 | save_HICO(HICO, HICO_dir, 39,534,538) # 39 tennis_racket 94 | save_HICO(HICO, HICO_dir, 40,47,54) # 40 bottle 95 | save_HICO(HICO, HICO_dir, 41,589,595) # 41 wine_glass 96 | save_HICO(HICO, HICO_dir, 42,296,305) # 42 cup 97 | save_HICO(HICO, HICO_dir, 43,331,336) # 43 fork 98 | save_HICO(HICO, HICO_dir, 44,377,383) # 44 knife 99 | save_HICO(HICO, HICO_dir, 45,484,488) # 45 spoon 100 | save_HICO(HICO, HICO_dir, 46,253,257) # 46 bowl 101 | save_HICO(HICO, HICO_dir, 47,215,224) # 47 banana 102 | save_HICO(HICO, HICO_dir, 48,199,208) # 48 apple 103 | save_HICO(HICO, HICO_dir, 49,439,445) # 49 sandwich 104 | save_HICO(HICO, HICO_dir, 50,398,407) # 50 orange 105 | save_HICO(HICO, HICO_dir, 51,258,264) # 51 broccoli 106 | save_HICO(HICO, HICO_dir, 52,274,283) # 52 carrot 107 | save_HICO(HICO, HICO_dir, 53,357,363) # 53 hot_dog 108 | save_HICO(HICO, HICO_dir, 54,419,429) # 54 pizza 109 | save_HICO(HICO, HICO_dir, 55,306,313) # 55 donut 110 | save_HICO(HICO, HICO_dir, 56,265,273) # 56 cake 111 | save_HICO(HICO, HICO_dir, 57,87,92) # 57 chair 112 | save_HICO(HICO, HICO_dir, 58,93,96) # 58 couch 113 | save_HICO(HICO, HICO_dir, 59,171,174) # 59 potted_plant 114 | save_HICO(HICO, HICO_dir, 60,240,243) #60 bed 115 | save_HICO(HICO, HICO_dir, 61,108,111) #61 dining_table 116 | save_HICO(HICO, HICO_dir, 62,551,558) #62 toilet 117 | save_HICO(HICO, HICO_dir, 63,195,198) #63 TV 118 | save_HICO(HICO, HICO_dir, 64,384,389) #64 laptop 119 | save_HICO(HICO, HICO_dir, 65,394,397) #65 mouse 120 | save_HICO(HICO, HICO_dir, 66,435,438) #66 remote 121 | save_HICO(HICO, HICO_dir, 67,364,368) #67 keyboard 122 | save_HICO(HICO, HICO_dir, 68,284,290) #68 cell_phone 123 | save_HICO(HICO, HICO_dir, 69,390,393) #69 microwave 124 | save_HICO(HICO, HICO_dir, 70,408,414) #70 oven 125 | save_HICO(HICO, HICO_dir, 71,547,550) #71 toaster 126 | save_HICO(HICO, HICO_dir, 72,450,453) #72 sink 127 | save_HICO(HICO, HICO_dir, 73,430,434) #73 refrigerator 128 | save_HICO(HICO, HICO_dir, 74,248,252) #74 book 129 | save_HICO(HICO, HICO_dir, 75,291,295) #75 clock 130 | save_HICO(HICO, HICO_dir, 76,585,588) #76 vase 131 | save_HICO(HICO, HICO_dir, 77,446,449) #77 scissors 132 | save_HICO(HICO, HICO_dir, 78,529,533) #78 teddy_bear 133 | save_HICO(HICO, HICO_dir, 79,349,352) #79 hair_drier 134 | save_HICO(HICO, HICO_dir, 80,559,562) #80 toothbrush 135 | 136 | 137 | if __name__ == "__main__": 138 | hico_dir = "/DATA1/Benchmark/ETE_HOI/DenseNet/logs/vcoco_head_only_hicodet_bs12_d2_fixbn/results/HICO/" 139 | 
detection_path = "/DATA1/Benchmark/ETE_HOI/DenseNet/logs/vcoco_head_only_hicodet_bs12_d2_fixbn/results/test_bbox_results.pkl" 140 | Generate_HICO_detection(detection_path, hico_dir) -------------------------------------------------------------------------------- /efficientnet/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from .utils import ( 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | get_same_padding_conv2d, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | Swish, 14 | MemoryEfficientSwish, 15 | ) 16 | 17 | class MBConvBlock(nn.Module): 18 | """ 19 | Mobile Inverted Residual Bottleneck Block 20 | 21 | Args: 22 | block_args (namedtuple): BlockArgs, see above 23 | global_params (namedtuple): GlobalParam, see above 24 | 25 | Attributes: 26 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 27 | """ 28 | 29 | def __init__(self, block_args, global_params): 30 | super().__init__() 31 | self._block_args = block_args 32 | self._bn_mom = 1 - global_params.batch_norm_momentum 33 | self._bn_eps = global_params.batch_norm_epsilon 34 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 35 | self.id_skip = block_args.id_skip # skip connection and drop connect 36 | 37 | # Get static or dynamic convolution depending on image size 38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 39 | 40 | # Expansion phase 41 | inp = self._block_args.input_filters # number of input channels 42 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 43 | if self._block_args.expand_ratio != 1: 44 | self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 45 | self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 46 | 47 | # Depthwise convolution phase 48 | k = self._block_args.kernel_size 49 | s = self._block_args.stride 50 | self._depthwise_conv = Conv2d( 51 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 52 | kernel_size=k, stride=s, bias=False) 53 | self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 54 | 55 | # Squeeze and Excitation layer, if desired 56 | if self.has_se: 57 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 58 | self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 59 | self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 60 | 61 | # Output phase 62 | final_oup = self._block_args.output_filters 63 | self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 64 | self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 65 | self._swish = MemoryEfficientSwish() 66 | 67 | def forward(self, inputs, drop_connect_rate=None): 68 | """ 69 | :param inputs: input tensor 70 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 71 | :return: output of block 72 | """ 73 | 74 | # Expansion and Depthwise Convolution 75 | x = inputs 76 | if self._block_args.expand_ratio != 1: 77 | x = self._expand_conv(inputs) 78 | x = self._bn0(x) 79 | x = self._swish(x) 80 | 81 | x = self._depthwise_conv(x) 82 | x = self._bn1(x) 83 | x = self._swish(x) 84 | 85 | # Squeeze and 
Excitation 86 | if self.has_se: 87 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 88 | x_squeezed = self._se_reduce(x_squeezed) 89 | x_squeezed = self._swish(x_squeezed) 90 | x_squeezed = self._se_expand(x_squeezed) 91 | x = torch.sigmoid(x_squeezed) * x 92 | 93 | x = self._project_conv(x) 94 | x = self._bn2(x) 95 | 96 | # Skip connection and drop connect 97 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 98 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 99 | if drop_connect_rate: 100 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 101 | x = x + inputs # skip connection 102 | return x 103 | 104 | def set_swish(self, memory_efficient=True): 105 | """Sets swish function as memory efficient (for training) or standard (for export)""" 106 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 107 | 108 | 109 | class EfficientNet(nn.Module): 110 | """ 111 | An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods 112 | 113 | Args: 114 | blocks_args (list): A list of BlockArgs to construct blocks 115 | global_params (namedtuple): A set of GlobalParams shared between blocks 116 | 117 | Example: 118 | model = EfficientNet.from_pretrained('efficientnet-b0') 119 | 120 | """ 121 | 122 | def __init__(self, blocks_args=None, global_params=None): 123 | super().__init__() 124 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 125 | assert len(blocks_args) > 0, 'block args must be greater than 0' 126 | self._global_params = global_params 127 | self._blocks_args = blocks_args 128 | 129 | # Get static or dynamic convolution depending on image size 130 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 131 | 132 | # Batch norm parameters 133 | bn_mom = 1 - self._global_params.batch_norm_momentum 134 | bn_eps = self._global_params.batch_norm_epsilon 135 | 136 | # Stem 137 | in_channels = 3 # rgb 138 | out_channels = round_filters(32, self._global_params) # number of output channels 139 | self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 140 | self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 141 | 142 | # Build blocks 143 | self._blocks = nn.ModuleList([]) 144 | for block_args in self._blocks_args: 145 | 146 | # Update block input and output filters based on depth multiplier. 147 | block_args = block_args._replace( 148 | input_filters=round_filters(block_args.input_filters, self._global_params), 149 | output_filters=round_filters(block_args.output_filters, self._global_params), 150 | num_repeat=round_repeats(block_args.num_repeat, self._global_params) 151 | ) 152 | 153 | # The first block needs to take care of stride and filter size increase. 
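            # Only this first repeat uses the configured stride and filter change; the remaining
            # repeats appended below run at stride 1 with input_filters == output_filters, which is
            # the condition under which MBConvBlock applies its identity skip connection (and drop connect).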
154 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 155 | if block_args.num_repeat > 1: 156 | block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 157 | for _ in range(block_args.num_repeat - 1): 158 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 159 | 160 | # Head 161 | in_channels = block_args.output_filters # output of final block 162 | out_channels = round_filters(1280, self._global_params) 163 | self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) 164 | self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 165 | 166 | # Final linear layer 167 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 168 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 169 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 170 | self._swish = MemoryEfficientSwish() 171 | 172 | def set_swish(self, memory_efficient=True): 173 | """Sets swish function as memory efficient (for training) or standard (for export)""" 174 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 175 | for block in self._blocks: 176 | block.set_swish(memory_efficient) 177 | 178 | 179 | def extract_features(self, inputs): 180 | """ Returns output of the final convolution layer """ 181 | 182 | # Stem 183 | x = self._swish(self._bn0(self._conv_stem(inputs))) 184 | 185 | # Blocks 186 | for idx, block in enumerate(self._blocks): 187 | drop_connect_rate = self._global_params.drop_connect_rate 188 | if drop_connect_rate: 189 | drop_connect_rate *= float(idx) / len(self._blocks) 190 | x = block(x, drop_connect_rate=drop_connect_rate) 191 | # Head 192 | x = self._swish(self._bn1(self._conv_head(x))) 193 | 194 | return x 195 | 196 | def forward(self, inputs): 197 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. """ 198 | bs = inputs.size(0) 199 | # Convolution layers 200 | x = self.extract_features(inputs) 201 | 202 | # Pooling and final linear layer 203 | x = self._avg_pooling(x) 204 | x = x.view(bs, -1) 205 | x = self._dropout(x) 206 | x = self._fc(x) 207 | return x 208 | 209 | @classmethod 210 | def from_name(cls, model_name, override_params=None): 211 | cls._check_model_name_is_valid(model_name) 212 | blocks_args, global_params = get_model_params(model_name, override_params) 213 | return cls(blocks_args, global_params) 214 | 215 | @classmethod 216 | def from_pretrained(cls, model_name, load_weights=True, advprop=True, num_classes=1000, in_channels=3): 217 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 218 | if load_weights: 219 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000), advprop=advprop) 220 | if in_channels != 3: 221 | Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size) 222 | out_channels = round_filters(32, model._global_params) 223 | model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 224 | return model 225 | 226 | @classmethod 227 | def get_image_size(cls, model_name): 228 | cls._check_model_name_is_valid(model_name) 229 | _, _, res, _ = efficientnet_params(model_name) 230 | return res 231 | 232 | @classmethod 233 | def _check_model_name_is_valid(cls, model_name): 234 | """ Validates model name. 
""" 235 | valid_models = ['efficientnet-b'+str(i) for i in range(9)] 236 | if model_name not in valid_models: 237 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 238 | -------------------------------------------------------------------------------- /efficientnet/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for building the model and for loading model parameters. 3 | These helper functions are built to mirror those in the official TensorFlow implementation. 4 | """ 5 | 6 | import re 7 | import math 8 | import collections 9 | from functools import partial 10 | import torch 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils import model_zoo 14 | from .utils_extra import Conv2dStaticSamePadding 15 | 16 | ######################################################################## 17 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 18 | ######################################################################## 19 | 20 | 21 | # Parameters for the entire model (stem, all blocks, and head) 22 | 23 | GlobalParams = collections.namedtuple('GlobalParams', [ 24 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 25 | 'num_classes', 'width_coefficient', 'depth_coefficient', 26 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 27 | 28 | # Parameters for an individual model block 29 | BlockArgs = collections.namedtuple('BlockArgs', [ 30 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 31 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 32 | 33 | # Change namedtuple defaults 34 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 35 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 36 | 37 | 38 | class SwishImplementation(torch.autograd.Function): 39 | @staticmethod 40 | def forward(ctx, i): 41 | result = i * torch.sigmoid(i) 42 | ctx.save_for_backward(i) 43 | return result 44 | 45 | @staticmethod 46 | def backward(ctx, grad_output): 47 | i = ctx.saved_variables[0] 48 | sigmoid_i = torch.sigmoid(i) 49 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 50 | 51 | 52 | class MemoryEfficientSwish(nn.Module): 53 | def forward(self, x): 54 | return SwishImplementation.apply(x) 55 | 56 | 57 | class Swish(nn.Module): 58 | def forward(self, x): 59 | return x * torch.sigmoid(x) 60 | 61 | 62 | def round_filters(filters, global_params): 63 | """ Calculate and round number of filters based on depth multiplier. """ 64 | multiplier = global_params.width_coefficient 65 | if not multiplier: 66 | return filters 67 | divisor = global_params.depth_divisor 68 | min_depth = global_params.min_depth 69 | filters *= multiplier 70 | min_depth = min_depth or divisor 71 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 72 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 73 | new_filters += divisor 74 | return int(new_filters) 75 | 76 | 77 | def round_repeats(repeats, global_params): 78 | """ Round number of filters based on depth multiplier. """ 79 | multiplier = global_params.depth_coefficient 80 | if not multiplier: 81 | return repeats 82 | return int(math.ceil(multiplier * repeats)) 83 | 84 | 85 | def drop_connect(inputs, p, training): 86 | """ Drop connect. 
""" 87 | if not training: return inputs 88 | batch_size = inputs.shape[0] 89 | keep_prob = 1 - p 90 | random_tensor = keep_prob 91 | random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device) 92 | binary_tensor = torch.floor(random_tensor) 93 | output = inputs / keep_prob * binary_tensor 94 | return output 95 | 96 | 97 | def get_same_padding_conv2d(image_size=None): 98 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 99 | Static padding is necessary for ONNX exporting of models. """ 100 | if image_size is None: 101 | return Conv2dDynamicSamePadding 102 | else: 103 | return partial(Conv2dStaticSamePadding, image_size=image_size) 104 | 105 | 106 | class Conv2dDynamicSamePadding(nn.Conv2d): 107 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 108 | 109 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 110 | super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 111 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 112 | 113 | def forward(self, x): 114 | ih, iw = x.size()[-2:] 115 | kh, kw = self.weight.size()[-2:] 116 | sh, sw = self.stride 117 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 118 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 119 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 120 | if pad_h > 0 or pad_w > 0: 121 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 122 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 123 | 124 | 125 | class Identity(nn.Module): 126 | def __init__(self, ): 127 | super(Identity, self).__init__() 128 | 129 | def forward(self, input): 130 | return input 131 | 132 | 133 | ######################################################################## 134 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 135 | ######################################################################## 136 | 137 | 138 | def efficientnet_params(model_name): 139 | """ Map EfficientNet model name to parameter coefficients. """ 140 | params_dict = { 141 | # Coefficients: width,depth,res,dropout 142 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 143 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 144 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 145 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 146 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 147 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 148 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 149 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 150 | 'efficientnet-b8': (2.2, 3.6, 672, 0.5), 151 | 'efficientnet-l2': (4.3, 5.3, 800, 0.5), 152 | } 153 | return params_dict[model_name] 154 | 155 | 156 | class BlockDecoder(object): 157 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 158 | 159 | @staticmethod 160 | def _decode_block_string(block_string): 161 | """ Gets a block through a string notation of arguments. 
""" 162 | assert isinstance(block_string, str) 163 | 164 | ops = block_string.split('_') 165 | options = {} 166 | for op in ops: 167 | splits = re.split(r'(\d.*)', op) 168 | if len(splits) >= 2: 169 | key, value = splits[:2] 170 | options[key] = value 171 | 172 | # Check stride 173 | assert (('s' in options and len(options['s']) == 1) or 174 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 175 | 176 | return BlockArgs( 177 | kernel_size=int(options['k']), 178 | num_repeat=int(options['r']), 179 | input_filters=int(options['i']), 180 | output_filters=int(options['o']), 181 | expand_ratio=int(options['e']), 182 | id_skip=('noskip' not in block_string), 183 | se_ratio=float(options['se']) if 'se' in options else None, 184 | stride=[int(options['s'][0])]) 185 | 186 | @staticmethod 187 | def _encode_block_string(block): 188 | """Encodes a block to a string.""" 189 | args = [ 190 | 'r%d' % block.num_repeat, 191 | 'k%d' % block.kernel_size, 192 | 's%d%d' % (block.strides[0], block.strides[1]), 193 | 'e%s' % block.expand_ratio, 194 | 'i%d' % block.input_filters, 195 | 'o%d' % block.output_filters 196 | ] 197 | if 0 < block.se_ratio <= 1: 198 | args.append('se%s' % block.se_ratio) 199 | if block.id_skip is False: 200 | args.append('noskip') 201 | return '_'.join(args) 202 | 203 | @staticmethod 204 | def decode(string_list): 205 | """ 206 | Decodes a list of string notations to specify blocks inside the network. 207 | 208 | :param string_list: a list of strings, each string is a notation of block 209 | :return: a list of BlockArgs namedtuples of block args 210 | """ 211 | assert isinstance(string_list, list) 212 | blocks_args = [] 213 | for block_string in string_list: 214 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 215 | return blocks_args 216 | 217 | @staticmethod 218 | def encode(blocks_args): 219 | """ 220 | Encodes a list of BlockArgs to a list of strings. 221 | 222 | :param blocks_args: a list of BlockArgs namedtuples of block args 223 | :return: a list of strings, each string is a notation of block 224 | """ 225 | block_strings = [] 226 | for block in blocks_args: 227 | block_strings.append(BlockDecoder._encode_block_string(block)) 228 | return block_strings 229 | 230 | 231 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 232 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 233 | """ Creates a efficientnet model. 
""" 234 | 235 | blocks_args = [ 236 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 237 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 238 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 239 | 'r1_k3_s11_e6_i192_o320_se0.25', 240 | ] 241 | blocks_args = BlockDecoder.decode(blocks_args) 242 | 243 | global_params = GlobalParams( 244 | batch_norm_momentum=0.99, 245 | batch_norm_epsilon=1e-3, 246 | dropout_rate=dropout_rate, 247 | drop_connect_rate=drop_connect_rate, 248 | # data_format='channels_last', # removed, this is always true in PyTorch 249 | num_classes=num_classes, 250 | width_coefficient=width_coefficient, 251 | depth_coefficient=depth_coefficient, 252 | depth_divisor=8, 253 | min_depth=None, 254 | image_size=image_size, 255 | ) 256 | 257 | return blocks_args, global_params 258 | 259 | 260 | def get_model_params(model_name, override_params): 261 | """ Get the block args and global params for a given model """ 262 | if model_name.startswith('efficientnet'): 263 | w, d, s, p = efficientnet_params(model_name) 264 | # note: all models have drop connect rate = 0.2 265 | blocks_args, global_params = efficientnet( 266 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 267 | else: 268 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 269 | if override_params: 270 | # ValueError will be raised here if override_params has fields not included in global_params. 271 | global_params = global_params._replace(**override_params) 272 | return blocks_args, global_params 273 | 274 | 275 | url_map = { 276 | 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth', 277 | 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth', 278 | 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth', 279 | 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth', 280 | 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth', 281 | 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth', 282 | 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth', 283 | 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth', 284 | } 285 | 286 | url_map_advprop = { 287 | 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b0-b64d5a18.pth', 288 | 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b1-0f3ce85a.pth', 289 | 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b2-6e9d97e5.pth', 290 | 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b3-cdd7c0f4.pth', 291 | 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b4-44fb3a87.pth', 292 | 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b5-86493f6b.pth', 293 | 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b6-ac80338e.pth', 294 | 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b7-4652b6dd.pth', 295 | 'efficientnet-b8': 
'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b8-22a8fe65.pth', 296 | } 297 | 298 | 299 | def load_pretrained_weights(model, model_name, load_fc=True, advprop=False): 300 | """ Loads pretrained weights, and downloads if loading for the first time. """ 301 | # AutoAugment or Advprop (different preprocessing) 302 | url_map_ = url_map_advprop if advprop else url_map 303 | state_dict = model_zoo.load_url(url_map_[model_name], map_location=torch.device('cpu')) 304 | # state_dict = torch.load('../../weights/backbone_efficientnetb0.pth') 305 | if load_fc: 306 | ret = model.load_state_dict(state_dict, strict=False) 307 | print(ret) 308 | else: 309 | state_dict.pop('_fc.weight') 310 | state_dict.pop('_fc.bias') 311 | res = model.load_state_dict(state_dict, strict=False) 312 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 313 | print('Loaded pretrained weights for {}'.format(model_name)) 314 | -------------------------------------------------------------------------------- /efficientdet/hico_det_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import sys 6 | sys.path.append("/home/yichen/DenseNet") 7 | 8 | from torch.utils.data import Dataset, DataLoader 9 | from PIL import Image, ImageEnhance, ImageOps, ImageFile 10 | 11 | import cv2 12 | 13 | from tqdm.autonotebook import tqdm 14 | 15 | import datasets.vcoco.vsrl_utils as vu 16 | from efficientdet.vcoco_dataset import * 17 | from efficientdet.help_function import * 18 | 19 | 20 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 21 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 22 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 23 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 24 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 25 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 26 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 27 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 28 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier','toothbrush'] 29 | 30 | 31 | class HICO_DET_Dataset(Dataset): 32 | def __init__(self, root_dir, set='train', transform=None, color_prob=0): 33 | # self.root_dir = root_dir 34 | self.data_dir = root_dir 35 | self.processed_dir = os.path.join(self.data_dir, "hico_processed") 36 | self.setname = set 37 | self.transform = transform 38 | self.color_prob = color_prob 39 | 40 | self.load_object_category() 41 | self.load_verb_category() 42 | self.load_hoi_category() 43 | self.load_ann_list() 44 | self.load_ann_by_image() 45 | 46 | def load_object_category(self): 47 | self.obj_to_id = {} 48 | self.id_to_obj = {} 49 | for id, obj in enumerate(obj_list): 50 | if obj != "": 51 | self.obj_to_id[obj] = id 52 | self.id_to_obj[id] = obj 53 | assert len(self.obj_to_id) == 80 54 | assert len(self.id_to_obj) == 80 55 | 56 | def load_verb_category(self): 57 | self.id_to_verb = {} 58 | self.verb_to_id = {} 59 | verb_list_path = os.path.join(self.processed_dir, 
"verb_list.json") 60 | with open(verb_list_path, "r") as file: 61 | verb_list = json.load(file) 62 | for item in verb_list: 63 | id = int(item["id"]) 64 | name = item["name"] 65 | self.id_to_verb[id] = name 66 | self.verb_to_id[name] = id 67 | self.num_verbs = len(self.verb_to_id) 68 | 69 | def load_hoi_category(self): 70 | self.hoi_to_objid = {} 71 | self.hoi_to_verbid = {} 72 | hoi_list_path = os.path.join(self.processed_dir, "hoi_list.json") 73 | with open(hoi_list_path, "r") as file: 74 | hoi_list = json.load(file) 75 | for item in hoi_list: 76 | hoi_id = int(item["id"]) 77 | object = item["object"] 78 | object = object.replace("_", " ") 79 | verb = item["verb"] 80 | self.hoi_to_objid[hoi_id] = self.obj_to_id[object] 81 | self.hoi_to_verbid[hoi_id] = self.verb_to_id[verb] 82 | self.num_hois = len(self.hoi_to_verbid) 83 | 84 | def load_ann_list(self): 85 | ann_list_path = os.path.join(self.processed_dir, "anno_list.json") 86 | with open(ann_list_path, "r") as file: 87 | ann_list = json.load(file) 88 | split_ann_list = [] 89 | for item in ann_list: 90 | if self.setname in item["global_id"]: 91 | split_ann_list.append(item) 92 | self.split_ann_list = split_ann_list 93 | 94 | def load_ann_by_image(self): 95 | self.ann_by_image = [] 96 | self.hoi_count = np.zeros(self.num_hois).tolist() 97 | self.verb_count = np.zeros(self.num_verbs).tolist() 98 | 99 | for image_id, image_item in enumerate(self.split_ann_list): 100 | img_anns = {} 101 | 102 | image_path_postfix = image_item["image_path_postfix"] 103 | img_path = os.path.join(self.data_dir, "images", image_path_postfix) 104 | img_anns["img_path"] = img_path 105 | 106 | hois = image_item["hois"] 107 | 108 | inters = [] # (human_bbox, object_bbox, object_category, [action_category]) 109 | instances = [] # (instance_bbox, instance_category, [human_actions], [object_actions]) 110 | 111 | for idx, hoi in enumerate(hois): 112 | id_to_inter = {} # (human_id, object_id) : (human_bbox, object_bbox, object_category, [action_category]) 113 | id_to_human = {} # human_id: (instance_bbox, instance_category, [human_actions], []) 114 | id_to_object = {} # object_id: (instance_bbox, instance_category, [object_actions]) 115 | 116 | hoi_id = int(hoi["id"]) 117 | if hoi["invis"]: 118 | continue 119 | # print(len(hoi["connections"]), len(hoi["human_bboxes"]), len(hoi["object_bboxes"])) 120 | for i in range(len(hoi["connections"])): 121 | 122 | connection = hoi["connections"][i] 123 | human_bbox = hoi["human_bboxes"][connection[0]] 124 | object_bbox = hoi["object_bboxes"][connection[1]] 125 | 126 | inter_id = tuple([idx] + connection) 127 | human_id = tuple([idx] + [connection[0]]) 128 | object_id = tuple([idx] + [connection[1]]) 129 | 130 | self.hoi_count[hoi_id - 1] += 1 131 | self.verb_count[self.hoi_to_verbid[hoi_id]-1] += 1 132 | 133 | if inter_id in id_to_inter: 134 | # id_to_inter[inter_id][3].append(hoi_id) 135 | id_to_inter[inter_id][3].append(self.hoi_to_verbid[hoi_id]) 136 | 137 | else: 138 | item = [] 139 | item.append(human_bbox) 140 | item.append(object_bbox) 141 | item.append(self.hoi_to_objid[hoi_id]) 142 | item.append([self.hoi_to_verbid[hoi_id]]) 143 | # item.append([hoi_id]) 144 | id_to_inter[inter_id] = item 145 | 146 | if human_id in id_to_human: 147 | id_to_human[human_id][2].append(self.hoi_to_verbid[hoi_id]) 148 | else: 149 | id_to_human[human_id] = [human_bbox, 0, [self.hoi_to_verbid[hoi_id]], []] 150 | 151 | if object_id in id_to_object: 152 | id_to_object[object_id][3].append(self.hoi_to_verbid[hoi_id]) 153 | else: 154 | 
id_to_object[object_id] = [object_bbox, self.hoi_to_objid[hoi_id], [], [self.hoi_to_verbid[hoi_id]]] 155 | 156 | inters += list(id_to_inter.values()) 157 | instances = instances + list(id_to_human.values()) + list(id_to_object.values()) 158 | 159 | unique_instances = [] 160 | for inst in instances: 161 | m = 0.7 162 | minst = None 163 | for uinst in unique_instances: 164 | if inst[1] == uinst[1] and single_iou(inst[0], uinst[0]) > m: 165 | minst = uinst 166 | m = single_iou(inst[0], uinst[0]) 167 | if minst is None: 168 | unique_instances.append(inst) 169 | else: 170 | minst[2] += inst[2] 171 | minst[3] += inst[3] 172 | 173 | unique_inters = [] 174 | for inter in inters: 175 | m = 0.7 ** 2 176 | minter = None 177 | for uinter in unique_inters: 178 | hiou = single_iou(inter[0], uinter[0]) 179 | oiou = single_iou(inter[1], uinter[1]) 180 | if inter[2] == uinter[2] and hiou > 0.7 and oiou > 0.7 and hiou*oiou > m: 181 | minter = uinter 182 | m = hiou * oiou 183 | if minter is None: 184 | unique_inters.append(inter) 185 | else: 186 | minter[3] += inter[3] 187 | 188 | 189 | # human_instances = list(id_to_human.values()) 190 | # obj_instances = [] 191 | # for id, obj in id_to_object.items(): 192 | # if obj[1] == 0: # human, judge overlap with human instance 193 | # flag = False 194 | # for hinst in human_instances: 195 | # if single_iou(hinst[0], obj[0]) > 0.75: 196 | # hinst[3].extend(obj[3]) 197 | # flag = True 198 | # break 199 | # if not flag: 200 | # obj_instances.append(obj) 201 | # instances = human_instances + obj_instances 202 | 203 | # if len(unique_instances) > 0: 204 | # img_anns["interaction"] = unique_inters 205 | # img_anns["instance"] = unique_instances 206 | # self.ann_by_image.append(img_anns) 207 | # else: 208 | # no_inst += 1 209 | # print("%d images has no instances"%no_inst) 210 | img_anns["interaction"] = unique_inters 211 | img_anns["instance"] = unique_instances 212 | self.ann_by_image.append(img_anns) 213 | self.num_images = len(self.ann_by_image) 214 | # with open("hico-det_hoi_count.json", "w") as file: 215 | # json.dump(self.hoi_count, file) 216 | # with open("hico-det_verb_count.json", "w") as file: 217 | # json.dump(self.verb_count, file) 218 | 219 | def __len__(self): 220 | return self.num_images 221 | 222 | def __getitem__(self, index): 223 | img_item = self.ann_by_image[index] 224 | img = self.load_img(img_item["img_path"]) 225 | 226 | annot_bbox = {"instance": [], "interaction": []} 227 | for i, ann in enumerate(img_item["instance"]): 228 | tmp = np.zeros(4 + 1 + self.num_verbs * 2) # (bbox, obj_cat, human action, object action) 229 | tmp[0:4] = ann[0] # bbox 230 | tmp[4] = ann[1] # object category 231 | human_act = np.zeros(self.num_verbs) # human action 232 | obj_act = np.zeros(self.num_verbs) # object action 233 | 234 | h_acts = np.array(ann[2]) - 1 235 | o_acts = np.array(ann[3]) - 1 236 | 237 | if h_acts.shape[0] > 0: 238 | human_act[h_acts] = 1 239 | if o_acts.shape[0] > 0: 240 | obj_act[o_acts] = 1 241 | 242 | tmp[5:5+self.num_verbs] = human_act 243 | tmp[5+self.num_verbs:5+2*self.num_verbs] = obj_act 244 | annot_bbox["instance"].append(tmp) 245 | 246 | for i, ann in enumerate(img_item["interaction"]): 247 | # tmp = np.zeros(12 + 1 + self.num_hois) # (human bbox, object bbox, union bbox, obj category, union action) 248 | tmp = np.zeros(12 + 1 + self.num_verbs) # (human bbox, object bbox, union bbox, obj category, union action) 249 | tmp[0:4] = ann[0] 250 | tmp[4:8] = ann[1] 251 | tmp[8:12] = self.merge_bbox(ann[0], ann[1]) 252 | tmp[12] = ann[2] 253 | 254 
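# Each interaction row packs: [0:4] human bbox, [4:8] object bbox, [8:12] their union
# (merge_bbox below), [12] the object category id, and [13:13+num_verbs] a multi-hot
# verb vector, which the next few lines fill from the 1-based verb ids.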
| union_acts = np.zeros(self.num_verbs) 255 | 256 | u_acts = np.array(ann[3]) - 1 257 | union_acts[u_acts] = 1 258 | tmp[13:] = union_acts 259 | annot_bbox["interaction"].append(tmp) 260 | 261 | for key in annot_bbox: 262 | annot_bbox[key] = np.array(annot_bbox[key]) 263 | 264 | sample = {'img': img, 'annot': annot_bbox} 265 | if self.transform: 266 | sample = self.transform(sample) 267 | return sample 268 | 269 | def merge_bbox(self, b1, b2): 270 | if b1[0] < 0: 271 | return b2 272 | if b2[0] < 0: 273 | return b1 274 | return [min(b1[0], b2[0]), min(b1[1], b2[1]), 275 | max(b1[2], b2[2]), max(b1[3], b2[3])] 276 | 277 | def load_img(self, img_path): 278 | img = cv2.imread(img_path) 279 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 280 | 281 | if np.random.uniform(0, 1) < self.color_prob: 282 | pil_img = Image.fromarray(img) 283 | img = np.array(randomColor(pil_img)) 284 | return img.astype(np.float32) / 255. 285 | 286 | 287 | if __name__=="__main__": 288 | from torch.utils.data import DataLoader 289 | from torchvision import transforms 290 | training_set = HICO_DET_Dataset(root_dir="/home/yichen/DenseNet/datasets", set="train", 291 | transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 292 | 293 | training_params = {'batch_size': 4, 294 | 'shuffle': False, 295 | 'drop_last': True, 296 | 'collate_fn': collater, 297 | 'num_workers': 0} 298 | training_generator = DataLoader(training_set, **training_params) 299 | 300 | 301 | # print("len:", len(training_generator)) 302 | np.set_printoptions(precision=3, suppress=True, threshold=np.inf) 303 | 304 | for epoch in range(100): 305 | print("epoch:", epoch) 306 | progress_bar = tqdm(training_generator) 307 | for i, data in enumerate(training_generator): 308 | # if iter < step - last_epoch * num_iter_per_epoch: 309 | # progress_bar.update() 310 | # continue 311 | imgs = data['img'] 312 | annot = data['annot'] 313 | 314 | # for key in annot: 315 | # print(key, annot[key].numpy()) 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | 4 | import argparse 5 | import torch 6 | import yaml 7 | import pickle 8 | import numpy as np 9 | 10 | from backbone import EfficientDetBackbone 11 | from efficientdet.utils import BBoxTransform, ClipBoxes 12 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union, transform_action, label_to_class, sub_label_to_class 13 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 14 | from utils.apply_prior import apply_prior 15 | from utils.visual import visual_demo 16 | 17 | 18 | ap = argparse.ArgumentParser() 19 | ap.add_argument('-p', '--project', type=str, default='vcoco', help='project file that contains parameters') 20 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 21 | ap.add_argument('-w', '--weights', type=str, default='weights/vcoco_best.pth', help='/path/to/weights') 22 | ap.add_argument('--image_path', type=str, default='test/test.jpg', help='/path/to/image') 23 | ap.add_argument('--save_path', type=str, default='test/detection.jpg', help='/path/to/detection/result') 24 | ap.add_argument('--cuda', type=int, default=1) 25 | ap.add_argument('--device', type=int, default=0) 26 | ap.add_argument('--float16', type=int, default=0) 27 | 
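# When --flip_test is non-zero, img_detect below also feeds a horizontally flipped copy
# of the image through the network and merges both passes via the *_flip post-processing
# helpers (postprocess_hoi_flip / postprocess_dense_union_flip).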
ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 28 | 29 | args = ap.parse_args() 30 | 31 | image_path = args.image_path 32 | save_path = args.save_path 33 | compound_coef = args.compound_coef 34 | nms_threshold = 0.3 35 | use_cuda = args.cuda 36 | gpu = args.device 37 | use_float16 = args.float16 38 | need_visual = True 39 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 40 | project = args.project 41 | 42 | params = yaml.safe_load(open(f'projects/{project}.yml')) 43 | SET_NAME = params['val_set'] 44 | project_name = params["project_name"] 45 | 46 | 47 | print(f'running demonstration with weights {weights_path} on image {image_path}...') 48 | 49 | params = yaml.safe_load(open(f'projects/{project}.yml')) 50 | obj_list = params['obj_list'] 51 | union_action_list = eval(params['union_action_list']) 52 | 53 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 54 | input_size = input_sizes[compound_coef] 55 | output_dir = f"./logs/{project_name}/results" 56 | 57 | 58 | def calc_ioa(a, b): 59 | # a(anchor) [boxes, (x1, y1, x2, y2)] 60 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 61 | 62 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 63 | 64 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 65 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 66 | exp_y1 = np.expand_dims(a[:, 1], 1) 67 | exp_y2 = np.expand_dims(a[:, 3], 1) 68 | 69 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 70 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 71 | iw = np.where(iw > 0, iw, 0) 72 | ih = np.where(ih > 0, ih, 0) 73 | 74 | intersection = iw * ih 75 | area = np.where(area > 1e-6, area, 1e-6) 76 | IoA = intersection / area 77 | return IoA 78 | 79 | 80 | def calc_iou(a, b): 81 | # a(anchor) [boxes, (x1, y1, x2, y2)] 82 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 83 | 84 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 85 | 86 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 87 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 88 | exp_y1 = np.expand_dims(a[:, 1], 1) 89 | exp_y2 = np.expand_dims(a[:, 3], 1) 90 | 91 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 92 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 93 | 94 | iw = np.where(iw > 0, iw, 0) 95 | ih = np.where(ih > 0, ih, 0) 96 | 97 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 98 | ua = np.where(ua > 0, ua, 1e-8) 99 | 100 | intersection = iw * ih 101 | IoU = intersection / ua 102 | return IoU 103 | 104 | 105 | def xy_to_wh(bbox): 106 | ctr_x = (bbox[0] + bbox[2]) / 2 107 | ctr_y = (bbox[1] + bbox[3]) / 2 108 | width = bbox[2] - bbox[0] 109 | height = bbox[3] - bbox[1] 110 | return ctr_x, ctr_y, width, height 111 | 112 | 113 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 114 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 115 | xt, yt, wt, ht = xy_to_wh(target_bbox) 116 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 117 | dist = np.zeros(2, dtype=np.float) 118 | dist[0] = (xo - xt) / wa 119 | dist[1] = (yo - yt) / ha 120 | 121 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 122 | 123 | 124 | def target_object_dist(target_objects_pos, objects_pos, anchors): 125 | width = anchors[:, 2] - anchors[:, 0] 126 | height = anchors[:, 3] - anchors[:, 1] 127 | anchors_size = np.stack([width, height], axis=1) 
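# target_object_dist: squared distance between the object centre each union box predicts
# (in hoi_match this is the human centre shifted by the predicted sp_vector) and every
# detected instance centre, normalised by the union box size so the score is scale
# invariant. Shape sketch (illustrative only):
#   target_objects_pos (num_union, 2) -> (num_union, 1, 2) after the expand_dims below
#   objects_pos        (num_inst, 2)
#   anchors_size       (num_union, 1, 2)
#   dist               (num_union, num_inst)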
128 | anchors_size = np.expand_dims(anchors_size, axis=1) 129 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 130 | diff = target_objects_pos - objects_pos 131 | diff = diff / anchors_size 132 | dist = np.sum(diff**2, axis=2) 133 | return dist 134 | 135 | 136 | def hoi_match(preds_inst, preds_union, prior_mask, thre=0.05, human_thre=0.6, anchor_thre=0.25, loc_thre=0.1): 137 | num_inst = len(preds_inst["rois"]) 138 | humans = [] 139 | objects = [] 140 | human_bboxes = [] 141 | human_inst_ids = [] 142 | human_role_scores = [] 143 | 144 | while len(humans)==0: 145 | if human_thre < 0.5: 146 | break 147 | for inst_id in range(num_inst): 148 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 149 | continue 150 | item = {} 151 | item["bbox"] = preds_inst["rois"][inst_id] 152 | item["agent_scores"] = preds_inst["act_scores"][inst_id] 153 | item["role_scores"] = transform_action(preds_inst["act_scores"][inst_id], "subject") 154 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 155 | item["inst_id"] = inst_id 156 | humans.append(item) 157 | human_bboxes.append(item["bbox"]) 158 | human_inst_ids.append(item["inst_id"]) 159 | human_role_scores.append(item["role_scores"]) 160 | human_thre -= 0.1 161 | human_bboxes = np.array(human_bboxes) 162 | human_inst_ids = np.array(human_inst_ids) 163 | human_role_scores = np.array(human_role_scores) 164 | 165 | obj_role_scores = [] 166 | for obj_id in range(len(preds_inst["rois"])): 167 | item = {} 168 | obj_role_score = transform_action(preds_inst["act_scores"][obj_id], "object") 169 | obj_role_score = apply_prior(obj_role_score, preds_inst["obj_class_ids"][obj_id]) 170 | item["obj_role_scores"] = obj_role_score 171 | 172 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 173 | 174 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 175 | item["inst_id"] = obj_id 176 | obj_bbox = preds_inst["rois"][obj_id] 177 | item["bbox"] = obj_bbox 178 | objects.append(item) 179 | obj_role_scores.append(obj_role_score) 180 | object_bboxes = np.array(preds_inst["rois"]) 181 | obj_role_scores = np.array(obj_role_scores) 182 | 183 | hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), len(union_action_list)), dtype=np.float) 184 | 185 | if len(human_bboxes) > 0: 186 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 187 | 188 | IoA_max = np.max(IoA, axis=1) 189 | human_foreground = IoA_max > 0.25 190 | human_IoA = IoA[human_foreground] 191 | for key in preds_union: 192 | preds_union[key] = preds_union[key][human_foreground] 193 | 194 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 195 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 196 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 197 | new_IoA_sec_max = np.max(new_IoA, axis=1) 198 | obj_foreground = new_IoA_sec_max > 0.25 199 | for key in preds_union: 200 | preds_union[key] = preds_union[key][obj_foreground] 201 | 202 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 203 | human_IoA = human_IoA[obj_foreground] 204 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.25), axis=1) 205 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 206 | 207 | num_union = len(preds_union["rois"]) 208 | num_human = len(human_bboxes) 209 | 210 | sp_vectors = preds_union["sp_vector"] 211 | inter_human_regions = human_bboxes[human_IoU_argmax] 212 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 213 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 
3]) / 2 214 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 215 | inter_objects_pos = humans_pos + sp_vectors 216 | 217 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 218 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 219 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 220 | 221 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 222 | inter_human_instids = human_inst_ids[human_IoU_argmax] 223 | obj_dists[np.arange(num_union), inter_human_instids] = 100 224 | obj_dists[obj_IoA < 0.25] = 100 225 | inter_obj_ids = np.argmin(obj_dists, 1) 226 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 227 | 228 | sigma = 0.9 229 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 230 | location_scores = np.where(location_scores thre or prior_mask[id, 0] < 0.1: 275 | det[action + "_" + role] = (obj_bbox[0], obj_bbox[1], obj_bbox[2], obj_bbox[3], max_score[id]) 276 | agent_score = max_score[id] 277 | else: 278 | if human["role_scores"][id] > 0.0 and prior_mask[id, 0] > 0.1: 279 | det[action + "_" + role] = (0, 0, 0, 0, human["role_scores"][id] * human["obj_scores"] * prior_mask[id, 0]) 280 | agent_score = human["role_scores"][id] * human["obj_scores"] 281 | 282 | else: 283 | det[action + "_" + role] = (0, 0, 0, 0, 0) 284 | agent_score = 0 285 | 286 | if action + "_agent" not in det: 287 | det[action + "_agent"] = agent_score 288 | else: 289 | det[action + "_agent"] = max(agent_score, det[action + "_agent"]) 290 | for i in range(len(sub_label_to_class)): 291 | action = sub_label_to_class[i] 292 | if action + "_agent" not in det: 293 | det[action+"_agent"] = human["agent_scores"][i] 294 | dets.append(det) 295 | 296 | return dets 297 | 298 | 299 | def img_detect(file, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold): 300 | img_path = file 301 | 302 | ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size) 303 | if use_cuda: 304 | x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0) 305 | else: 306 | x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0) 307 | 308 | x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2) 309 | 310 | if args.flip_test: 311 | ids = torch.arange(x.shape[-1]-1, -1, -1).long().cuda() 312 | x_flip = x[..., ids] 313 | x_cat = torch.cat([x, x_flip], 0) 314 | 315 | with torch.no_grad(): 316 | if args.flip_test: 317 | 318 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 319 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x_cat) 320 | 321 | anchors = torch.cat([anchors, anchors], 0) 322 | preds_union = postprocess_dense_union_flip(x_cat, anchors, union_act_cls, union_sub_reg, union_obj_reg, 323 | regressBoxes, clipBoxes, 0.5, 1) 324 | preds_inst = postprocess_hoi_flip(x_cat, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 325 | regressBoxes, clipBoxes, threshold, nms_threshold, 326 | mode="object", classwise=True) 327 | else: 328 | 329 | 330 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 331 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x) 332 | 333 | preds_union = postprocess_dense_union(x, anchors, union_act_cls, union_sub_reg, union_obj_reg, 334 | regressBoxes, clipBoxes, 0.5, 1, classwise=True) 335 | preds_inst = postprocess_hoi(x, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 336 | regressBoxes, clipBoxes, threshold, nms_threshold, 337 | mode="object", classwise=True) 338 | 339 | preds_inst = 
invert_affine(framed_metas, preds_inst)[0] 340 | preds_union = invert_affine(framed_metas, preds_union)[0] 341 | 342 | dets = hoi_match(preds_inst, preds_union, prior_mask) 343 | 344 | return dets 345 | 346 | 347 | def test(threshold=0.2): 348 | with open("datasets/vcoco/new_prior_mask.pkl", "rb") as file: 349 | prior_mask = pickle.load(file, encoding="bytes") 350 | 351 | model = EfficientDetBackbone(num_classes=len(eval(params["obj_list"])), num_union_classes=25, 352 | num_inst_classes=51, compound_coef=args.compound_coef, 353 | ratios=eval(params["anchors_ratios"]), scales=eval(params["anchors_scales"])) 354 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 355 | model.requires_grad_(False) 356 | model.eval() 357 | 358 | if args.cuda: 359 | model = model.cuda() 360 | if args.float16: 361 | model = model.half() 362 | 363 | regressBoxes = BBoxTransform() 364 | clipBoxes = ClipBoxes() 365 | 366 | img_detection = img_detect(image_path, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold=threshold) 367 | visual_demo(img_detection, image_path, save_path) 368 | 369 | 370 | if __name__ == '__main__': 371 | test() 372 | 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /utils/sync_batchnorm/batchnorm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : batchnorm.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import collections 12 | import contextlib 13 | 14 | import torch 15 | import torch.nn.functional as F 16 | 17 | from torch.nn.modules.batchnorm import _BatchNorm 18 | 19 | try: 20 | from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast 21 | except ImportError: 22 | ReduceAddCoalesced = Broadcast = None 23 | 24 | try: 25 | from jactorch.parallel.comm import SyncMaster 26 | from jactorch.parallel.data_parallel import JacDataParallel as DataParallelWithCallback 27 | except ImportError: 28 | from .comm import SyncMaster 29 | from .replicate import DataParallelWithCallback 30 | 31 | __all__ = [ 32 | 'SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d', 33 | 'patch_sync_batchnorm', 'convert_model' 34 | ] 35 | 36 | 37 | def _sum_ft(tensor): 38 | """sum over the first and last dimention""" 39 | return tensor.sum(dim=0).sum(dim=-1) 40 | 41 | 42 | def _unsqueeze_ft(tensor): 43 | """add new dimensions at the front and the tail""" 44 | return tensor.unsqueeze(0).unsqueeze(-1) 45 | 46 | 47 | _ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size']) 48 | _MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std']) 49 | 50 | 51 | class _SynchronizedBatchNorm(_BatchNorm): 52 | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True): 53 | assert ReduceAddCoalesced is not None, 'Can not use Synchronized Batch Normalization without CUDA support.' 
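# ReduceAddCoalesced / Broadcast come from torch.nn.parallel._functions (see the guarded
# import at the top of this file); the cross-GPU reduction in _data_parallel_master cannot
# run without them, hence the hard assert above.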
54 | 55 | super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine) 56 | 57 | self._sync_master = SyncMaster(self._data_parallel_master) 58 | 59 | self._is_parallel = False 60 | self._parallel_id = None 61 | self._slave_pipe = None 62 | 63 | def forward(self, input): 64 | # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation. 65 | if not (self._is_parallel and self.training): 66 | return F.batch_norm( 67 | input, self.running_mean, self.running_var, self.weight, self.bias, 68 | self.training, self.momentum, self.eps) 69 | 70 | # Resize the input to (B, C, -1). 71 | input_shape = input.size() 72 | input = input.view(input.size(0), self.num_features, -1) 73 | 74 | # Compute the sum and square-sum. 75 | sum_size = input.size(0) * input.size(2) 76 | input_sum = _sum_ft(input) 77 | input_ssum = _sum_ft(input ** 2) 78 | 79 | # Reduce-and-broadcast the statistics. 80 | if self._parallel_id == 0: 81 | mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size)) 82 | else: 83 | mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size)) 84 | 85 | # Compute the output. 86 | if self.affine: 87 | # MJY:: Fuse the multiplication for speed. 88 | output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias) 89 | else: 90 | output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std) 91 | 92 | # Reshape it. 93 | return output.view(input_shape) 94 | 95 | def __data_parallel_replicate__(self, ctx, copy_id): 96 | self._is_parallel = True 97 | self._parallel_id = copy_id 98 | 99 | # parallel_id == 0 means master device. 100 | if self._parallel_id == 0: 101 | ctx.sync_master = self._sync_master 102 | else: 103 | self._slave_pipe = ctx.sync_master.register_slave(copy_id) 104 | 105 | def _data_parallel_master(self, intermediates): 106 | """Reduce the sum and square-sum, compute the statistics, and broadcast it.""" 107 | 108 | # Always using same "device order" makes the ReduceAdd operation faster. 109 | # Thanks to:: Tete Xiao (http://tetexiao.com/) 110 | intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device()) 111 | 112 | to_reduce = [i[1][:2] for i in intermediates] 113 | to_reduce = [j for i in to_reduce for j in i] # flatten 114 | target_gpus = [i[1].sum.get_device() for i in intermediates] 115 | 116 | sum_size = sum([i[1].sum_size for i in intermediates]) 117 | sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce) 118 | mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size) 119 | 120 | broadcasted = Broadcast.apply(target_gpus, mean, inv_std) 121 | 122 | outputs = [] 123 | for i, rec in enumerate(intermediates): 124 | outputs.append((rec[0], _MasterMessage(*broadcasted[i*2:i*2+2]))) 125 | 126 | return outputs 127 | 128 | def _compute_mean_std(self, sum_, ssum, size): 129 | """Compute the mean and standard-deviation with sum and square-sum. This method 130 | also maintains the moving average on the master device.""" 131 | assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.' 
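# Worked example (hand computation, not from any dataset): for per-channel values
# [1, 2, 3] -> sum_ = 6, ssum = 14, size = 3, so mean = 2, sumvar = 14 - 6*2 = 2,
# unbias_var = 2 / (3 - 1) = 1 (used to update running_var) and bias_var = 2 / 3
# (used for normalisation); the method returns mean and bias_var.clamp(eps) ** -0.5,
# i.e. 1 / sqrt(2/3) ~= 1.22.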
132 | mean = sum_ / size 133 | sumvar = ssum - sum_ * mean 134 | unbias_var = sumvar / (size - 1) 135 | bias_var = sumvar / size 136 | 137 | if hasattr(torch, 'no_grad'): 138 | with torch.no_grad(): 139 | self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data 140 | self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data 141 | else: 142 | self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data 143 | self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data 144 | 145 | return mean, bias_var.clamp(self.eps) ** -0.5 146 | 147 | 148 | class SynchronizedBatchNorm1d(_SynchronizedBatchNorm): 149 | r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a 150 | mini-batch. 151 | 152 | .. math:: 153 | 154 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 155 | 156 | This module differs from the built-in PyTorch BatchNorm1d as the mean and 157 | standard-deviation are reduced across all devices during training. 158 | 159 | For example, when one uses `nn.DataParallel` to wrap the network during 160 | training, PyTorch's implementation normalize the tensor on each device using 161 | the statistics only on that device, which accelerated the computation and 162 | is also easy to implement, but the statistics might be inaccurate. 163 | Instead, in this synchronized version, the statistics will be computed 164 | over all training samples distributed on multiple devices. 165 | 166 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 167 | as the built-in PyTorch implementation. 168 | 169 | The mean and standard-deviation are calculated per-dimension over 170 | the mini-batches and gamma and beta are learnable parameter vectors 171 | of size C (where C is the input size). 172 | 173 | During training, this layer keeps a running estimate of its computed mean 174 | and variance. The running sum is kept with a default momentum of 0.1. 175 | 176 | During evaluation, this running mean/variance is used for normalization. 177 | 178 | Because the BatchNorm is done over the `C` dimension, computing statistics 179 | on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm 180 | 181 | Args: 182 | num_features: num_features from an expected input of size 183 | `batch_size x num_features [x width]` 184 | eps: a value added to the denominator for numerical stability. 185 | Default: 1e-5 186 | momentum: the value used for the running_mean and running_var 187 | computation. Default: 0.1 188 | affine: a boolean value that when set to ``True``, gives the layer learnable 189 | affine parameters. 
Default: ``True`` 190 | 191 | Shape:: 192 | - Input: :math:`(N, C)` or :math:`(N, C, L)` 193 | - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input) 194 | 195 | Examples: 196 | >>> # With Learnable Parameters 197 | >>> m = SynchronizedBatchNorm1d(100) 198 | >>> # Without Learnable Parameters 199 | >>> m = SynchronizedBatchNorm1d(100, affine=False) 200 | >>> input = torch.autograd.Variable(torch.randn(20, 100)) 201 | >>> output = m(input) 202 | """ 203 | 204 | def _check_input_dim(self, input): 205 | if input.dim() != 2 and input.dim() != 3: 206 | raise ValueError('expected 2D or 3D input (got {}D input)' 207 | .format(input.dim())) 208 | super(SynchronizedBatchNorm1d, self)._check_input_dim(input) 209 | 210 | 211 | class SynchronizedBatchNorm2d(_SynchronizedBatchNorm): 212 | r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch 213 | of 3d inputs 214 | 215 | .. math:: 216 | 217 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 218 | 219 | This module differs from the built-in PyTorch BatchNorm2d as the mean and 220 | standard-deviation are reduced across all devices during training. 221 | 222 | For example, when one uses `nn.DataParallel` to wrap the network during 223 | training, PyTorch's implementation normalize the tensor on each device using 224 | the statistics only on that device, which accelerated the computation and 225 | is also easy to implement, but the statistics might be inaccurate. 226 | Instead, in this synchronized version, the statistics will be computed 227 | over all training samples distributed on multiple devices. 228 | 229 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 230 | as the built-in PyTorch implementation. 231 | 232 | The mean and standard-deviation are calculated per-dimension over 233 | the mini-batches and gamma and beta are learnable parameter vectors 234 | of size C (where C is the input size). 235 | 236 | During training, this layer keeps a running estimate of its computed mean 237 | and variance. The running sum is kept with a default momentum of 0.1. 238 | 239 | During evaluation, this running mean/variance is used for normalization. 240 | 241 | Because the BatchNorm is done over the `C` dimension, computing statistics 242 | on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm 243 | 244 | Args: 245 | num_features: num_features from an expected input of 246 | size batch_size x num_features x height x width 247 | eps: a value added to the denominator for numerical stability. 248 | Default: 1e-5 249 | momentum: the value used for the running_mean and running_var 250 | computation. Default: 0.1 251 | affine: a boolean value that when set to ``True``, gives the layer learnable 252 | affine parameters. 
Default: ``True`` 253 | 254 | Shape:: 255 | - Input: :math:`(N, C, H, W)` 256 | - Output: :math:`(N, C, H, W)` (same shape as input) 257 | 258 | Examples: 259 | >>> # With Learnable Parameters 260 | >>> m = SynchronizedBatchNorm2d(100) 261 | >>> # Without Learnable Parameters 262 | >>> m = SynchronizedBatchNorm2d(100, affine=False) 263 | >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45)) 264 | >>> output = m(input) 265 | """ 266 | 267 | def _check_input_dim(self, input): 268 | if input.dim() != 4: 269 | raise ValueError('expected 4D input (got {}D input)' 270 | .format(input.dim())) 271 | super(SynchronizedBatchNorm2d, self)._check_input_dim(input) 272 | 273 | 274 | class SynchronizedBatchNorm3d(_SynchronizedBatchNorm): 275 | r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch 276 | of 4d inputs 277 | 278 | .. math:: 279 | 280 | y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta 281 | 282 | This module differs from the built-in PyTorch BatchNorm3d as the mean and 283 | standard-deviation are reduced across all devices during training. 284 | 285 | For example, when one uses `nn.DataParallel` to wrap the network during 286 | training, PyTorch's implementation normalize the tensor on each device using 287 | the statistics only on that device, which accelerated the computation and 288 | is also easy to implement, but the statistics might be inaccurate. 289 | Instead, in this synchronized version, the statistics will be computed 290 | over all training samples distributed on multiple devices. 291 | 292 | Note that, for one-GPU or CPU-only case, this module behaves exactly same 293 | as the built-in PyTorch implementation. 294 | 295 | The mean and standard-deviation are calculated per-dimension over 296 | the mini-batches and gamma and beta are learnable parameter vectors 297 | of size C (where C is the input size). 298 | 299 | During training, this layer keeps a running estimate of its computed mean 300 | and variance. The running sum is kept with a default momentum of 0.1. 301 | 302 | During evaluation, this running mean/variance is used for normalization. 303 | 304 | Because the BatchNorm is done over the `C` dimension, computing statistics 305 | on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm 306 | or Spatio-temporal BatchNorm 307 | 308 | Args: 309 | num_features: num_features from an expected input of 310 | size batch_size x num_features x depth x height x width 311 | eps: a value added to the denominator for numerical stability. 312 | Default: 1e-5 313 | momentum: the value used for the running_mean and running_var 314 | computation. Default: 0.1 315 | affine: a boolean value that when set to ``True``, gives the layer learnable 316 | affine parameters. 
Default: ``True`` 317 | 318 | Shape:: 319 | - Input: :math:`(N, C, D, H, W)` 320 | - Output: :math:`(N, C, D, H, W)` (same shape as input) 321 | 322 | Examples: 323 | >>> # With Learnable Parameters 324 | >>> m = SynchronizedBatchNorm3d(100) 325 | >>> # Without Learnable Parameters 326 | >>> m = SynchronizedBatchNorm3d(100, affine=False) 327 | >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10)) 328 | >>> output = m(input) 329 | """ 330 | 331 | def _check_input_dim(self, input): 332 | if input.dim() != 5: 333 | raise ValueError('expected 5D input (got {}D input)' 334 | .format(input.dim())) 335 | super(SynchronizedBatchNorm3d, self)._check_input_dim(input) 336 | 337 | 338 | @contextlib.contextmanager 339 | def patch_sync_batchnorm(): 340 | import torch.nn as nn 341 | 342 | backup = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d 343 | 344 | nn.BatchNorm1d = SynchronizedBatchNorm1d 345 | nn.BatchNorm2d = SynchronizedBatchNorm2d 346 | nn.BatchNorm3d = SynchronizedBatchNorm3d 347 | 348 | yield 349 | 350 | nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d = backup 351 | 352 | 353 | def convert_model(module): 354 | """Traverse the input module and its child recursively 355 | and replace all instance of torch.nn.modules.batchnorm.BatchNorm*N*d 356 | to SynchronizedBatchNorm*N*d 357 | 358 | Args: 359 | module: the input module needs to be convert to SyncBN model 360 | 361 | Examples: 362 | >>> import torch.nn as nn 363 | >>> import torchvision 364 | >>> # m is a standard pytorch model 365 | >>> m = torchvision.models.resnet18(True) 366 | >>> m = nn.DataParallel(m) 367 | >>> # after convert, m is using SyncBN 368 | >>> m = convert_model(m) 369 | """ 370 | if isinstance(module, torch.nn.DataParallel): 371 | mod = module.module 372 | mod = convert_model(mod) 373 | mod = DataParallelWithCallback(mod, device_ids=module.device_ids) 374 | return mod 375 | 376 | mod = module 377 | for pth_module, sync_module in zip([torch.nn.modules.batchnorm.BatchNorm1d, 378 | torch.nn.modules.batchnorm.BatchNorm2d, 379 | torch.nn.modules.batchnorm.BatchNorm3d], 380 | [SynchronizedBatchNorm1d, 381 | SynchronizedBatchNorm2d, 382 | SynchronizedBatchNorm3d]): 383 | if isinstance(module, pth_module): 384 | mod = sync_module(module.num_features, module.eps, module.momentum, module.affine) 385 | mod.running_mean = module.running_mean 386 | mod.running_var = module.running_var 387 | if module.affine: 388 | mod.weight.data = module.weight.data.clone().detach() 389 | mod.bias.data = module.bias.data.clone().detach() 390 | 391 | for name, child in module.named_children(): 392 | mod.add_module(name, convert_model(child)) 393 | 394 | return mod 395 | -------------------------------------------------------------------------------- /test_vcoco.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import cv2 4 | import time 5 | import threading 6 | 7 | import argparse 8 | import torch 9 | import yaml 10 | import pickle 11 | import numpy as np 12 | 13 | from utils.vsrl_eval import VCOCOeval 14 | from backbone import EfficientDetBackbone 15 | from efficientdet.utils import BBoxTransform, ClipBoxes 16 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union, transform_action, label_to_class, sub_label_to_class 17 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 18 | from utils.apply_prior import apply_prior 19 | from 
utils.timer import Timer 20 | from utils.visual import visual 21 | 22 | 23 | ap = argparse.ArgumentParser() 24 | ap.add_argument('-p', '--project', type=str, default='vcoco', help='project file that contains parameters') 25 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 26 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 27 | ap.add_argument('--nms_threshold', type=float, default=0.3, help='nms threshold, don\'t change it if not for testing purposes') 28 | ap.add_argument('--cuda', type=int, default=1) 29 | ap.add_argument('--device', type=int, default=0) 30 | ap.add_argument('--float16', type=int, default=0) 31 | ap.add_argument('--override', type=int, default=0, help='override previous bbox results file if exists') 32 | ap.add_argument('--data_dir', type=str, default='./datasets', help='the root folder of dataset') 33 | ap.add_argument('--need_visual', type=int, default=0, help='whether need to visualize the results') 34 | ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 35 | 36 | args = ap.parse_args() 37 | 38 | compound_coef = args.compound_coef 39 | nms_threshold = args.nms_threshold 40 | use_cuda = args.cuda 41 | gpu = args.device 42 | use_float16 = args.float16 43 | override_prev_results = args.override 44 | need_visual = args.need_visual 45 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else args.weights 46 | data_dir = args.data_dir 47 | project = args.project 48 | 49 | params = yaml.safe_load(open(f'projects/{project}.yml')) 50 | SET_NAME = params['val_set'] 51 | project_name = params["project_name"] 52 | 53 | 54 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 55 | 56 | params = yaml.safe_load(open(f'projects/{project}.yml')) 57 | obj_list = params['obj_list'] 58 | union_action_list = eval(params['union_action_list']) 59 | 60 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 61 | input_size = input_sizes[compound_coef] 62 | output_dir = f"./logs/{project_name}/results" 63 | 64 | if not os.path.exists(output_dir): 65 | os.mkdir(output_dir) 66 | 67 | if args.flip_test: 68 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_flip_final.pkl') 69 | else: 70 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_final.pkl') 71 | 72 | def calc_ioa(a, b): 73 | # a(anchor) [boxes, (x1, y1, x2, y2)] 74 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 75 | 76 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 77 | 78 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 79 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 80 | exp_y1 = np.expand_dims(a[:, 1], 1) 81 | exp_y2 = np.expand_dims(a[:, 3], 1) 82 | 83 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 84 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 85 | iw = np.where(iw > 0, iw, 0) 86 | ih = np.where(ih > 0, ih, 0) 87 | 88 | intersection = iw * ih 89 | area = np.where(area > 1e-6, area, 1e-6) 90 | IoA = intersection / area 91 | return IoA 92 | 93 | 94 | def calc_iou(a, b): 95 | # a(anchor) [boxes, (x1, y1, x2, y2)] 96 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 97 | 98 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 99 | 100 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 101 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 102 | exp_y1 = np.expand_dims(a[:, 1], 1) 103 | exp_y2 = 
np.expand_dims(a[:, 3], 1) 104 | 105 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 106 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 107 | 108 | iw = np.where(iw > 0, iw, 0) 109 | ih = np.where(ih > 0, ih, 0) 110 | 111 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 112 | ua = np.where(ua > 0, ua, 1e-8) 113 | 114 | intersection = iw * ih 115 | IoU = intersection / ua 116 | return IoU 117 | 118 | 119 | def xy_to_wh(bbox): 120 | ctr_x = (bbox[0] + bbox[2]) / 2 121 | ctr_y = (bbox[1] + bbox[3]) / 2 122 | width = bbox[2] - bbox[0] 123 | height = bbox[3] - bbox[1] 124 | return ctr_x, ctr_y, width, height 125 | 126 | 127 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 128 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 129 | xt, yt, wt, ht = xy_to_wh(target_bbox) 130 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 131 | dist = np.zeros(2, dtype=np.float) 132 | dist[0] = (xo - xt) / wa 133 | dist[1] = (yo - yt) / ha 134 | 135 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 136 | 137 | 138 | def target_object_dist(target_objects_pos, objects_pos, anchors): 139 | width = anchors[:, 2] - anchors[:, 0] 140 | height = anchors[:, 3] - anchors[:, 1] 141 | anchors_size = np.stack([width, height], axis=1) 142 | anchors_size = np.expand_dims(anchors_size, axis=1) 143 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 144 | diff = target_objects_pos - objects_pos 145 | diff = diff / anchors_size 146 | dist = np.sum(diff**2, axis=2) 147 | return dist 148 | 149 | 150 | def hoi_match(image_id, preds_inst, preds_union, prior_mask, thre=0.05, human_thre=0.6, anchor_thre=0.25, loc_thre=0.1): 151 | num_inst = len(preds_inst["rois"]) 152 | humans = [] 153 | objects = [] 154 | human_bboxes = [] 155 | human_inst_ids = [] 156 | human_role_scores = [] 157 | 158 | while len(humans)==0: 159 | if human_thre < 0.5: 160 | break 161 | for inst_id in range(num_inst): 162 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 163 | continue 164 | item = {} 165 | item["bbox"] = preds_inst["rois"][inst_id] 166 | item["agent_scores"] = preds_inst["act_scores"][inst_id] 167 | item["role_scores"] = transform_action(preds_inst["act_scores"][inst_id], "subject") 168 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 169 | item["inst_id"] = inst_id 170 | humans.append(item) 171 | human_bboxes.append(item["bbox"]) 172 | human_inst_ids.append(item["inst_id"]) 173 | human_role_scores.append(item["role_scores"]) 174 | human_thre -= 0.1 175 | human_bboxes = np.array(human_bboxes) 176 | human_inst_ids = np.array(human_inst_ids) 177 | human_role_scores = np.array(human_role_scores) 178 | 179 | obj_role_scores = [] 180 | for obj_id in range(len(preds_inst["rois"])): 181 | item = {} 182 | obj_role_score = transform_action(preds_inst["act_scores"][obj_id], "object") 183 | obj_role_score = apply_prior(obj_role_score, preds_inst["obj_class_ids"][obj_id]) 184 | item["obj_role_scores"] = obj_role_score 185 | 186 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 187 | 188 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 189 | item["inst_id"] = obj_id 190 | obj_bbox = preds_inst["rois"][obj_id] 191 | item["bbox"] = obj_bbox 192 | objects.append(item) 193 | obj_role_scores.append(obj_role_score) 194 | object_bboxes = np.array(preds_inst["rois"]) 195 | obj_role_scores = np.array(obj_role_scores) 196 | 197 | 
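# hoi_pair_score holds one score per (detected human, detected instance, union action)
# triple, shape (len(humans), num detected instances, len(union_action_list)); the
# matching logic below populates it from the union-branch predictions.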
hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), len(union_action_list)), dtype=np.float) 198 | 199 | if len(human_bboxes) > 0: 200 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 201 | 202 | IoA_max = np.max(IoA, axis=1) 203 | human_foreground = IoA_max > 0.25 204 | human_IoA = IoA[human_foreground] 205 | for key in preds_union: 206 | preds_union[key] = preds_union[key][human_foreground] 207 | 208 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 209 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 210 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 211 | new_IoA_sec_max = np.max(new_IoA, axis=1) 212 | obj_foreground = new_IoA_sec_max > 0.25 213 | for key in preds_union: 214 | preds_union[key] = preds_union[key][obj_foreground] 215 | 216 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 217 | human_IoA = human_IoA[obj_foreground] 218 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.25), axis=1) 219 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 220 | 221 | num_union = len(preds_union["rois"]) 222 | num_human = len(human_bboxes) 223 | 224 | sp_vectors = preds_union["sp_vector"] 225 | inter_human_regions = human_bboxes[human_IoU_argmax] 226 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 227 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 3]) / 2 228 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 229 | inter_objects_pos = humans_pos + sp_vectors 230 | 231 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 232 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 233 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 234 | 235 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 236 | inter_human_instids = human_inst_ids[human_IoU_argmax] 237 | obj_dists[np.arange(num_union), inter_human_instids] = 100 238 | obj_dists[obj_IoA < 0.25] = 100 239 | inter_obj_ids = np.argmin(obj_dists, 1) 240 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 241 | 242 | sigma = 0.9 243 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 244 | location_scores = np.where(location_scores thre or prior_mask[id, 0] < 0.1: 290 | det[action + "_" + role] = (obj_bbox[0], obj_bbox[1], obj_bbox[2], obj_bbox[3], max_score[id]) 291 | agent_score = max_score[id] 292 | else: 293 | if human["role_scores"][id] > 0.0 and prior_mask[id, 0] > 0.1: 294 | det[action + "_" + role] = (0, 0, 0, 0, human["role_scores"][id] * human["obj_scores"] * prior_mask[id, 0]) 295 | agent_score = human["role_scores"][id] * human["obj_scores"] 296 | 297 | else: 298 | det[action + "_" + role] = (0, 0, 0, 0, 0) 299 | agent_score = 0 300 | 301 | if action + "_agent" not in det: 302 | det[action + "_agent"] = agent_score 303 | else: 304 | det[action + "_agent"] = max(agent_score, det[action + "_agent"]) 305 | for i in range(len(sub_label_to_class)): 306 | action = sub_label_to_class[i] 307 | if action + "_agent" not in det: 308 | det[action+"_agent"] = human["agent_scores"][i] 309 | dets.append(det) 310 | 311 | return dets 312 | 313 | 314 | def img_detect(file, img_dir, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold): 315 | fname, ext = os.path.splitext(file) 316 | image_id = int(fname.split("_")[-1]) 317 | 318 | img_path = os.path.join(img_dir, file) 319 | ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size) 320 | if use_cuda: 321 | x = 
torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0) 322 | else: 323 | x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0) 324 | 325 | x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2) 326 | 327 | if args.flip_test: 328 | ids = torch.arange(x.shape[-1]-1, -1, -1).long().cuda() 329 | x_flip = x[..., ids] 330 | x_cat = torch.cat([x, x_flip], 0) 331 | 332 | with torch.no_grad(): 333 | if args.flip_test: 334 | 335 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 336 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x_cat) 337 | 338 | anchors = torch.cat([anchors, anchors], 0) 339 | preds_union = postprocess_dense_union_flip(x_cat, anchors, union_act_cls, union_sub_reg, union_obj_reg, 340 | regressBoxes, clipBoxes, 0.5, 1) 341 | preds_inst = postprocess_hoi_flip(x_cat, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 342 | regressBoxes, clipBoxes, threshold, nms_threshold, 343 | mode="object", classwise=True) 344 | else: 345 | 346 | 347 | features, union_act_cls, union_sub_reg, union_obj_reg, \ 348 | inst_act_cls, inst_obj_cls, inst_bbox_reg, anchors = model(x) 349 | 350 | preds_union = postprocess_dense_union(x, anchors, union_act_cls, union_sub_reg, union_obj_reg, 351 | regressBoxes, clipBoxes, 0.5, 1, classwise=True) 352 | preds_inst = postprocess_hoi(x, anchors, inst_bbox_reg, inst_obj_cls, inst_act_cls, 353 | regressBoxes, clipBoxes, threshold, nms_threshold, 354 | mode="object", classwise=True) 355 | 356 | preds_inst = invert_affine(framed_metas, preds_inst)[0] 357 | preds_union = invert_affine(framed_metas, preds_union)[0] 358 | 359 | dets = hoi_match(image_id, preds_inst, preds_union, prior_mask) 360 | 361 | return dets 362 | 363 | 364 | def test(threshold=0.2): 365 | with open("datasets/vcoco/new_prior_mask.pkl", "rb") as file: 366 | prior_mask = pickle.load(file, encoding="bytes") 367 | 368 | model = EfficientDetBackbone(num_classes=len(eval(params["obj_list"])), num_union_classes=25, 369 | num_inst_classes=51, compound_coef=args.compound_coef, 370 | ratios=eval(params["anchors_ratios"]), scales=eval(params["anchors_scales"])) 371 | model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu'))) 372 | model.requires_grad_(False) 373 | model.eval() 374 | 375 | if args.cuda: 376 | model = model.cuda() 377 | if args.float16: 378 | model = model.half() 379 | 380 | regressBoxes = BBoxTransform() 381 | clipBoxes = ClipBoxes() 382 | 383 | img_dir = os.path.join(data_dir, "vcoco/coco/images/%s" % "val2014") 384 | 385 | with open(os.path.join(data_dir, 'vcoco/data/splits/vcoco_test.ids'), 'r') as f: 386 | image_ids = f.readlines() 387 | image_ids = [int(id) for id in image_ids] 388 | 389 | _t = {'im_detect': Timer(), 'misc': Timer()} 390 | detection = [] 391 | 392 | for i, image_id in enumerate(image_ids): 393 | 394 | _t['im_detect'].tic() 395 | 396 | file = "COCO_val2014_" + (str(image_id)).zfill(12) + '.jpg' 397 | 398 | img_detection = img_detect(file, img_dir, model, input_size, regressBoxes, clipBoxes, prior_mask, threshold=threshold) 399 | detection.extend(img_detection) 400 | if need_visual: 401 | visual(img_detection, image_id) 402 | _t['im_detect'].toc() 403 | 404 | print('im_detect: {:d}/{:d}, average time: {:.3f}s'.format(i + 1, len(image_ids), _t['im_detect'].average_time)) 405 | 406 | with open(detection_path, "wb") as file: 407 | pickle.dump(detection, file) 408 | 409 | 410 | if __name__ == '__main__': 411 | vsrl_annot_file = "./datasets/vcoco/data/vcoco/vcoco_test.json" 412 | 
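# V-COCO evaluation inputs: vsrl_annot_file holds the verb/role annotations for the test
# split, coco_file the COCO val2014 instance annotations, and split_file the list of test
# image ids; VCOCOeval then scores the pickled detections at an IoU threshold of 0.5
# (ovr_thresh=0.5).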
coco_file = "./datasets/vcoco/coco/annotations/instances_val2014.json" 413 | split_file = "./datasets/vcoco/data/splits/vcoco_test.ids" 414 | if override_prev_results or not os.path.exists(detection_path): 415 | test() 416 | vcocoeval = VCOCOeval(vsrl_annot_file, coco_file, split_file) 417 | vcocoeval._do_eval(detection_path, ovr_thresh=0.5) 418 | 419 | 420 | 421 | 422 | -------------------------------------------------------------------------------- /efficientdet/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops.boxes import nms as nms_torch 4 | 5 | from efficientnet import EfficientNet as EffNet 6 | from efficientnet.utils import MemoryEfficientSwish, Swish 7 | from efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding 8 | 9 | 10 | def nms(dets, thresh): 11 | return nms_torch(dets[:, :4], dets[:, 4], thresh) 12 | 13 | 14 | class SeparableConvBlock(nn.Module): 15 | """ 16 | created by Zylo117 17 | """ 18 | 19 | def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False): 20 | super(SeparableConvBlock, self).__init__() 21 | if out_channels is None: 22 | out_channels = in_channels 23 | 24 | # Q: whether separate conv 25 | # share bias between depthwise_conv and pointwise_conv 26 | # or just pointwise_conv apply bias. 27 | # A: Confirmed, just pointwise_conv applies bias, depthwise_conv has no bias. 28 | 29 | self.depthwise_conv = Conv2dStaticSamePadding(in_channels, in_channels, 30 | kernel_size=3, stride=1, groups=in_channels, bias=False) 31 | self.pointwise_conv = Conv2dStaticSamePadding(in_channels, out_channels, kernel_size=1, stride=1) 32 | 33 | self.norm = norm 34 | if self.norm: 35 | # Warning: pytorch momentum is different from tensorflow's, momentum_pytorch = 1 - momentum_tensorflow 36 | self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3) 37 | 38 | self.activation = activation 39 | if self.activation: 40 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 41 | 42 | def forward(self, x): 43 | x = self.depthwise_conv(x) 44 | x = self.pointwise_conv(x) 45 | 46 | if self.norm: 47 | x = self.bn(x) 48 | 49 | if self.activation: 50 | x = self.swish(x) 51 | 52 | return x 53 | 54 | 55 | class BiFPN(nn.Module): 56 | """ 57 | modified by Zylo117 58 | """ 59 | 60 | def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True): 61 | """ 62 | 63 | Args: 64 | num_channels: 65 | conv_channels: 66 | first_time: whether the input comes directly from the efficientnet, 67 | if True, downchannel it first, and downsample P5 to generate P6 then P7 68 | epsilon: epsilon of fast weighted attention sum of BiFPN, not the BN's epsilon 69 | onnx_export: if True, use Swish instead of MemoryEfficientSwish 70 | """ 71 | super(BiFPN, self).__init__() 72 | self.epsilon = epsilon 73 | # Conv layers 74 | self.conv6_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 75 | self.conv5_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 76 | self.conv4_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 77 | self.conv3_up = SeparableConvBlock(num_channels, onnx_export=onnx_export) 78 | self.conv4_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 79 | self.conv5_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 80 | self.conv6_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 
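# The *_up blocks refine the top-down pathway (P7 -> P3) and the *_down blocks the
# bottom-up pathway (P4 -> P7); every fusion node uses a depthwise-separable 3x3 conv
# (SeparableConvBlock above), which keeps the extra cost per BiFPN layer small.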
81 | self.conv7_down = SeparableConvBlock(num_channels, onnx_export=onnx_export) 82 | 83 | # Feature scaling layers 84 | self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest') 85 | self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest') 86 | self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest') 87 | self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest') 88 | 89 | self.p4_downsample = MaxPool2dStaticSamePadding(3, 2) 90 | self.p5_downsample = MaxPool2dStaticSamePadding(3, 2) 91 | self.p6_downsample = MaxPool2dStaticSamePadding(3, 2) 92 | self.p7_downsample = MaxPool2dStaticSamePadding(3, 2) 93 | 94 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 95 | 96 | self.first_time = first_time 97 | if self.first_time: 98 | self.p5_down_channel = nn.Sequential( 99 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 100 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 101 | ) 102 | self.p4_down_channel = nn.Sequential( 103 | Conv2dStaticSamePadding(conv_channels[1], num_channels, 1), 104 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 105 | ) 106 | self.p3_down_channel = nn.Sequential( 107 | Conv2dStaticSamePadding(conv_channels[0], num_channels, 1), 108 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 109 | ) 110 | 111 | self.p5_to_p6 = nn.Sequential( 112 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 113 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 114 | MaxPool2dStaticSamePadding(3, 2) 115 | ) 116 | self.p6_to_p7 = nn.Sequential( 117 | MaxPool2dStaticSamePadding(3, 2) 118 | ) 119 | 120 | self.p4_down_channel_2 = nn.Sequential( 121 | Conv2dStaticSamePadding(conv_channels[1], num_channels, 1), 122 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 123 | ) 124 | self.p5_down_channel_2 = nn.Sequential( 125 | Conv2dStaticSamePadding(conv_channels[2], num_channels, 1), 126 | nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3), 127 | ) 128 | 129 | # Weight 130 | self.p6_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 131 | self.p6_w1_relu = nn.ReLU() 132 | self.p5_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 133 | self.p5_w1_relu = nn.ReLU() 134 | self.p4_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 135 | self.p4_w1_relu = nn.ReLU() 136 | self.p3_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 137 | self.p3_w1_relu = nn.ReLU() 138 | 139 | self.p4_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 140 | self.p4_w2_relu = nn.ReLU() 141 | self.p5_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 142 | self.p5_w2_relu = nn.ReLU() 143 | self.p6_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True) 144 | self.p6_w2_relu = nn.ReLU() 145 | self.p7_w2 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True) 146 | self.p7_w2_relu = nn.ReLU() 147 | 148 | self.attention = attention 149 | 150 | def forward(self, inputs): 151 | """ 152 | illustration of a minimal bifpn unit 153 | P7_0 -------------------------> P7_2 --------> 154 | |-------------| ↑ 155 | ↓ | 156 | P6_0 ---------> P6_1 ---------> P6_2 --------> 157 | |-------------|--------------↑ ↑ 158 | ↓ | 159 | P5_0 ---------> P5_1 ---------> P5_2 --------> 160 | |-------------|--------------↑ ↑ 161 | ↓ | 162 | P4_0 ---------> P4_1 ---------> P4_2 --------> 163 | |-------------|--------------↑ ↑ 164 | |--------------↓ | 165 | P3_0 -------------------------> 
P3_2 --------> 166 | """ 167 | 168 | # downsample channels using same-padding conv2d to target phase's if not the same 169 | # judge: same phase as target, 170 | # if same, pass; 171 | # elif earlier phase, downsample to target phase's by pooling 172 | # elif later phase, upsample to target phase's by nearest interpolation 173 | 174 | if self.attention: 175 | p3_out, p4_out, p5_out, p6_out, p7_out = self._forward_fast_attention(inputs) 176 | else: 177 | p3_out, p4_out, p5_out, p6_out, p7_out = self._forward(inputs) 178 | 179 | return p3_out, p4_out, p5_out, p6_out, p7_out 180 | 181 | def _forward_fast_attention(self, inputs): 182 | if self.first_time: 183 | p3, p4, p5 = inputs 184 | 185 | p6_in = self.p5_to_p6(p5) 186 | p7_in = self.p6_to_p7(p6_in) 187 | 188 | p3_in = self.p3_down_channel(p3) 189 | p4_in = self.p4_down_channel(p4) 190 | p5_in = self.p5_down_channel(p5) 191 | 192 | else: 193 | # P3_0, P4_0, P5_0, P6_0 and P7_0 194 | p3_in, p4_in, p5_in, p6_in, p7_in = inputs 195 | 196 | # P7_0 to P7_2 197 | 198 | # Weights for P6_0 and P7_0 to P6_1 199 | p6_w1 = self.p6_w1_relu(self.p6_w1) 200 | weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon) 201 | # Connections for P6_0 and P7_0 to P6_1 respectively 202 | p6_up = self.conv6_up(self.swish(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in))) 203 | 204 | # Weights for P5_0 and P6_0 to P5_1 205 | p5_w1 = self.p5_w1_relu(self.p5_w1) 206 | weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon) 207 | # Connections for P5_0 and P6_0 to P5_1 respectively 208 | p5_up = self.conv5_up(self.swish(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up))) 209 | 210 | # Weights for P4_0 and P5_0 to P4_1 211 | p4_w1 = self.p4_w1_relu(self.p4_w1) 212 | weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon) 213 | # Connections for P4_0 and P5_0 to P4_1 respectively 214 | p4_up = self.conv4_up(self.swish(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up))) 215 | 216 | # Weights for P3_0 and P4_1 to P3_2 217 | p3_w1 = self.p3_w1_relu(self.p3_w1) 218 | weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon) 219 | # Connections for P3_0 and P4_1 to P3_2 respectively 220 | p3_out = self.conv3_up(self.swish(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up))) 221 | 222 | if self.first_time: 223 | p4_in = self.p4_down_channel_2(p4) 224 | p5_in = self.p5_down_channel_2(p5) 225 | 226 | # Weights for P4_0, P4_1 and P3_2 to P4_2 227 | p4_w2 = self.p4_w2_relu(self.p4_w2) 228 | weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon) 229 | # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively 230 | p4_out = self.conv4_down( 231 | self.swish(weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out))) 232 | 233 | # Weights for P5_0, P5_1 and P4_2 to P5_2 234 | p5_w2 = self.p5_w2_relu(self.p5_w2) 235 | weight = p5_w2 / (torch.sum(p5_w2, dim=0) + self.epsilon) 236 | # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively 237 | p5_out = self.conv5_down( 238 | self.swish(weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out))) 239 | 240 | # Weights for P6_0, P6_1 and P5_2 to P6_2 241 | p6_w2 = self.p6_w2_relu(self.p6_w2) 242 | weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon) 243 | # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively 244 | p6_out = self.conv6_down( 245 | self.swish(weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out))) 246 | 247 | # Weights for P7_0 and P6_2 to P7_2 248 | p7_w2 = self.p7_w2_relu(self.p7_w2) 249 | weight = p7_w2 / 
(torch.sum(p7_w2, dim=0) + self.epsilon) 250 | # Connections for P7_0 and P6_2 to P7_2 251 | p7_out = self.conv7_down(self.swish(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out))) 252 | 253 | return p3_out, p4_out, p5_out, p6_out, p7_out 254 | 255 | def _forward(self, inputs): 256 | if self.first_time: 257 | p3, p4, p5 = inputs 258 | 259 | p6_in = self.p5_to_p6(p5) 260 | p7_in = self.p6_to_p7(p6_in) 261 | 262 | p3_in = self.p3_down_channel(p3) 263 | p4_in = self.p4_down_channel(p4) 264 | p5_in = self.p5_down_channel(p5) 265 | 266 | else: 267 | # P3_0, P4_0, P5_0, P6_0 and P7_0 268 | p3_in, p4_in, p5_in, p6_in, p7_in = inputs 269 | 270 | # P7_0 to P7_2 271 | 272 | # Connections for P6_0 and P7_0 to P6_1 respectively 273 | p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_in))) 274 | 275 | # Connections for P5_0 and P6_0 to P5_1 respectively 276 | p5_up = self.conv5_up(self.swish(p5_in + self.p5_upsample(p6_up))) 277 | 278 | # Connections for P4_0 and P5_0 to P4_1 respectively 279 | p4_up = self.conv4_up(self.swish(p4_in + self.p4_upsample(p5_up))) 280 | 281 | # Connections for P3_0 and P4_1 to P3_2 respectively 282 | p3_out = self.conv3_up(self.swish(p3_in + self.p3_upsample(p4_up))) 283 | 284 | if self.first_time: 285 | p4_in = self.p4_down_channel_2(p4) 286 | p5_in = self.p5_down_channel_2(p5) 287 | 288 | # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively 289 | p4_out = self.conv4_down( 290 | self.swish(p4_in + p4_up + self.p4_downsample(p3_out))) 291 | 292 | # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively 293 | p5_out = self.conv5_down( 294 | self.swish(p5_in + p5_up + self.p5_downsample(p4_out))) 295 | 296 | # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively 297 | p6_out = self.conv6_down( 298 | self.swish(p6_in + p6_up + self.p6_downsample(p5_out))) 299 | 300 | # Connections for P7_0 and P6_2 to P7_2 301 | p7_out = self.conv7_down(self.swish(p7_in + self.p7_downsample(p6_out))) 302 | 303 | return p3_out, p4_out, p5_out, p6_out, p7_out 304 | 305 | 306 | class Regressor(nn.Module): 307 | """ 308 | modified by Zylo117 309 | """ 310 | 311 | def __init__(self, in_channels, num_anchors, num_layers, onnx_export=False): 312 | super(Regressor, self).__init__() 313 | self.num_layers = num_layers 314 | 315 | self.conv_list = nn.ModuleList( 316 | [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)]) 317 | self.bn_list = nn.ModuleList( 318 | [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in 319 | range(5)]) 320 | self.header = SeparableConvBlock(in_channels, num_anchors * 4, norm=False, activation=False) 321 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 322 | 323 | def forward(self, inputs): 324 | feats = [] 325 | for feat, bn_list in zip(inputs, self.bn_list): 326 | for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list): 327 | feat = conv(feat) 328 | feat = bn(feat) 329 | feat = self.swish(feat) 330 | feat = self.header(feat) 331 | 332 | feat = feat.permute(0, 2, 3, 1) # (batch_size, height, width, num_anchor*4) 333 | feat = feat.contiguous().view(feat.shape[0], -1, 4) # (batch_size, h*w*num_anchor, 4) 334 | 335 | feats.append(feat) 336 | 337 | feats = torch.cat(feats, dim=1) # (batch_size, h*w*feat_num*num_anchor, 4) 338 | 339 | return feats 340 | 341 | 342 | class Classifier(nn.Module): 343 | """ 344 | modified by Zylo117 345 | """ 346 | 347 | def __init__(self, in_channels, num_anchors, num_classes, 
num_layers, onnx_export=False): 348 | super(Classifier, self).__init__() 349 | self.num_anchors = num_anchors 350 | self.num_classes = num_classes 351 | self.num_layers = num_layers 352 | self.conv_list = nn.ModuleList( 353 | [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)]) 354 | self.bn_list = nn.ModuleList( 355 | [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in 356 | range(5)]) 357 | self.header = SeparableConvBlock(in_channels, num_anchors * num_classes, norm=False, activation=False) 358 | self.swish = MemoryEfficientSwish() if not onnx_export else Swish() 359 | 360 | def forward(self, inputs): 361 | feats = [] 362 | for feat, bn_list in zip(inputs, self.bn_list): 363 | for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list): 364 | feat = conv(feat) 365 | feat = bn(feat) 366 | feat = self.swish(feat) 367 | feat = self.header(feat) 368 | 369 | feat = feat.permute(0, 2, 3, 1) # (batch_size, height, width, num_anchors*num_classes) 370 | feat = feat.contiguous().view(feat.shape[0], feat.shape[1], feat.shape[2], self.num_anchors, 371 | self.num_classes) # (batch_size, height, width, num_anchors, num_classes) 372 | feat = feat.contiguous().view(feat.shape[0], -1, self.num_classes) # (batch_size, h*w*num_anchor, num_classes) 373 | 374 | feats.append(feat) 375 | 376 | feats = torch.cat(feats, dim=1) # (batch_size, h*w*num_anchor*feat_num, num_classes) 377 | feats = feats.sigmoid() 378 | 379 | return feats 380 | 381 | 382 | class EfficientNet(nn.Module): 383 | """ 384 | modified by Zylo117 385 | """ 386 | 387 | def __init__(self, compound_coef, load_weights=False): 388 | super(EfficientNet, self).__init__() 389 | model = EffNet.from_pretrained(f'efficientnet-b{compound_coef}', load_weights) 390 | del model._conv_head 391 | del model._bn1 392 | del model._avg_pooling 393 | del model._dropout 394 | del model._fc 395 | self.model = model 396 | 397 | def forward(self, x): 398 | x = self.model._conv_stem(x) 399 | x = self.model._bn0(x) 400 | x = self.model._swish(x) 401 | feature_maps = [] 402 | 403 | # TODO: temporarily storing extra tensor last_x and del it later might not be a good idea, 404 | # try recording stride changing when creating efficientnet, 405 | # and then apply it here. 
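# The loop below tracks the previous block's output in `last_x`. Whenever a block downsamples
# (its depthwise conv has stride 2), `last_x` is the last feature map of the previous resolution
# stage and gets recorded; the final block's output is recorded as well. `feature_maps[1:]` then
# drops the earliest, highest-resolution map before returning.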
406 | last_x = None 407 | for idx, block in enumerate(self.model._blocks): 408 | drop_connect_rate = self.model._global_params.drop_connect_rate 409 | if drop_connect_rate: 410 | drop_connect_rate *= float(idx) / len(self.model._blocks) 411 | x = block(x, drop_connect_rate=drop_connect_rate) 412 | 413 | if block._depthwise_conv.stride == [2, 2]: 414 | feature_maps.append(last_x) 415 | elif idx == len(self.model._blocks) - 1: 416 | feature_maps.append(x) 417 | last_x = x 418 | del last_x 419 | return feature_maps[1:] 420 | 421 | 422 | if __name__ == '__main__': 423 | from tensorboardX import SummaryWriter 424 | 425 | 426 | def count_parameters(model): 427 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 428 | -------------------------------------------------------------------------------- /test_hico-det.py: -------------------------------------------------------------------------------- 1 | # Author: Zylo117 2 | 3 | """ 4 | COCO-Style Evaluations 5 | 6 | put images here datasets/your_project_name/annotations/val_set_name/*.jpg 7 | put annotations here datasets/your_project_name/annotations/instances_{val_set_name}.json 8 | put weights here /path/to/your/weights/*.pth 9 | change compound_coef 10 | 11 | """ 12 | 13 | import json 14 | import os 15 | import cv2 16 | import time 17 | import glob 18 | 19 | import argparse 20 | import torch 21 | import yaml 22 | import pickle 23 | import numpy as np 24 | # from pycocotools.cocoeval import COCOeval 25 | 26 | # from utils.vsrl_eval import VCOCOeval 27 | from backbone import EfficientDetBackbone 28 | from efficientdet.utils import BBoxTransform, ClipBoxes 29 | from efficientdet.help_function import single_iou, single_ioa, single_inter, single_union 30 | from utils.utils import preprocess, invert_affine, postprocess, postprocess_hoi, postprocess_dense_union, postprocess_hoi_flip, postprocess_dense_union_flip 31 | # from utils.apply_prior import apply_prior 32 | from utils.timer import Timer 33 | from utils.visual_hico import visual_hico 34 | from Generate_HICO_detection import Generate_HICO_detection 35 | 36 | 37 | ap = argparse.ArgumentParser() 38 | ap.add_argument('-p', '--project', type=str, default='hico-det', help='project file that contains parameters') 39 | ap.add_argument('-c', '--compound_coef', type=int, default=3, help='coefficients of efficientdet') 40 | ap.add_argument('-w', '--weights', type=str, default=None, help='/path/to/weights') 41 | ap.add_argument('--nms_threshold', type=float, default=0.3, help='nms threshold, don\'t change it if not for testing purposes') 42 | ap.add_argument('--cuda', type=int, default=1) 43 | ap.add_argument('--device', type=int, default=0) 44 | ap.add_argument('--float16', type=int, default=0) 45 | ap.add_argument('--override', type=int, default=0, help='override previous bbox results file if exists') 46 | ap.add_argument('--data_dir', type=str, default='./datasets', help='the root folder of dataset') 47 | ap.add_argument('--need_visual', type=int, default=0, help='whether need to visualize the results') 48 | ap.add_argument('--flip_test', type=int, default=1, help='whether apply flip augmentation when testing') 49 | 50 | 51 | args = ap.parse_args() 52 | 53 | compound_coef = args.compound_coef 54 | nms_threshold = args.nms_threshold 55 | use_cuda = args.cuda 56 | gpu = args.device 57 | use_float16 = args.float16 58 | override_prev_results = args.override 59 | need_visual = args.need_visual 60 | weights_path = f'weights/efficientdet-d{compound_coef}.pth' if args.weights is None else 
args.weights 61 | data_dir = args.data_dir 62 | project = args.project 63 | 64 | params = yaml.safe_load(open(f'projects/{project}.yml')) 65 | SET_NAME = params['val_set'] 66 | project_name = params["project_name"] 67 | 68 | 69 | print(f'running coco-style evaluation on project {project_name}, weights {weights_path}...') 70 | 71 | params = yaml.safe_load(open(f'projects/{project}.yml')) 72 | num_objects = 90 73 | num_union_actions = 117 74 | num_union_hois = 600 75 | num_inst_actions = 234 76 | 77 | input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] 78 | input_size = input_sizes[compound_coef] 79 | output_dir = f"./logs/{project_name}/results" 80 | 81 | if not os.path.exists(output_dir): 82 | os.mkdir(output_dir) 83 | 84 | if args.flip_test: 85 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_flip_final.pkl') 86 | else: 87 | detection_path = os.path.join(output_dir, f'{SET_NAME}_bbox_results_final.pkl') 88 | 89 | obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 90 | 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 91 | 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie', 92 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 93 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 94 | 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 95 | 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv', 96 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 97 | 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 98 | 'toothbrush'] 99 | 100 | obj_dict = {} 101 | cid = 0 102 | for obj in obj_list: 103 | if obj != "": 104 | cid += 1 105 | obj_dict[obj] = cid 106 | 107 | with open(args.data_dir + "/hico_20160224_det/hico_processed/verb_list.json", "r") as file: 108 | verbs_hico = json.load(file) 109 | verbs_dict = {} 110 | for id, item in enumerate(verbs_hico): 111 | verb_name = item["name"] 112 | verbs_dict[verb_name] = id 113 | 114 | with open(args.data_dir + "/hico_20160224_det/hico_processed/hoi_list.json", "r") as file: 115 | hois_hico = json.load(file) 116 | verb_to_hoi = {} 117 | for hoi_id, item in enumerate(hois_hico): 118 | verb_id = verbs_dict[item["verb"]] 119 | if verb_id in verb_to_hoi: 120 | verb_to_hoi[verb_id].append(hoi_id) 121 | else: 122 | verb_to_hoi[verb_id] = [hoi_id] 123 | 124 | n = 0 125 | for verb_id in verb_to_hoi: 126 | n += len(verb_to_hoi[verb_id]) 127 | verb_to_hoi[verb_id] = np.array(verb_to_hoi[verb_id]) 128 | assert n == num_union_hois 129 | 130 | 131 | def calc_ioa(a, b): 132 | # a(anchor) [boxes, (x1, y1, x2, y2)] 133 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 134 | 135 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 136 | 137 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 138 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 139 | exp_y1 = np.expand_dims(a[:, 1], 1) 140 | exp_y2 = np.expand_dims(a[:, 3], 1) 141 | 142 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 143 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 144 | # iw = torch.clamp(iw, min=0) 145 | # ih = torch.clamp(ih, min=0) 146 | iw = np.where(iw > 0, 
iw, 0) 147 | ih = np.where(ih > 0, ih, 0) 148 | 149 | intersection = iw * ih 150 | area = np.where(area > 1e-6, area, 1e-6) 151 | IoA = intersection / area 152 | # IoA[torch.isnan(IoA)] = 1 153 | return IoA 154 | 155 | 156 | def calc_iou(a, b): 157 | # a(anchor) [boxes, (x1, y1, x2, y2)] 158 | # b(gt, coco-style) [boxes, (x1, y1, x2, y2)] 159 | 160 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 161 | 162 | exp_x1 = np.expand_dims(a[:, 0], axis=1) 163 | exp_x2 = np.expand_dims(a[:, 2], axis=1) 164 | exp_y1 = np.expand_dims(a[:, 1], 1) 165 | exp_y2 = np.expand_dims(a[:, 3], 1) 166 | 167 | iw = np.where(exp_x2 < b[:, 2], exp_x2, b[:, 2]) - np.where(exp_x1 > b[:, 0], exp_x1, b[:, 0]) 168 | ih = np.where(exp_y2 < b[:, 3], exp_y2, b[:, 3]) - np.where(exp_y1 > b[:, 1], exp_y1, b[:, 1]) 169 | # iw = torch.clamp(iw, min=0) 170 | # ih = torch.clamp(ih, min=0) 171 | iw = np.where(iw > 0, iw, 0) 172 | ih = np.where(ih > 0, ih, 0) 173 | 174 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 175 | ua = np.where(ua > 0, ua, 1e-8) 176 | 177 | intersection = iw * ih 178 | IoU = intersection / ua 179 | return IoU 180 | 181 | 182 | def transform_class_id(id): 183 | class_name = obj_list[id] 184 | hico_obj_id = obj_dict[class_name] 185 | return hico_obj_id 186 | 187 | 188 | def transform_action_hico(act_scores, mode): 189 | union_scores = np.zeros(num_union_actions) 190 | for i in range(num_inst_actions//2): 191 | if mode == "subject": 192 | union_scores[verb_to_hoi[i]] = act_scores[i] 193 | else: 194 | union_scores[verb_to_hoi[i]] = act_scores[i + num_inst_actions//2] 195 | return union_scores 196 | 197 | 198 | def xy_to_wh(bbox): 199 | ctr_x = (bbox[0] + bbox[2]) / 2 200 | ctr_y = (bbox[1] + bbox[3]) / 2 201 | width = bbox[2] - bbox[0] 202 | height = bbox[3] - bbox[1] 203 | return ctr_x, ctr_y, width, height 204 | 205 | 206 | def fetch_location_score(anchor_bbox, obj_bbox, target_bbox, human_bbox, sigma): 207 | xo, yo, wo, ho = xy_to_wh(obj_bbox) 208 | xt, yt, wt, ht = xy_to_wh(target_bbox) 209 | # xh, yh, wh, hh = xy_to_wh(human_bbox) 210 | xa, ya, wa, ha = xy_to_wh(anchor_bbox) 211 | dist = np.zeros(4, dtype=np.float) 212 | dist[0] = (xo - xt) / wa 213 | dist[1] = (yo - yt) / ha 214 | # dist[0] = (xo - xt) / wh 215 | # dist[1] = (yo - yt) / hh 216 | # dist[2] = np.log(wo/wt) 217 | # dist[3] = np.log(ho/ht) 218 | 219 | return np.exp(-1*np.sum(dist**2)/(2*sigma**2)) 220 | 221 | 222 | def target_object_dist(target_objects_pos, objects_pos, anchors): 223 | width = anchors[:, 2] - anchors[:, 0] 224 | height = anchors[:, 3] - anchors[:, 1] 225 | anchors_size = np.stack([width, height], axis=1) 226 | anchors_size = np.expand_dims(anchors_size, axis=1) 227 | target_objects_pos = np.expand_dims(target_objects_pos, 1) 228 | diff = target_objects_pos - objects_pos 229 | diff = diff / anchors_size 230 | dist = np.sum(diff**2, axis=2) 231 | return dist 232 | 233 | 234 | def hoi_match(image_id, preds_inst, preds_union, human_thre=0.3, anchor_thre=0.1, loc_thre=0.05): 235 | num_inst = len(preds_inst["rois"]) 236 | humans = [] 237 | objects = [] 238 | human_bboxes = [] 239 | human_inst_ids = [] 240 | human_role_scores = [] 241 | human_obj_scores = [] 242 | 243 | while len(humans) == 0: 244 | if human_thre < 0.2: 245 | break 246 | for inst_id in range(num_inst): 247 | if preds_inst["obj_class_ids"][inst_id] != 0 or preds_inst["obj_scores"][inst_id] < human_thre: 248 | continue 249 | item = {} 250 | item["bbox"] = preds_inst["rois"][inst_id] 251 | item["role_scores"] = 
preds_inst["act_scores"][inst_id][:len(verb_to_hoi)] 252 | # item["role_scores"] = transform_action_hico(preds_inst["act_scores"][inst_id], "subject") 253 | item["obj_scores"] = preds_inst["obj_scores"][inst_id] 254 | item["inst_id"] = inst_id 255 | humans.append(item) 256 | human_bboxes.append(item["bbox"]) 257 | human_inst_ids.append(item["inst_id"]) 258 | human_role_scores.append(item["role_scores"]) 259 | human_obj_scores.append(item["obj_scores"] ) 260 | human_thre -= 0.1 261 | human_bboxes = np.array(human_bboxes) 262 | human_inst_ids = np.array(human_inst_ids) 263 | human_role_scores = np.array(human_role_scores) 264 | human_obj_scores = np.array(human_obj_scores) 265 | 266 | obj_role_scores = [] 267 | obj_obj_scores = [] 268 | for obj_id in range(len(preds_inst["rois"])): 269 | item = {} 270 | # obj_role_score = transform_action_hico(preds_inst["act_scores"][obj_id], "object") 271 | obj_role_score = preds_inst["act_scores"][obj_id][len(verb_to_hoi):] 272 | item["obj_role_scores"] = obj_role_score 273 | item["obj_scores"] = preds_inst["obj_scores"][obj_id] 274 | 275 | item["obj_class_id"] = preds_inst["obj_class_ids"][obj_id] 276 | 277 | obj_bbox = preds_inst["rois"][obj_id] 278 | item["bbox"] = obj_bbox 279 | objects.append(item) 280 | obj_role_scores.append(obj_role_score) 281 | obj_obj_scores.append(item["obj_scores"]) 282 | object_bboxes = np.array(preds_inst["rois"]) 283 | obj_role_scores = np.array(obj_role_scores) 284 | obj_obj_scores = np.array(obj_obj_scores) 285 | 286 | hoi_pair_score = np.zeros((len(humans), len(preds_inst["obj_class_ids"]), num_union_actions), dtype=np.float) 287 | 288 | if len(human_bboxes) > 0: 289 | IoA = calc_ioa(preds_union["rois"], human_bboxes) 290 | 291 | IoA_max = np.max(IoA, axis=1) 292 | human_foreground = IoA_max > 0.1 # 0.25 293 | human_IoA = IoA[human_foreground] 294 | for key in preds_union: 295 | preds_union[key] = preds_union[key][human_foreground] 296 | 297 | new_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 298 | new_IoA_argmax = np.argmax(new_IoA, axis=1) 299 | new_IoA[np.arange(new_IoA.shape[0]), new_IoA_argmax] = 0 300 | new_IoA_sec_max = np.max(new_IoA, axis=1) 301 | obj_foreground = new_IoA_sec_max > 0.1 # 0.25 302 | for key in preds_union: 303 | preds_union[key] = preds_union[key][obj_foreground] 304 | 305 | human_IoU = calc_iou(preds_union["rois"], human_bboxes) 306 | human_IoA = human_IoA[obj_foreground] 307 | human_IoU_argmax = np.argmax(human_IoU * (human_IoA > 0.1), axis=1) # 0.25 308 | obj_IoA = calc_ioa(preds_union["rois"], preds_inst["rois"]) 309 | 310 | num_union = len(preds_union["rois"]) 311 | num_human = len(human_bboxes) 312 | 313 | sp_vectors = preds_union["sp_vector"] 314 | inter_human_regions = human_bboxes[human_IoU_argmax] 315 | humans_pos_x = (inter_human_regions[:, 0] + inter_human_regions[:, 2]) / 2 316 | humans_pos_y = (inter_human_regions[:, 1] + inter_human_regions[:, 3]) / 2 317 | humans_pos = np.stack([humans_pos_x, humans_pos_y], axis=1) 318 | inter_objects_pos = humans_pos + sp_vectors 319 | 320 | objects_pos_x = (object_bboxes[:, 0] + object_bboxes[:, 2]) / 2 321 | objects_pos_y = (object_bboxes[:, 1] + object_bboxes[:, 3]) / 2 322 | objects_pos = np.stack([objects_pos_x, objects_pos_y], axis=1) 323 | 324 | obj_dists = target_object_dist(inter_objects_pos, objects_pos, preds_union["rois"]) 325 | inter_human_instids = human_inst_ids[human_IoU_argmax] 326 | obj_dists[np.arange(num_union), inter_human_instids] = 100 327 | obj_dists[obj_IoA < 0.1] = 100 # 0.25 328 | inter_obj_ids = 
np.argmin(obj_dists, 1) 329 | inter_obj_dist = obj_dists[np.arange(num_union), inter_obj_ids] 330 | 331 | sigma = 0.6 332 | location_scores = np.exp(-1 * inter_obj_dist / (2 * sigma ** 2)) 333 | location_scores = np.where(location_scores