├── .gitignore
├── .idea
│   ├── Relation_Networks-pytorch.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── Weights
│   └── README.md
├── config.py
├── data
│   ├── __init__.py
│   ├── dataset.py
│   ├── util.py
│   └── voc_dataset.py
├── demo.ipynb
├── demo
│   ├── demo.jpg
│   └── demo_output.png
├── evaluate.py
├── experiments.py
├── lib
│   ├── array_tool.py
│   ├── bbox_tools.py
│   ├── creator_tool.py
│   ├── eval_tool.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── _nms_gpu_post.c
│   │   ├── _nms_gpu_post.pyx
│   │   ├── _nms_gpu_post_py.py
│   │   ├── build.py
│   │   └── non_maximum_suppression.py
│   ├── relation_tool.py
│   ├── roi_cupy.py
│   └── vis_tool.py
├── losses.py
├── model.py
├── train.py
├── train_history.txt
└── trainer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | MANIFEST
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | .pytest_cache/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | db.sqlite3
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # Environments
84 | .env
85 | .venv
86 | env/
87 | venv/
88 | ENV/
89 | env.bak/
90 | venv.bak/
91 |
92 | # Spyder project settings
93 | .spyderproject
94 | .spyproject
95 |
96 | # Rope project settings
97 | .ropeproject
98 |
99 | # mkdocs documentation
100 | /site
101 |
102 | # mypy
103 | .mypy_cache/
104 |
105 | *.pth
106 |
107 | *.pt
108 |
109 | train_history\.txt
110 |
--------------------------------------------------------------------------------
/.idea/Relation_Networks-pytorch.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Relation_Networks-pytorch
2 | Relation Networks for Object Detection, implemented in PyTorch.
3 |
4 | ### Progress
5 |
6 | - [x] add relation module
7 | - [x] add NMS using the relation module
8 | - [x] end-to-end training of ResNet-based Faster R-CNN on VOC2007
9 | - [x] evaluate mAP
10 | - [ ] modify learn_nms
11 | - [ ] add OHEM
12 | - [ ] improve performance
13 | - [ ] train and run inference on COCO
14 |
15 | ## Requirements
16 | 1. Python 3.5
17 | 2. The following Python packages:
18 | ```
19 | Cython
20 | cupy
21 | numpy
22 | pytorch
23 | opencv-python
24 | collections
25 | ```
26 | ## Acknowledgement
27 | This work builds on many excellent works, which include:
28 |
29 | - [Yusuke Niitani's ChainerCV](https://github.com/chainer/chainercv) (mainly)
30 | - [Ruotian Luo's pytorch-faster-rcnn](https://github.com/ruotianluo/pytorch-faster-rcnn), which is based on [Xinlei Chen's tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn)
31 | - [simple-faster-rcnn-pytorch by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch)
32 | - [faster-rcnn.pytorch by Jianwei Yang and Jiasen Lu](https://github.com/jwyang/faster-rcnn.pytorch). It mainly refers to [longcw's faster_rcnn_pytorch](https://github.com/longcw/faster_rcnn_pytorch)
33 | - All of the above repositories refer to [py-faster-rcnn by Ross Girshick and Sean Bell](https://github.com/rbgirshick/py-faster-rcnn), either directly or indirectly.
34 | - [yhenon's pytorch-retinanet](https://github.com/yhenon/pytorch-retinanet)
35 | - [msracver's Relation-Networks-for-Object-Detection](https://github.com/msracver/Relation-Networks-for-Object-Detection)
--------------------------------------------------------------------------------
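
A minimal sketch of how the entry points visible in this dump fit together (`config.opt`, `experiments.run_train`, `evaluate.run_evaluate`); the dataset path below is an assumption and must be adapted to your machine:

```python
# Hedged usage sketch, not part of the repo. Paths are placeholders.
from config import opt

opt.voc_data_dir = '/path/to/VOCdevkit/VOC2007/'  # assumed local VOC2007 root

# Training: builds VGGDataset / VGGTestDataset loaders, wraps
# model.SqueezeFRCN(20) in a Trainer, and evaluates mAP every epoch.
from experiments import run_train
run_train(train_verbose=True)

# Evaluation: loads a saved checkpoint (e.g. Weights/resnet101_relation_47.pt)
# and prints VOC07 mAP on the test split.
from evaluate import run_evaluate
run_evaluate()
```
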
/Weights/README.md:
--------------------------------------------------------------------------------
1 | This folder stores weights saved during training.
2 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 |
2 | class Config:
3 | # data
4 | voc_data_dir = '/media/heecheol/새 볼륨/DataSet/VOC2007/'
5 | min_size = 600 # image resize
6 | max_size = 1000 # image resize
7 | num_workers = 8
8 |
9 | # sigma for l1_smooth_loss
10 | rpn_sigma = 3.
11 | roi_sigma = 1.
12 |
13 | # param for optimizer
14 | # 0.0005 in the original paper, but 0.0001 in tf-faster-rcnn
15 | weight_decay = 0.0001
16 | lr = 1e-4
17 |
18 |
19 | # training
20 | trainset = 'trainval'
21 | testset = 'test'
22 | epoch = 15
23 | isLearnNMS = False
24 | use_adam = True # You need to set a very low lr for Adam
25 | # The batch size can still only be one.
26 | batch_size=1
27 |
28 | model_name='squeeze'
29 |
30 | features_dim = 512
31 |
32 |
33 |
34 | opt = Config()
35 |
--------------------------------------------------------------------------------
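
`opt` is a module-level singleton, so every module that does `from config import opt` sees the same object. A small sketch of overriding fields before any dataset or model is built (the path below is an assumption):

```python
from config import opt
from data.dataset import Dataset

# Override defaults early; later imports of `opt` see the same instance.
opt.voc_data_dir = '/data/VOCdevkit/VOC2007/'  # assumed local path
opt.epoch = 20
opt.num_workers = 4

dataset = Dataset(opt)   # picks up the overridden voc_data_dir
print(len(dataset))      # number of trainval images
```
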
/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/data/__init__.py
--------------------------------------------------------------------------------
/data/dataset.py:
--------------------------------------------------------------------------------
1 | import torch as t
2 | from .voc_dataset import VOCBboxDataset
3 | from skimage import transform as sktsf
4 | from torchvision import transforms as tvtsf
5 | from . import util
6 | import numpy as np
7 |
8 |
9 |
10 | def preprocess(img, min_size=600, max_size=1000):
11 | """Preprocess an image for feature extraction.
12 |
13 | The length of the shorter edge is scaled to :obj:`min_size`.
14 | After the scaling, if the length of the longer edge is longer than
15 | :obj:`max_size`, the image is scaled so that the longer edge
16 | fits :obj:`max_size`. Both output dimensions are then rounded
17 | down to the nearest multiple of 32.
18 |
19 | After resizing, the pixel values are scaled to the range
20 | :math:`[0, 1]`.
21 |
22 | Args:
23 | img (~numpy.ndarray): An image. This is in CHW and RGB format.
24 | The range of its value is :math:`[0, 255]`.
25 |
26 | Returns:
27 | ~numpy.ndarray: A preprocessed image.
28 |
29 | """
30 | C, H, W = img.shape
31 | scale1 = min_size / min(H, W)
32 | scale2 = max_size / max(H, W)
33 | scale = min(scale1, scale2)
34 | img = img.astype(np.float32)/255.0
35 | img = sktsf.resize(img, (C, (H * scale)//32 * 32, (W * scale)//32 * 32), mode='reflect')
36 | # both the longer and shorter should be less than
37 | # max_size and min_size
38 |
39 | return img
40 |
41 | def normalize(img):
42 | """
43 | https://github.com/pytorch/vision/issues/223
44 | returns an approximately -1 to 1 RGB image
45 | """
46 | normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
47 | std=[0.229, 0.224, 0.225])
48 | img = normalize(t.from_numpy(img))
49 | return img.numpy()
50 | def VGGTestPreprocess(img):
51 | """Preprocess an image for feature extraction.
52 | The image is not resized here. Its pixel values are scaled to the
53 | range :math:`[0, 1]` and then normalized channel-wise with the
54 | ImageNet mean and standard deviation via :func:`normalize`, so the
55 | returned values lie approximately in :math:`[-1, 1]`.
56 | This is used by :class:`VGGTestDataset`, which keeps test images at
57 | their original resolution.
58 |
59 | Args:
60 | img (~numpy.ndarray): An image. This is in CHW and RGB format.
61 | The range of its value is :math:`[0, 255]`.
62 | Returns:
63 | ~numpy.ndarray: A preprocessed image.
64 | """
65 | img = img / 255.
66 | # both the longer and shorter should be less than
67 | # max_size and min_size
68 |
69 | return normalize(img)
70 | def VGGpreprocess(img, min_size=600, max_size=1000):
71 | """Preprocess an image for feature extraction.
72 | The length of the shorter edge is scaled to :obj:`min_size`.
73 | After the scaling, if the length of the longer edge is longer than
74 | :obj:`max_size`, the image is scaled so that the longer edge
75 | fits :obj:`max_size`.
76 | After resizing, the pixel values are scaled to :math:`[0, 1]` and
77 | the image is normalized channel-wise with the ImageNet mean and
78 | standard deviation via :func:`normalize`.
79 | Args:
80 | img (~numpy.ndarray): An image. This is in CHW and RGB format.
81 | The range of its value is :math:`[0, 255]`.
82 | Returns:
83 | ~numpy.ndarray: A preprocessed image.
84 | """
85 | C, H, W = img.shape
86 | scale1 = min_size / min(H, W)
87 | scale2 = max_size / max(H, W)
88 | scale = min(scale1, scale2)
89 | img = img / 255.
90 | img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect',anti_aliasing=False)
91 | # both the longer and shorter should be less than
92 | # max_size and min_size
93 |
94 | return normalize(img)
95 | class VGGTransform(object):
96 | def __init__(self, min_size=600, max_size=1000):
97 | self.min_size = min_size
98 | self.max_size = max_size
99 |
100 | def __call__(self, in_data):
101 | img, bbox, label = in_data
102 | _, H, W = img.shape
103 | img = VGGpreprocess(img, self.min_size, self.max_size)
104 | _, o_H, o_W = img.shape
105 | scale = o_H / H
106 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
107 |
108 | # horizontally flip
109 | img, params = util.random_flip(
110 | img, x_random=True, return_param=True)
111 | bbox = util.flip_bbox(
112 | bbox, (o_H, o_W), x_flip=params['x_flip'])
113 |
114 | return img, bbox, label, scale
115 |
116 | class Transform(object):
117 |
118 | def __init__(self, min_size, max_size):
119 | self.min_size = min_size
120 | self.max_size = max_size
121 |
122 | def __call__(self, in_data):
123 | img, bbox, label = in_data
124 | _, H, W = img.shape
125 | img = preprocess(img, self.min_size, self.max_size)
126 | _, o_H, o_W = img.shape
127 |
128 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
129 |
130 | # horizontally flip
131 | img, params = util.random_flip(
132 | img, x_random=True, return_param=True)
133 | bbox = util.flip_bbox(
134 | bbox, (o_H, o_W), x_flip=params['x_flip'])
135 |
136 | return img, bbox, label, 1.0
137 |
138 | class VGGDataset:
139 | def __init__(self, opt):
140 | self.opt = opt
141 | self.db = VOCBboxDataset(opt.voc_data_dir, split=opt.trainset)
142 | self.tsf = VGGTransform(opt.min_size, opt.max_size)
143 |
144 | def __getitem__(self, idx):
145 | ori_img, bbox, label, difficult = self.db.get_example(idx)
146 |
147 | img, bbox, label, scale = self.tsf((ori_img, bbox, label))
148 | # TODO: find which stride is negative and fix that instead of copying everything;
149 | # some of the strides of the arrays returned above are negative.
150 | return img.copy(), bbox.copy(), label.copy(), scale
151 |
152 | def __len__(self):
153 | return len(self.db)
154 | class VGGTestDataset:
155 | def __init__(self, opt, use_difficult=True):
156 | self.opt = opt
157 | self.db = VOCBboxDataset(opt.voc_data_dir, split=opt.testset, use_difficult=use_difficult)
158 | def __getitem__(self, idx):
159 | ori_img, bbox, label, difficult = self.db.get_example(idx)
160 | img = VGGTestPreprocess(ori_img)
161 | return img.copy(), img.shape[1:], bbox.copy(), label.copy(), difficult.copy()
162 |
163 | def __len__(self):
164 | return len(self.db)
165 |
166 | class Dataset:
167 | def __init__(self, opt):
168 | self.VOCBboxDataset = VOCBboxDataset(opt.voc_data_dir,split='trainval')
169 | self.Transform = Transform(opt.min_size, opt.max_size)
170 |
171 | def __getitem__(self, idx):
172 | ori_img, bbox, label, difficult = self.VOCBboxDataset.get_example(idx)
173 |
174 | img, bbox, label, scale = self.Transform((ori_img, bbox, label))
175 | # TODO: find which stride is negative and fix that instead of copying everything;
176 | # some of the strides of the arrays returned above are negative.
177 | return img.copy(), bbox.copy(), label.copy(), scale
178 |
179 | def __len__(self):
180 | return len(self.VOCBboxDataset)
181 |
182 | class TestDataset:
183 | def __init__(self, opt, split='test', use_difficult=True):
184 | self.opt = opt
185 | self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult)
186 | self.Transform = Transform(opt.min_size, opt.min_size)
187 | def __getitem__(self, idx):
188 | ori_img, bbox, label, difficult = self.db.get_example(idx)
189 | img, bbox, label, scale = self.Transform((ori_img, bbox, label))
190 | return img.copy(), img.shape[1:], bbox.copy(), label.copy(), difficult.copy()
191 |
192 | def __len__(self):
193 | return len(self.db)
194 |
--------------------------------------------------------------------------------
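
`preprocess` picks one scale so that the shorter edge targets `min_size` while the longer edge never exceeds `max_size`, then rounds both output dimensions down to multiples of 32. A self-contained sketch of that rule (a re-implementation for illustration, not the repo's own function):

```python
def expected_shape(H, W, min_size=600, max_size=1000):
    # Same scale rule as preprocess(): shorter edge -> min_size,
    # capped so that the longer edge stays <= max_size, then the
    # result is snapped down to a multiple of 32.
    scale = min(min_size / min(H, W), max_size / max(H, W))
    return int(H * scale) // 32 * 32, int(W * scale) // 32 * 32

print(expected_shape(375, 500))    # (576, 800): shorter edge drives the scale
print(expected_shape(500, 1500))   # (320, 992): max_size caps the scale
```
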
/data/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from PIL import Image
3 | import random
4 |
5 |
6 | def read_image(path, dtype=np.float32, color=True):
7 | """Read an image from a file.
8 |
9 | This function reads an image from the given file. The image is in CHW format and
10 | the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
11 | order of the channels is RGB.
12 |
13 | Args:
14 | path (str): A path of image file.
15 | dtype: The type of array. The default value is :obj:`~numpy.float32`.
16 | color (bool): This option determines the number of channels.
17 | If :obj:`True`, the number of channels is three. In this case,
18 | the order of the channels is RGB. This is the default behaviour.
19 | If :obj:`False`, this function returns a grayscale image.
20 |
21 | Returns:
22 | ~numpy.ndarray: An image.
23 | """
24 |
25 | f = Image.open(path)
26 | try:
27 | if color:
28 | img = f.convert('RGB')
29 | else:
30 | img = f.convert('P')
31 | img = np.asarray(img, dtype=dtype)
32 | finally:
33 | if hasattr(f, 'close'):
34 | f.close()
35 |
36 | if img.ndim == 2:
37 | # reshape (H, W) -> (1, H, W)
38 | return img[np.newaxis]
39 | else:
40 | # transpose (H, W, C) -> (C, H, W)
41 | return img.transpose((2, 0, 1))
42 |
43 |
44 | def resize_bbox(bbox, in_size, out_size):
45 | """Resize bounding boxes according to image resize.
46 |
47 | The bounding boxes are expected to be packed into a two dimensional
48 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
49 | bounding boxes in the image. The second axis represents attributes of
50 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
51 | where the four attributes are coordinates of the top left and the
52 | bottom right vertices.
53 |
54 | Args:
55 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
56 | :math:`R` is the number of bounding boxes.
57 | in_size (tuple): A tuple of length 2. The height and the width
58 | of the image before resizing.
59 | out_size (tuple): A tuple of length 2. The height and the width
60 | of the image after resizing.
61 |
62 | Returns:
63 | ~numpy.ndarray:
64 | Bounding boxes rescaled according to the given image shapes.
65 |
66 | """
67 | bbox = bbox.copy()
68 | y_scale = float(out_size[0]) / in_size[0]
69 | x_scale = float(out_size[1]) / in_size[1]
70 | bbox[:, 0] = y_scale * bbox[:, 0]
71 | bbox[:, 2] = y_scale * bbox[:, 2]
72 | bbox[:, 1] = x_scale * bbox[:, 1]
73 | bbox[:, 3] = x_scale * bbox[:, 3]
74 | return bbox
75 |
76 |
77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False):
78 | """Flip bounding boxes accordingly.
79 |
80 | The bounding boxes are expected to be packed into a two dimensional
81 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
82 | bounding boxes in the image. The second axis represents attributes of
83 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
84 | where the four attributes are coordinates of the top left and the
85 | bottom right vertices.
86 |
87 | Args:
88 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
89 | :math:`R` is the number of bounding boxes.
90 | size (tuple): A tuple of length 2. The height and the width
91 | of the image.
92 | y_flip (bool): Flip bounding box according to a vertical flip of
93 | an image.
94 | x_flip (bool): Flip bounding box according to a horizontal flip of
95 | an image.
96 |
97 | Returns:
98 | ~numpy.ndarray:
99 | Bounding boxes flipped according to the given flips.
100 |
101 | """
102 | H, W = size
103 | bbox = bbox.copy()
104 | if y_flip:
105 | y_max = H - bbox[:, 0]
106 | y_min = H - bbox[:, 2]
107 | bbox[:, 0] = y_min
108 | bbox[:, 2] = y_max
109 | if x_flip:
110 | x_max = W - bbox[:, 1]
111 | x_min = W - bbox[:, 3]
112 | bbox[:, 1] = x_min
113 | bbox[:, 3] = x_max
114 | return bbox
115 |
116 |
117 | def crop_bbox(
118 | bbox, y_slice=None, x_slice=None,
119 | allow_outside_center=True, return_param=False):
120 | """Translate bounding boxes to fit within the cropped area of an image.
121 |
122 | This method is mainly used together with image cropping.
123 | This method translates the coordinates of bounding boxes like
124 | :func:`data.util.translate_bbox`. In addition,
125 | this function truncates the bounding boxes to fit within the cropped area.
126 | If a bounding box does not overlap with the cropped area,
127 | this bounding box will be removed.
128 |
129 | The bounding boxes are expected to be packed into a two dimensional
130 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
131 | bounding boxes in the image. The second axis represents attributes of
132 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
133 | where the four attributes are coordinates of the top left and the
134 | bottom right vertices.
135 |
136 | Args:
137 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
138 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
139 | y_slice (slice): The slice of y axis.
140 | x_slice (slice): The slice of x axis.
141 | allow_outside_center (bool): If this argument is :obj:`False`,
142 | bounding boxes whose centers are outside of the cropped area
143 | are removed. The default value is :obj:`True`.
144 | return_param (bool): If :obj:`True`, this function returns
145 | indices of kept bounding boxes.
146 |
147 | Returns:
148 | ~numpy.ndarray or (~numpy.ndarray, dict):
149 |
150 | If :obj:`return_param = False`, returns an array :obj:`bbox`.
151 |
152 | If :obj:`return_param = True`,
153 | returns a tuple whose elements are :obj:`bbox, param`.
154 | :obj:`param` is a dictionary of intermediate parameters whose
155 | contents are listed below with key, value-type and the description
156 | of the value.
157 |
158 | * **index** (*numpy.ndarray*): An array holding indices of used \
159 | bounding boxes.
160 |
161 | """
162 |
163 | t, b = _slice_to_bounds(y_slice)
164 | l, r = _slice_to_bounds(x_slice)
165 | crop_bb = np.array((t, l, b, r))
166 |
167 | if allow_outside_center:
168 | mask = np.ones(bbox.shape[0], dtype=bool)
169 | else:
170 | center = (bbox[:, :2] + bbox[:, 2:]) / 2
171 | mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
172 | .all(axis=1)
173 |
174 | bbox = bbox.copy()
175 | bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
176 | bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
177 | bbox[:, :2] -= crop_bb[:2]
178 | bbox[:, 2:] -= crop_bb[:2]
179 |
180 | mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
181 | bbox = bbox[mask]
182 |
183 | if return_param:
184 | return bbox, {'index': np.flatnonzero(mask)}
185 | else:
186 | return bbox
187 |
188 |
189 | def _slice_to_bounds(slice_):
190 | if slice_ is None:
191 | return 0, np.inf
192 |
193 | if slice_.start is None:
194 | l = 0
195 | else:
196 | l = slice_.start
197 |
198 | if slice_.stop is None:
199 | u = np.inf
200 | else:
201 | u = slice_.stop
202 |
203 | return l, u
204 |
205 |
206 | def translate_bbox(bbox, y_offset=0, x_offset=0):
207 | """Translate bounding boxes.
208 |
209 | This method is mainly used together with image transforms, such as padding
210 | and cropping, which translates the left top point of the image from
211 | coordinate :math:`(0, 0)` to coordinate
212 | :math:`(y, x) = (y_{offset}, x_{offset})`.
213 |
214 | The bounding boxes are expected to be packed into a two dimensional
215 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
216 | bounding boxes in the image. The second axis represents attributes of
217 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
218 | where the four attributes are coordinates of the top left and the
219 | bottom right vertices.
220 |
221 | Args:
222 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
223 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
224 | y_offset (int or float): The offset along y axis.
225 | x_offset (int or float): The offset along x axis.
226 |
227 | Returns:
228 | ~numpy.ndarray:
229 | Bounding boxes translated according to the given offsets.
230 |
231 | """
232 |
233 | out_bbox = bbox.copy()
234 | out_bbox[:, :2] += (y_offset, x_offset)
235 | out_bbox[:, 2:] += (y_offset, x_offset)
236 |
237 | return out_bbox
238 |
239 |
240 | def random_flip(img, y_random=False, x_random=False,
241 | return_param=False, copy=False):
242 | """Randomly flip an image in vertical or horizontal direction.
243 |
244 | Args:
245 | img (~numpy.ndarray): An array that gets flipped. This is in
246 | CHW format.
247 | y_random (bool): Randomly flip in vertical direction.
248 | x_random (bool): Randomly flip in horizontal direction.
249 | return_param (bool): Returns information of flip.
250 | copy (bool): If False, a view of :obj:`img` will be returned.
251 |
252 | Returns:
253 | ~numpy.ndarray or (~numpy.ndarray, dict):
254 |
255 | If :obj:`return_param = False`,
256 | returns an array :obj:`out_img` that is the result of flipping.
257 |
258 | If :obj:`return_param = True`,
259 | returns a tuple whose elements are :obj:`out_img, param`.
260 | :obj:`param` is a dictionary of intermediate parameters whose
261 | contents are listed below with key, value-type and the description
262 | of the value.
263 |
264 | * **y_flip** (*bool*): Whether the image was flipped in the\
265 | vertical direction or not.
266 | * **x_flip** (*bool*): Whether the image was flipped in the\
267 | horizontal direction or not.
268 |
269 | """
270 | y_flip, x_flip = False, False
271 | if y_random:
272 | y_flip = random.choice([True, False])
273 | if x_random:
274 | x_flip = random.choice([True, False])
275 |
276 | if y_flip:
277 | img = img[:, ::-1, :]
278 | if x_flip:
279 | img = img[:, :, ::-1]
280 |
281 | if copy:
282 | img = img.copy()
283 |
284 | if return_param:
285 | return img, {'y_flip': y_flip, 'x_flip': x_flip}
286 | else:
287 | return img
288 |
--------------------------------------------------------------------------------
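
These helpers are composed in exactly this order by `Transform.__call__` in `data/dataset.py`; a minimal synthetic example (one box, a dummy CHW image):

```python
import numpy as np
from data.util import resize_bbox, flip_bbox, random_flip

# One box in (ymin, xmin, ymax, xmax) order, as used throughout the repo.
bbox = np.array([[100., 200., 300., 400.]], dtype=np.float32)

# Suppose the image was resized from 600x800 to 300x400.
bbox = resize_bbox(bbox, in_size=(600, 800), out_size=(300, 400))
print(bbox)          # [[ 50. 100. 150. 200.]]

# Randomly flip a dummy CHW image horizontally and mirror the box to match.
img = np.zeros((3, 300, 400), dtype=np.float32)
img, params = random_flip(img, x_random=True, return_param=True)
bbox = flip_bbox(bbox, size=(300, 400), x_flip=params['x_flip'])
print(params['x_flip'], bbox)
```
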
/data/voc_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import xml.etree.ElementTree as ET
3 |
4 | import numpy as np
5 | import skimage.io
6 | import skimage.transform
7 | import skimage.color
8 | import skimage
9 |
10 | from .util import read_image
11 |
12 |
13 | class VOCBboxDataset:
14 | """Bounding box dataset for PASCAL `VOC`_.
15 |
16 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/
17 |
18 | The index corresponds to each image.
19 |
20 | When queried by an index, if :obj:`return_difficult == False`,
21 | this dataset returns a corresponding
22 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
23 | This is the default behaviour.
24 | If :obj:`return_difficult == True`, this dataset returns corresponding
25 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
26 | that indicates whether bounding boxes are labeled as difficult or not.
27 |
28 | The bounding boxes are packed into a two dimensional tensor of shape
29 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
30 | the image. The second axis represents attributes of the bounding box.
31 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
32 | four attributes are coordinates of the top left and the bottom right
33 | vertices.
34 |
35 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
36 | :math:`R` is the number of bounding boxes in the image.
37 | The class name of the label :math:`l` is :math:`l` th element of
38 | :obj:`VOC_BBOX_LABEL_NAMES`.
39 |
40 | The array :obj:`difficult` is a one dimensional boolean array of shape
41 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
42 | If :obj:`use_difficult` is :obj:`False`, this array is
43 | a boolean array with all :obj:`False`.
44 |
45 | The type of the image, the bounding boxes and the labels are as follows.
46 |
47 | * :obj:`img.dtype == numpy.float32`
48 | * :obj:`bbox.dtype == numpy.float32`
49 | * :obj:`label.dtype == numpy.int32`
50 | * :obj:`difficult.dtype == numpy.bool`
51 |
52 | Args:
53 | data_dir (string): Path to the root of the training data.
54 | i.e. "/data/image/voc/VOCdevkit/VOC2007/"
55 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the
56 | dataset. :obj:`test` split is only available for
57 | 2007 dataset.
58 | year ({'2007', '2012'}): Use a dataset prepared for a challenge
59 | held in :obj:`year`.
60 | use_difficult (bool): If :obj:`True`, use images that are labeled as
61 | difficult in the original annotation.
62 | return_difficult (bool): If :obj:`True`, this dataset returns
63 | a boolean array
64 | that indicates whether bounding boxes are labeled as difficult
65 | or not. The default value is :obj:`False`.
66 |
67 | """
68 |
69 | def __init__(self, data_dir, split='train',
70 | use_difficult=False, return_difficult=False,
71 | ):
72 |
73 | # if split not in ['train', 'trainval', 'val']:
74 | # if not (split == 'test' and year == '2007'):
75 | # warnings.warn(
76 | # 'please pick split from \'train\', \'trainval\', \'val\''
77 | # 'for 2012 dataset. For 2007 dataset, you can pick \'test\''
78 | # ' in addition to the above mentioned splits.'
79 | # )
80 | id_list_file = os.path.join(
81 | data_dir, 'ImageSets/Main/{0}.txt'.format(split))
82 |
83 | self.ids = [id_.strip() for id_ in open(id_list_file)]
84 | self.data_dir = data_dir
85 | self.use_difficult = use_difficult
86 | self.return_difficult = return_difficult
87 | self.label_names = VOC_BBOX_LABEL_NAMES
88 |
89 | def __len__(self):
90 | return len(self.ids)
91 |
92 | def get_example(self, i):
93 | """Returns the i-th example.
94 |
95 | Returns a color image and bounding boxes. The image is in CHW format.
96 | The returned image is RGB.
97 |
98 | Args:
99 | i (int): The index of the example.
100 |
101 | Returns:
102 | tuple of an image and bounding boxes
103 |
104 | """
105 | id_ = self.ids[i]
106 | anno = ET.parse(
107 | os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
108 | bbox = list()
109 | label = list()
110 | difficult = list()
111 | for obj in anno.findall('object'):
112 | # when not using the difficult split and the object is
113 | # difficult, skip it.
114 | if not self.use_difficult and int(obj.find('difficult').text) == 1:
115 | continue
116 |
117 | difficult.append(int(obj.find('difficult').text))
118 | bndbox_anno = obj.find('bndbox')
119 | # subtract 1 to make pixel indexes 0-based
120 | bbox.append([
121 | int(bndbox_anno.find(tag).text) - 1
122 | for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
123 | name = obj.find('name').text.lower().strip()
124 | label.append(VOC_BBOX_LABEL_NAMES.index(name))
125 | bbox = np.stack(bbox).astype(np.float32)
126 | label = np.stack(label).astype(np.int32)
127 | # When `use_difficult==False`, all elements in `difficult` are False.
128 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch doesn't support np.bool
129 |
130 | # Load an image
131 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
132 | img = skimage.io.imread(img_file)
133 | if len(img.shape) == 2:
134 | img = skimage.color.gray2rgb(img)
135 | img = img.transpose((2, 0, 1))
136 |
137 | #img = read_image(img_file, color=True)
138 |
139 | # if self.return_difficult:
140 | # return img, bbox, label, difficult
141 | return img, bbox, label, difficult
142 |
143 | __getitem__ = get_example
144 |
145 |
146 | VOC_BBOX_LABEL_NAMES = (
147 | 'aeroplane',#0
148 | 'bicycle',#1
149 | 'bird',#2
150 | 'boat',#3
151 | 'bottle',#4
152 | 'bus',#5
153 | 'car',#6
154 | 'cat',#7
155 | 'chair',#8
156 | 'cow',#9
157 | 'diningtable',#10
158 | 'dog',#11
159 | 'horse',#12
160 | 'motorbike',#13
161 | 'person',#14
162 | 'pottedplant',
163 | 'sheep',
164 | 'sofa',
165 | 'train',
166 | 'tvmonitor')
167 |
--------------------------------------------------------------------------------
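
A short sketch of reading one example directly from the dataset class above (the VOC path is an assumption):

```python
from data.voc_dataset import VOCBboxDataset, VOC_BBOX_LABEL_NAMES

dataset = VOCBboxDataset('/data/VOCdevkit/VOC2007/', split='trainval')

img, bbox, label, difficult = dataset[0]   # __getitem__ aliases get_example
print(img.shape)                           # (3, H, W), RGB, values in [0, 255]
print(bbox.dtype, bbox.shape)              # float32, (R, 4) in (ymin, xmin, ymax, xmax)
print([VOC_BBOX_LABEL_NAMES[l] for l in label])
```
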
/demo/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/demo/demo.jpg
--------------------------------------------------------------------------------
/demo/demo_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/demo/demo_output.png
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | from torch.utils import data as data_
2 | import model
3 |
4 | import torch
5 | from lib.eval_tool import eval_detection_voc
6 | from data.dataset import TestDataset
7 | from config import opt
8 | import cv2,time
9 | import numpy as np
10 | from lib.array_tool import tonumpy
11 |
12 | # def eval(dataloader, model, test_num=10000):
13 | # pred_bboxes, pred_labels, pred_scores = list(), list(), list()
14 | # gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
15 | # for ii, data in enumerate(dataloader):
16 | # (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) = data
17 | #
18 | # nms_scores, sorted_labels, sorted_cls_bboxes = model.predict(
19 | # imgs.cuda().float())
20 | # if not ( nms_scores is None):
21 | # test = np.reshape(np.argwhere(nms_scores>0.7),-1)
22 | # nms_scores = nms_scores[test]
23 | # sorted_labels = sorted_labels[test]
24 | # sorted_cls_bboxes = sorted_cls_bboxes[test]
25 | #
26 | # pred_bboxes.append(np.reshape(tonumpy(sorted_cls_bboxes),(-1,4)).copy())
27 | # pred_labels.append(np.reshape(tonumpy(sorted_labels),(-1)).copy())
28 | # pred_scores.append(np.reshape(tonumpy(nms_scores),(-1)).copy())
29 | # else:
30 | # pred_bboxes.append(np.array([]))
31 | # pred_labels.append(np.array([]))
32 | # pred_scores.append(np.array([]))
33 | # gt_bboxes += list(gt_bboxes_.numpy())
34 | # gt_labels += list(gt_labels_.numpy())
35 | # gt_difficults += list(gt_difficults_.numpy())
36 | # if ii == test_num: break
37 | # result = eval_detection_voc(
38 | # pred_bboxes, pred_labels, pred_scores,
39 | # gt_bboxes, gt_labels, gt_difficults,
40 | # use_07_metric=True)
41 | # return result
42 |
43 |
44 | def eval(dataloader, model, test_num=10000):
45 | pred_bboxes, pred_labels, pred_scores = list(), list(), list()
46 | gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
47 | for ii, data in enumerate(dataloader):
48 | (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) = data
49 | sizes = [sizes[0][0], sizes[1][0]]
50 | pred_bboxes_, pred_labels_, pred_scores_ = model.predict(imgs, [sizes])
51 | gt_bboxes += list(gt_bboxes_.numpy())
52 | gt_labels += list(gt_labels_.numpy())
53 | gt_difficults += list(gt_difficults_.numpy())
54 | pred_bboxes += pred_bboxes_
55 | pred_labels += pred_labels_
56 | pred_scores += pred_scores_
57 | if ii == test_num: break
58 |
59 | result = eval_detection_voc(
60 | pred_bboxes, pred_labels, pred_scores,
61 | gt_bboxes, gt_labels, gt_difficults,
62 | use_07_metric=True)
63 | return result
64 |
65 | def run_evaluate():
66 | testset = TestDataset(opt)
67 | test_dataloader = data_.DataLoader(testset,
68 | batch_size=opt.batch_size,
69 | num_workers=opt.num_workers,
70 | shuffle=False#, \
71 | #pin_memory=True
72 | )
73 |
74 | resnet = model.resnet101(20,True)
75 | resnet = torch.nn.DataParallel(resnet).cuda()
76 |
77 | resnet.load_state_dict(torch.load('Weights/resnet101_relation_47.pt'))
78 | resnet.module.use_preset(isTraining=False,preset='evaluate')
79 | resnet.eval()
80 |
81 | for child in resnet.module.children():
82 | for param in child.parameters():
83 | param.requires_grad = False
84 |
85 | print(eval(test_dataloader,resnet,10000))
86 |
87 | if __name__ == "__main__":
88 | run_evaluate()
--------------------------------------------------------------------------------
/experiments.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import numpy as np
4 | from torch.utils import data as data_
5 | import model
6 |
7 | from trainer import Trainer
8 | import torch
9 | import torch.optim as optim
10 | from data.dataset import VGGDataset, VGGTestDataset
11 | from config import opt
12 | import cv2,time
13 |
14 | def run_train(train_verbose=False):
15 | dataset = VGGDataset(opt)
16 | dataloader = data_.DataLoader(dataset, \
17 | batch_size=opt.batch_size, \
18 | shuffle=True, \
19 | # pin_memory=True,
20 | num_workers=opt.num_workers)
21 |
22 | testset = VGGTestDataset(opt)
23 | test_dataloader = data_.DataLoader(testset,
24 | batch_size=opt.batch_size,
25 | num_workers=opt.num_workers,
26 | shuffle=False,
27 | pin_memory=True
28 | )
29 |
30 | my_model = model.SqueezeFRCN(20).cuda()
31 |
32 | optimizer = my_model.get_optimizer()
33 |
34 | loss_hist = collections.deque(maxlen=500)
35 | epoch_loss_hist = []
36 | my_trainer = Trainer(my_model,optimizer,model_name=opt.model_name)
37 | #my_trainer.model_load(3)
38 |
39 | freeze_num = -1 #pretrain model
40 | best_map = 0
41 | best_map_epoch_num = -1
42 |
43 | for epoch_num in range(opt.epoch):
44 | my_trainer.train_mode(freeze_num)
45 | train_start_time = time.time()
46 | train_epoch_loss = []
47 | start = time.time()
48 | for iter_num, data in enumerate(dataloader):
49 | curr_loss = my_trainer.train_step(data)
50 | loss_hist.append(float(curr_loss))
51 | train_epoch_loss.append(float(curr_loss))
52 |
53 | if (train_verbose):
54 | print('Epoch: {} | Iteration: {} | loss: {:1.5f} | Running loss: {:1.5f} | Iter time: {:1.5f} | Train'
55 | ' time: {:1.5f}'.format(epoch_num, iter_num, float(curr_loss), np.mean(loss_hist),
56 | time.time()-start, time.time()-train_start_time))
57 | start = time.time()
58 |
59 | del curr_loss
60 | print('train epoch time :', time.time() - train_start_time)
61 | print('Epoch: {} | epoch train loss: {:1.5f}'.format(
62 | epoch_num, np.mean(train_epoch_loss)))
63 |
64 | vali_start_time = time.time()
65 |
66 |
67 | vali_eval_result = my_trainer.run_eval(test_dataloader)
68 | print(vali_eval_result)
69 | vali_map = vali_eval_result['map']
70 | print('vali epoch time :', time.time() - vali_start_time)
71 |
72 |
73 | if(best_map < vali_map):
74 | best_map = vali_map
75 | best_map_epoch_num = epoch_num
76 | my_trainer.model_save(epoch_num)
77 | if (epoch_num==9):
78 | my_trainer.model_load(best_map_epoch_num)
79 | my_trainer.reduce_lr(factor=0.1, verbose=True)
80 |
81 | print('best epoch num', best_map_epoch_num)
82 | print('----------------------------------------')
83 |
84 | print(epoch_loss_hist)
85 |
86 |
87 | if __name__ == "__main__":
88 | run_train(train_verbose = True)
89 | #my_model = model.SqueezeFRCN(20)
--------------------------------------------------------------------------------
/lib/array_tool.py:
--------------------------------------------------------------------------------
1 | """
2 | tools to convert specified type
3 | """
4 | import torch as t
5 | import numpy as np
6 |
7 | def tonumpy(data):
8 | if isinstance(data, np.ndarray):
9 | return data
10 | if isinstance(data, t._C._TensorBase):
11 | return data.cpu().numpy()
12 | if isinstance(data, t.autograd.Variable):
13 | return tonumpy(data.data)
14 |
15 |
16 | def totensor(data, cuda=True):
17 | if isinstance(data, np.ndarray):
18 | tensor = t.from_numpy(data)
19 | if isinstance(data, t._C._TensorBase):
20 | tensor = data
21 | if isinstance(data, t.autograd.Variable):
22 | tensor = data.data
23 | if cuda:
24 | tensor = tensor.cuda()
25 | return tensor
26 |
27 |
28 | def tovariable(data):
29 | if isinstance(data, np.ndarray):
30 | return tovariable(totensor(data))
31 | if isinstance(data, t._C._TensorBase):
32 | return t.autograd.Variable(data)
33 | if isinstance(data, t.autograd.Variable):
34 | return data
35 | else:
36 | raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data))
37 |
38 |
39 | def scalar(data):
40 | if isinstance(data, np.ndarray):
41 | return data.reshape(1)[0]
42 | if isinstance(data, t._C._TensorBase):
43 | return data.view(1)[0]
44 | if isinstance(data, t.autograd.Variable):
45 | return data.data.view(1)[0]
46 |
--------------------------------------------------------------------------------
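
A quick round-trip sketch, assuming the PyTorch 0.3/0.4-era API this module targets (`t._C._TensorBase`, `autograd.Variable`) is available:

```python
import numpy as np
from lib.array_tool import tonumpy, totensor, scalar

x = np.arange(6, dtype=np.float32).reshape(2, 3)
t_x = totensor(x, cuda=False)   # numpy -> torch tensor (set cuda=True on GPU)
back = tonumpy(t_x)             # torch tensor -> numpy
print(np.allclose(x, back))     # True

loss = totensor(np.array([0.25], dtype=np.float32), cuda=False)
print(scalar(loss))             # 0.25 as a plain Python scalar
```
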
/lib/bbox_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import six
4 |
5 |
6 | def loc2bbox(src_bbox, loc):
7 | """Decode bounding boxes from bounding box offsets and scales.
8 |
9 | Given bounding box offsets and scales computed by
10 | :meth:`bbox2loc`, this function decodes the representation to
11 | coordinates in 2D image coordinates.
12 |
13 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding
14 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`,
15 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x`
16 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated
17 | by the following formulas.
18 |
19 | * :math:`\\hat{g}_y = p_h t_y + p_y`
20 | * :math:`\\hat{g}_x = p_w t_x + p_x`
21 | * :math:`\\hat{g}_h = p_h \\exp(t_h)`
22 | * :math:`\\hat{g}_w = p_w \\exp(t_w)`
23 |
24 | The decoding formulas are used in works such as R-CNN [#]_.
25 |
26 | The output is the same type as the inputs.
27 |
28 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
29 | Rich feature hierarchies for accurate object detection and semantic \
30 | segmentation. CVPR 2014.
31 |
32 | Args:
33 | src_bbox (array): A coordinates of bounding boxes.
34 | Its shape is :math:`(R, 4)`. These coordinates are
35 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
36 | loc (array): An array with offsets and scales.
37 | The shapes of :obj:`src_bbox` and :obj:`loc` should be same.
38 | This contains values :math:`t_y, t_x, t_h, t_w`.
39 |
40 | Returns:
41 | array:
42 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \
43 | The second axis contains four values \
44 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin},
45 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`.
46 |
47 | """
48 |
49 | if src_bbox.shape[0] == 0:
50 | return np.zeros((0, 4), dtype=loc.dtype)
51 |
52 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False)
53 |
54 | src_height = src_bbox[:, 2] - src_bbox[:, 0]
55 | src_width = src_bbox[:, 3] - src_bbox[:, 1]
56 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
57 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width
58 | dy = loc[:, 0::4]
59 | dx = loc[:, 1::4]
60 | dh = loc[:, 2::4]
61 | dw = loc[:, 3::4]
62 | ctr_y = dy * src_height[:, np.newaxis] + src_ctr_y[:, np.newaxis]
63 | ctr_x = dx * src_width[:, np.newaxis] + src_ctr_x[:, np.newaxis]
64 | h = np.exp(dh) * src_height[:, np.newaxis]
65 | w = np.exp(dw) * src_width[:, np.newaxis]
66 |
67 | dst_bbox = np.zeros(loc.shape, dtype=loc.dtype)
68 | dst_bbox[:, 0::4] = ctr_y - 0.5 * h
69 | dst_bbox[:, 1::4] = ctr_x - 0.5 * w
70 | dst_bbox[:, 2::4] = ctr_y + 0.5 * h
71 | dst_bbox[:, 3::4] = ctr_x + 0.5 * w
72 |
73 | return dst_bbox
74 |
75 |
76 | def bbox2loc(src_bbox, dst_bbox):
77 | """Encodes the source and the destination bounding boxes to "loc".
78 |
79 | Given bounding boxes, this function computes offsets and scales
80 | to match the source bounding boxes to the target bounding boxes.
81 | Mathematically, given a bounding box whose center is
82 | :math:`(y, x) = p_y, p_x` and
83 | size :math:`p_h, p_w` and the target bounding box whose center is
84 | :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
85 | :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.
86 |
87 | * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
88 | * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
89 | * :math:`t_h = \\log(\\frac{g_h} {p_h})`
90 | * :math:`t_w = \\log(\\frac{g_w} {p_w})`
91 |
92 | The output is the same type as the inputs.
93 | The encoding formulas are used in works such as R-CNN [#]_.
94 |
95 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
96 | Rich feature hierarchies for accurate object detection and semantic \
97 | segmentation. CVPR 2014.
98 |
99 | Args:
100 | src_bbox (array): An image coordinate array whose shape is
101 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
102 | These coordinates are
103 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
104 | dst_bbox (array): An image coordinate array whose shape is
105 | :math:`(R, 4)`.
106 | These coordinates are
107 | :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`.
108 |
109 | Returns:
110 | array:
111 | Bounding box offsets and scales from :obj:`src_bbox` \
112 | to :obj:`dst_bbox`. \
113 | This has shape :math:`(R, 4)`.
114 | The second axis contains four values :math:`t_y, t_x, t_h, t_w`.
115 |
116 | """
117 |
118 | height = src_bbox[:, 2] - src_bbox[:, 0]
119 | width = src_bbox[:, 3] - src_bbox[:, 1]
120 | ctr_y = src_bbox[:, 0] + 0.5 * height
121 | ctr_x = src_bbox[:, 1] + 0.5 * width
122 |
123 | base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
124 | base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
125 | base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
126 | base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width
127 |
128 | eps = np.finfo(height.dtype).eps
129 | height = np.maximum(height, eps)
130 | width = np.maximum(width, eps)
131 |
132 | dy = (base_ctr_y - ctr_y) / height
133 | dx = (base_ctr_x - ctr_x) / width
134 | dh = np.log(base_height / height)
135 | dw = np.log(base_width / width)
136 |
137 | loc = np.vstack((dy, dx, dh, dw)).transpose()
138 | return loc
139 |
140 |
141 | def bbox_iou(bbox_a, bbox_b):
142 | """Calculate the Intersection of Unions (IoUs) between bounding boxes.
143 |
144 | IoU is calculated as a ratio of area of the intersection
145 | and area of the union.
146 |
147 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
148 | inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be
149 | same type.
150 | The output is the same type as the inputs.
151 |
152 | Args:
153 | bbox_a (array): An array whose shape is :math:`(N, 4)`.
154 | :math:`N` is the number of bounding boxes.
155 | The dtype should be :obj:`numpy.float32`.
156 | bbox_b (array): An array similar to :obj:`bbox_a`,
157 | whose shape is :math:`(K, 4)`.
158 | The dtype should be :obj:`numpy.float32`.
159 |
160 | Returns:
161 | array:
162 | An array whose shape is :math:`(N, K)`. \
163 | An element at index :math:`(n, k)` contains IoUs between \
164 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \
165 | box in :obj:`bbox_b`.
166 |
167 | """
168 | if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
169 | raise IndexError
170 |
171 | # top left
172 | tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
173 | # bottom right
174 | br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
175 |
176 | area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)
177 | area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
178 | area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
179 | return area_i / (area_a[:, None] + area_b - area_i)
180 |
181 |
182 | def __test():
183 | pass
184 |
185 |
186 | if __name__ == '__main__':
187 | __test()
188 |
189 |
190 | def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
191 | anchor_scales=[8, 16, 32]):
192 | """Generate anchor base windows by enumerating aspect ratio and scales.
193 |
194 | Generate anchors that are scaled and modified to the given aspect ratios.
195 | Area of a scaled anchor is preserved when modifying to the given aspect
196 | ratio.
197 |
198 | :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this
199 | function.
200 | The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor
201 | generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`.
202 |
203 | For example, if the scale is :math:`8` and the ratio is :math:`0.25`,
204 | the width and the height of the base window will be stretched by :math:`8`.
205 | For modifying the anchor to the given aspect ratio,
206 | the height is halved and the width is doubled.
207 |
208 | Args:
209 | base_size (number): The width and the height of the reference window.
210 | ratios (list of floats): This is ratios of width to height of
211 | the anchors.
212 | anchor_scales (list of numbers): This is areas of anchors.
213 | Those areas will be the product of the square of an element in
214 | :obj:`anchor_scales` and the original area of the reference
215 | window.
216 |
217 | Returns:
218 | ~numpy.ndarray:
219 | An array of shape :math:`(R, 4)`.
220 | Each element is a set of coordinates of a bounding box.
221 | The second axis corresponds to
222 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box.
223 |
224 | """
225 | py = base_size / 2.
226 | px = base_size / 2.
227 |
228 | anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
229 | dtype=np.float32)
230 | for i in six.moves.range(len(ratios)):
231 | for j in six.moves.range(len(anchor_scales)):
232 | h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
233 | w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
234 |
235 | index = i * len(anchor_scales) + j
236 | anchor_base[index, 0] = py - h / 2.
237 | anchor_base[index, 1] = px - w / 2.
238 | anchor_base[index, 2] = py + h / 2.
239 | anchor_base[index, 3] = px + w / 2.
240 | return anchor_base
241 |
--------------------------------------------------------------------------------
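
A sanity-check sketch of the encode/decode pair and the IoU helper defined above (synthetic boxes in ymin/xmin/ymax/xmax order):

```python
import numpy as np
from lib.bbox_tools import bbox2loc, loc2bbox, bbox_iou, generate_anchor_base

src = np.array([[10., 10., 110., 210.]], dtype=np.float32)   # anchor / RoI
dst = np.array([[20., 30., 120., 190.]], dtype=np.float32)   # ground truth

loc = bbox2loc(src, dst)       # (t_y, t_x, t_h, t_w) regression targets
rec = loc2bbox(src, loc)       # decoding recovers dst up to float error
print(loc)
print(np.allclose(rec, dst))   # True

print(bbox_iou(src, dst))      # (1, 1) matrix of IoUs

# 9 base anchors: 3 ratios x 3 scales around a 16x16 reference window.
print(generate_anchor_base().shape)   # (9, 4)
```
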
/lib/creator_tool.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cupy as cp
3 |
4 | from lib.bbox_tools import bbox2loc, bbox_iou, loc2bbox
5 | from lib.nms import non_maximum_suppression
6 |
7 |
8 | class ProposalTargetCreator(object):
9 | """Assign ground truth bounding boxes to given RoIs.
10 |
11 | The :meth:`__call__` of this class generates training targets
12 | for each object proposal.
13 | This is used to train Faster RCNN [#]_.
14 |
15 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
16 | Faster R-CNN: Towards Real-Time Object Detection with \
17 | Region Proposal Networks. NIPS 2015.
18 |
19 | Args:
20 | n_sample (int): The number of sampled regions.
21 | pos_ratio (float): Fraction of regions that is labeled as a
22 | foreground.
23 | pos_iou_thresh (float): IoU threshold for a RoI to be considered as a
24 | foreground.
25 | neg_iou_thresh_hi (float): RoI is considered to be the background
26 | if IoU is in
27 | [:obj:`neg_iou_thresh_lo`, :obj:`neg_iou_thresh_hi`).
28 | neg_iou_thresh_lo (float): See above.
29 |
30 | """
31 |
32 | def __init__(self,
33 | n_sample=128,
34 | pos_ratio=0.25, pos_iou_thresh=0.5,
35 | neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
36 | ):
37 | self.n_sample = n_sample
38 | self.pos_ratio = pos_ratio
39 | self.pos_iou_thresh = pos_iou_thresh
40 | self.neg_iou_thresh_hi = neg_iou_thresh_hi
41 | self.neg_iou_thresh_lo = neg_iou_thresh_lo # NOTE: the default value in py-faster-rcnn is 0.1
42 |
43 | def __call__(self, roi, bbox, label,
44 | loc_normalize_mean=(0., 0., 0., 0.),
45 | loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
46 | """Assigns ground truth to sampled proposals.
47 |
48 | This function samples total of :obj:`self.n_sample` RoIs
49 | from the combination of :obj:`roi` and :obj:`bbox`.
50 | The RoIs are assigned with the ground truth class labels as well as
51 | bounding box offsets and scales to match the ground truth bounding
52 | boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
53 | sampled as foregrounds.
54 |
55 | Offsets and scales of bounding boxes are calculated using
56 | :func:`model.utils.bbox_tools.bbox2loc`.
57 | Also, the types of the input and output arrays are the same.
58 |
59 | Here are notations.
60 |
61 | * :math:`S` is the total number of sampled RoIs, which equals \
62 | :obj:`self.n_sample`.
63 | * :math:`L` is number of object classes possibly including the \
64 | background.
65 |
66 | Args:
67 | roi (array): Region of Interests (RoIs) from which we sample.
68 | Its shape is :math:`(R, 4)`
69 | bbox (array): The coordinates of ground truth bounding boxes.
70 | Its shape is :math:`(R', 4)`.
71 | label (array): Ground truth bounding box labels. Its shape
72 | is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
73 | :math:`L` is the number of foreground classes.
74 | loc_normalize_mean (tuple of four floats): Mean values to normalize
75 | coordinates of bounding boxes.
76 | loc_normalize_std (tuple of four floats): Standard deviation of
77 | the coordinates of bounding boxes.
78 |
79 | Returns:
80 | (array, array, array):
81 |
82 | * **sample_roi**: Regions of interests that are sampled. \
83 | Its shape is :math:`(S, 4)`.
84 | * **gt_roi_loc**: Offsets and scales to match \
85 | the sampled RoIs to the ground truth bounding boxes. \
86 | Its shape is :math:`(S, 4)`.
87 | * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \
88 | :math:`(S,)`. Its range is :math:`[0, L]`. The label with \
89 | value 0 is the background.
90 |
91 | """
92 | n_bbox, _ = bbox.shape
93 |
94 | roi = np.concatenate((roi, bbox), axis=0)
95 |
96 | pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
97 | iou = bbox_iou(roi, bbox)
98 | gt_assignment = iou.argmax(axis=1)
99 | max_iou = iou.max(axis=1)
100 | # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
101 | # The label with value 0 is the background.
102 | gt_roi_label = label[gt_assignment] + 1
103 |
104 |
105 | # Select foreground RoIs as those with >= pos_iou_thresh IoU.
106 | pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
107 | pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
108 | if pos_index.size > 0:
109 | pos_index = np.random.choice(
110 | pos_index, size=pos_roi_per_this_image, replace=False)
111 |
112 | # Select background RoIs as those within
113 | # [neg_iou_thresh_lo, neg_iou_thresh_hi).
114 | neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
115 | (max_iou >= self.neg_iou_thresh_lo))[0]
116 | neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
117 | neg_roi_per_this_image = int(min(neg_roi_per_this_image,
118 | neg_index.size))
119 | if neg_index.size > 0:
120 | neg_index = np.random.choice(
121 | neg_index, size=neg_roi_per_this_image, replace=False)
122 |
123 | # The indices that we're selecting (both positive and negative).
124 | keep_index = np.append(pos_index, neg_index)
125 | gt_roi_label = gt_roi_label[keep_index]
126 | gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0
127 | sample_roi = roi[keep_index]
128 |
129 | # Compute offsets and scales to match sampled RoIs to the GTs.
130 | gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
131 | gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
132 | ) / np.array(loc_normalize_std, np.float32))
133 |
134 | return sample_roi, gt_roi_loc, gt_roi_label
135 |
136 |
137 | class AnchorTargetCreator(object):
138 | """Assign the ground truth bounding boxes to anchors.
139 |
140 | Assigns the ground truth bounding boxes to anchors for training Region
141 | Proposal Networks introduced in Faster R-CNN [#]_.
142 |
143 | Offsets and scales to match anchors to the ground truth are
144 | calculated using the encoding scheme of
145 | :func:`model.utils.bbox_tools.bbox2loc`.
146 |
147 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
148 | Faster R-CNN: Towards Real-Time Object Detection with \
149 | Region Proposal Networks. NIPS 2015.
150 |
151 | Args:
152 | n_sample (int): The number of regions to produce.
153 | pos_iou_thresh (float): Anchors with IoU above this
154 | threshold will be assigned as positive.
155 | neg_iou_thresh (float): Anchors with IoU below this
156 | threshold will be assigned as negative.
157 | pos_ratio (float): Ratio of positive regions in the
158 | sampled regions.
159 |
160 | """
161 |
162 | def __init__(self,
163 | n_sample=256,
164 | pos_iou_thresh=0.7, neg_iou_thresh=0.3,
165 | pos_ratio=0.5):
166 | self.n_sample = n_sample
167 | self.pos_iou_thresh = pos_iou_thresh
168 | self.neg_iou_thresh = neg_iou_thresh
169 | self.pos_ratio = pos_ratio
170 |
171 | def __call__(self, bbox, anchor, img_size):
172 | """Assign ground truth supervision to sampled subset of anchors.
173 |
174 | The types of the input and output arrays are the same.
175 |
176 | Here are notations.
177 |
178 | * :math:`S` is the number of anchors.
179 | * :math:`R` is the number of bounding boxes.
180 |
181 | Args:
182 | bbox (array): Coordinates of bounding boxes. Its shape is
183 | :math:`(R, 4)`.
184 | anchor (array): Coordinates of anchors. Its shape is
185 | :math:`(S, 4)`.
186 | img_size (tuple of ints): A tuple :obj:`H, W`, which
187 | is a tuple of height and width of an image.
188 |
189 | Returns:
190 | (array, array):
191 |
192 | #NOTE: it's scale not only offset
193 | * **loc**: Offsets and scales to match the anchors to \
194 | the ground truth bounding boxes. Its shape is :math:`(S, 4)`.
195 | * **label**: Labels of anchors with values \
196 | :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \
197 | is :math:`(S,)`.
198 |
199 | """
200 |
201 | img_H, img_W = img_size
202 |
203 | n_anchor = len(anchor)
204 | inside_index = _get_inside_index(anchor, img_H, img_W)
205 | anchor = anchor[inside_index]
206 | argmax_ious, label = self._create_label(
207 | inside_index, anchor, bbox)
208 |
209 | # compute bounding box regression targets
210 | loc = bbox2loc(anchor, bbox[argmax_ious])
211 |
212 | # map up to original set of anchors
213 | label = _unmap(label, n_anchor, inside_index, fill=-1)
214 | loc = _unmap(loc, n_anchor, inside_index, fill=0)
215 |
216 | return loc, label
217 |
218 | def _create_label(self, inside_index, anchor, bbox):
219 | # label: 1 is positive, 0 is negative, -1 is dont care
220 | label = np.empty((len(inside_index),), dtype=np.int32)
221 | label.fill(-1)
222 |
223 | argmax_ious, max_ious, gt_argmax_ious = \
224 | self._calc_ious(anchor, bbox, inside_index)
225 |
226 | # assign negative labels first so that positive labels can clobber them
227 | label[max_ious < self.neg_iou_thresh] = 0
228 |
229 | # positive label: for each gt, anchor with highest iou
230 | label[gt_argmax_ious] = 1
231 |
232 | # positive label: above threshold IOU
233 | label[max_ious >= self.pos_iou_thresh] = 1
234 |
235 | # subsample positive labels if we have too many
236 | n_pos = int(self.pos_ratio * self.n_sample)
237 | pos_index = np.where(label == 1)[0]
238 | if len(pos_index) > n_pos:
239 | disable_index = np.random.choice(
240 | pos_index, size=(len(pos_index) - n_pos), replace=False)
241 | label[disable_index] = -1
242 |
243 | # subsample negative labels if we have too many
244 | n_neg = self.n_sample - np.sum(label == 1)
245 | neg_index = np.where(label == 0)[0]
246 | if len(neg_index) > n_neg:
247 | disable_index = np.random.choice(
248 | neg_index, size=(len(neg_index) - n_neg), replace=False)
249 | label[disable_index] = -1
250 |
251 | return argmax_ious, label
252 |
253 | def _calc_ious(self, anchor, bbox, inside_index):
254 | # ious between the anchors and the gt boxes
255 | ious = bbox_iou(anchor, bbox)
256 | argmax_ious = ious.argmax(axis=1)
257 | max_ious = ious[np.arange(len(inside_index)), argmax_ious]
258 | gt_argmax_ious = ious.argmax(axis=0)
259 | gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
260 | gt_argmax_ious = np.where(ious == gt_max_ious)[0]
261 |
262 | return argmax_ious, max_ious, gt_argmax_ious
263 |
264 |
265 | def _unmap(data, count, index, fill=0):
266 | # Unmap a subset of items (data) back to the original set of items (of
267 | # size count)
268 |
269 | if len(data.shape) == 1:
270 | ret = np.empty((count,), dtype=data.dtype)
271 | ret.fill(fill)
272 | ret[index] = data
273 | else:
274 | ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
275 | ret.fill(fill)
276 | ret[index, :] = data
277 | return ret
278 |
279 |
280 | def _get_inside_index(anchor, H, W):
281 | # Calc indices of anchors which are located completely inside of the image
282 | # whose size is specified.
283 | index_inside = np.where(
284 | (anchor[:, 0] >= 0) &
285 | (anchor[:, 1] >= 0) &
286 | (anchor[:, 2] <= H) &
287 | (anchor[:, 3] <= W)
288 | )[0]
289 | return index_inside
290 |
291 |
292 | class ProposalCreator:
293 | # NOTE: this step is implemented as non-differentiable.
294 | # TODO: make sure that is OK.
295 | # It is OK.
296 | """Proposal regions are generated by calling this object.
297 |
298 | The :meth:`__call__` of this object outputs object detection proposals by
299 | applying estimated bounding box offsets
300 | to a set of anchors.
301 |
302 | This class takes parameters to control the number of bounding boxes to
303 | pass to NMS and to keep after NMS.
304 | If the parameters are negative, it uses all the bounding boxes supplied
305 | or keeps all the bounding boxes returned by NMS.
306 |
307 | This class is used for Region Proposal Networks introduced in
308 | Faster R-CNN [#]_.
309 |
310 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
311 | Faster R-CNN: Towards Real-Time Object Detection with \
312 | Region Proposal Networks. NIPS 2015.
313 |
314 | Args:
315 | nms_thresh (float): Threshold value used when calling NMS.
316 | n_train_pre_nms (int): Number of top scored bounding boxes
317 | to keep before passing to NMS in train mode.
318 | n_train_post_nms (int): Number of top scored bounding boxes
319 | to keep after passing to NMS in train mode.
320 | n_test_pre_nms (int): Number of top scored bounding boxes
321 | to keep before passing to NMS in test mode.
322 | n_test_post_nms (int): Number of top scored bounding boxes
323 | to keep after passing to NMS in test mode.
324 | force_cpu_nms (bool): If this is :obj:`True`,
325 | always use NMS in CPU mode. If :obj:`False`,
326 | the NMS mode is selected based on the type of inputs.
327 | min_size (int): A parameter to determine the threshold for
328 | discarding bounding boxes based on their sizes.
329 |
330 | """
331 |
332 | def __init__(self,
333 | parent_model,
334 | nms_thresh=0.7,
335 | n_train_pre_nms=12000,
336 | n_train_post_nms=2000,
337 | n_test_pre_nms=6000,
338 | n_test_post_nms=300,
339 | min_size=16
340 | ):
341 | self.parent_model = parent_model
342 | self.nms_thresh = nms_thresh
343 | self.n_train_pre_nms = n_train_pre_nms
344 | self.n_train_post_nms = n_train_post_nms
345 | self.n_test_pre_nms = n_test_pre_nms
346 | self.n_test_post_nms = n_test_post_nms
347 | self.min_size = min_size
348 |
349 | def __call__(self, loc, score,
350 | anchor, img_size, scale=1.):
351 | """input should be ndarray
352 | Propose RoIs.
353 |
354 | Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed
355 | by the same index.
356 |
357 | On notation, :math:`R` is the total number of anchors. This is equal
358 | to the product of the height and the width of the feature map and the
359 | number of anchor bases per pixel.
360 |
361 | The type of the output is the same as that of the inputs.
362 |
363 | Args:
364 | loc (array): Predicted offsets and scaling to anchors.
365 | Its shape is :math:`(R, 4)`.
366 | score (array): Predicted foreground probability for anchors.
367 | Its shape is :math:`(R,)`.
368 | anchor (array): Coordinates of anchors. Its shape is
369 | :math:`(R, 4)`.
370 | img_size (tuple of ints): A tuple :obj:`height, width`,
371 | which contains image size after scaling.
372 | scale (float): The scaling factor used to scale an image after
373 | reading it from a file.
374 |
375 | Returns:
376 | array:
377 | An array of coordinates of proposal boxes.
378 | Its shape is :math:`(S, 4)`. :math:`S` is less than
379 | :obj:`self.n_test_post_nms` in test time and less than
380 | :obj:`self.n_train_post_nms` in train time. :math:`S` depends on
381 | the size of the predicted bounding boxes and the number of
382 | bounding boxes discarded by NMS.
383 |
384 | """
385 | # NOTE: at test time, remember to call
386 | # faster_rcnn.eval()
387 | # so that self.training is set to False.
388 | if self.parent_model.training:
389 | n_pre_nms = self.n_train_pre_nms
390 | n_post_nms = self.n_train_post_nms
391 | else:
392 | n_pre_nms = self.n_test_pre_nms
393 | n_post_nms = self.n_test_post_nms
394 |
395 | # Convert anchors into proposals via bbox transformations.
396 | # roi = loc2bbox(anchor, loc)
397 | roi = loc2bbox(anchor, loc)
398 |
399 | # Clip predicted boxes to image.
400 | roi[:, slice(0, 4, 2)] = np.clip(
401 | roi[:, slice(0, 4, 2)], 0, img_size[0])
402 | roi[:, slice(1, 4, 2)] = np.clip(
403 | roi[:, slice(1, 4, 2)], 0, img_size[1])
404 |
405 | # Remove predicted boxes with either height or width < threshold.
406 | min_size = self.min_size * scale
407 | hs = roi[:, 2] - roi[:, 0]
408 | ws = roi[:, 3] - roi[:, 1]
409 | keep = np.where((hs >= min_size) & (ws >= min_size))[0]
410 | roi = roi[keep, :]
411 | score = score[keep]
412 |
413 | # Sort all (proposal, score) pairs by score from highest to lowest.
414 | # Take top pre_nms_topN (e.g. 6000).
415 | order = score.ravel().argsort()[::-1]
416 | if n_pre_nms > 0:
417 | order = order[:n_pre_nms]
418 | roi = roi[order, :]
419 |
420 | # Apply nms (e.g. threshold = 0.7).
421 | # Take after_nms_topN (e.g. 300).
422 |
423 | # NOTE: something is wrong here!
424 | # TODO: remove cuda.to_gpu
425 | keep = non_maximum_suppression(
426 | cp.ascontiguousarray(cp.asarray(roi)),
427 | thresh=self.nms_thresh)
428 | if n_post_nms > 0:
429 | keep = keep[:n_post_nms]
430 | roi = roi[keep]
431 | return roi
432 |
--------------------------------------------------------------------------------
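
A minimal usage sketch for AnchorTargetCreator above (not part of the repository). The
arrays are illustrative and follow the (y_min, x_min, y_max, x_max) box convention used
throughout this code base; ProposalCreator itself is normally driven from inside the
RegionProposalNetwork in model.py rather than called directly.

    import numpy as np
    from lib.creator_tool import AnchorTargetCreator

    # Hand-made anchors (S = 4) and ground-truth boxes (R = 2); values are illustrative.
    anchor = np.array([[  0.,   0., 128., 128.],
                       [ 32.,  32., 200., 230.],
                       [300., 300., 500., 520.],
                       [100., 400., 220., 560.]], dtype=np.float32)
    gt_bbox = np.array([[50., 60., 200., 220.],
                        [10., 10., 120.,  90.]], dtype=np.float32)
    img_size = (600, 800)  # (H, W)

    creator = AnchorTargetCreator(n_sample=256, pos_iou_thresh=0.7,
                                  neg_iou_thresh=0.3, pos_ratio=0.5)
    loc, label = creator(gt_bbox, anchor, img_size)
    # loc:   (S, 4) regression targets computed by bbox2loc
    # label: (S,) values in {1: positive, 0: negative, -1: ignore}
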
/lib/eval_tool.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | from collections import defaultdict
4 | import itertools
5 | import numpy as np
6 | import six
7 |
8 | from lib.bbox_tools import bbox_iou
9 |
10 |
11 | def eval_detection_voc(
12 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
13 | gt_difficults=None,
14 | iou_thresh=0.5, use_07_metric=False):
15 | """Calculate average precisions based on evaluation code of PASCAL VOC.
16 |
17 | This function evaluates predicted bounding boxes obtained from a dataset
18 | which has :math:`N` images by using average precision for each class.
19 | The code is based on the evaluation code used in PASCAL VOC Challenge.
20 |
21 | Args:
22 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
23 | sets of bounding boxes.
24 | Its index corresponds to an index for the base dataset.
25 | Each element of :obj:`pred_bboxes` is a set of coordinates
26 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
27 | where :math:`R` corresponds
28 | to the number of bounding boxes, which may vary among images.
29 | The second axis corresponds to
30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
31 | pred_labels (iterable of numpy.ndarray): An iterable of labels.
32 | Similar to :obj:`pred_bboxes`, its index corresponds to an
33 | index for the base dataset. Its length is :math:`N`.
34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence
35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
36 | its index corresponds to an index for the base dataset.
37 | Its length is :math:`N`.
38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
39 | bounding boxes
40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of
42 | bounding boxes in each image does not need to be same as the number
43 | of corresponding predicted boxes.
44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth
45 | labels which are organized similarly to :obj:`gt_bboxes`.
46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean
47 | arrays which is organized similarly to :obj:`gt_bboxes`.
48 | This tells whether the
49 | corresponding ground truth bounding box is difficult or not.
50 | By default, this is :obj:`None`. In that case, this function
51 | considers all bounding boxes to be not difficult.
52 | iou_thresh (float): A prediction is correct if its Intersection over
53 | Union with the ground truth is above this value.
54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
55 | for calculating average precision. The default value is
56 | :obj:`False`.
57 |
58 | Returns:
59 | dict:
60 |
61 | The keys, value-types and the description of the values are listed
62 | below.
63 |
64 | * **ap** (*numpy.ndarray*): An array of average precisions. \
65 | The :math:`l`-th value corresponds to the average precision \
66 | for class :math:`l`. If class :math:`l` does not exist in \
67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \
68 | value is set to :obj:`numpy.nan`.
69 | * **map** (*float*): The average of Average Precisions over classes.
70 |
71 | """
72 |
73 | prec, rec = calc_detection_voc_prec_rec(
74 | pred_bboxes, pred_labels, pred_scores,
75 | gt_bboxes, gt_labels, gt_difficults,
76 | iou_thresh=iou_thresh)
77 |
78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric)
79 |
80 | return {'ap': ap, 'map': np.nanmean(ap)}
81 |
82 |
83 | def calc_detection_voc_prec_rec(
84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
85 | gt_difficults=None,
86 | iou_thresh=0.5):
87 | """Calculate precision and recall based on evaluation code of PASCAL VOC.
88 |
89 | This function calculates precision and recall of
90 | predicted bounding boxes obtained from a dataset which has :math:`N`
91 | images.
92 | The code is based on the evaluation code used in PASCAL VOC Challenge.
93 |
94 | Args:
95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
96 | sets of bounding boxes.
97 | Its index corresponds to an index for the base dataset.
98 | Each element of :obj:`pred_bboxes` is a set of coordinates
99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
100 | where :math:`R` corresponds
101 | to the number of bounding boxes, which may vary among images.
102 | The second axis corresponds to
103 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
104 | pred_labels (iterable of numpy.ndarray): An iterable of labels.
105 | Similar to :obj:`pred_bboxes`, its index corresponds to an
106 | index for the base dataset. Its length is :math:`N`.
107 | pred_scores (iterable of numpy.ndarray): An iterable of confidence
108 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
109 | its index corresponds to an index for the base dataset.
110 | Its length is :math:`N`.
111 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
112 | bounding boxes
113 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
114 | bounding box whose shape is :math:`(R, 4)`. Note that the number of
115 | bounding boxes in each image does not need to be same as the number
116 | of corresponding predicted boxes.
117 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth
118 | labels which are organized similarly to :obj:`gt_bboxes`.
119 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean
120 | arrays which is organized similarly to :obj:`gt_bboxes`.
121 | This tells whether the
122 | corresponding ground truth bounding box is difficult or not.
123 | By default, this is :obj:`None`. In that case, this function
124 | considers all bounding boxes to be not difficult.
125 | iou_thresh (float): A prediction is correct if its Intersection over
126 | Union with the ground truth is above this value.
127 |
128 | Returns:
129 | tuple of two lists:
130 | This function returns two lists: :obj:`prec` and :obj:`rec`.
131 |
132 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
133 | for class :math:`l`. If class :math:`l` does not exist in \
134 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
135 | set to :obj:`None`.
136 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
137 | for class :math:`l`. If class :math:`l` that is not marked as \
138 | difficult does not exist in \
139 | :obj:`gt_labels`, :obj:`rec[l]` is \
140 | set to :obj:`None`.
141 |
142 | """
143 |
144 | pred_bboxes = iter(pred_bboxes)
145 | pred_labels = iter(pred_labels)
146 | pred_scores = iter(pred_scores)
147 | gt_bboxes = iter(gt_bboxes)
148 | gt_labels = iter(gt_labels)
149 | if gt_difficults is None:
150 | gt_difficults = itertools.repeat(None)
151 | else:
152 | gt_difficults = iter(gt_difficults)
153 |
154 | n_pos = defaultdict(int)
155 | score = defaultdict(list)
156 | match = defaultdict(list)
157 |
158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
159 | six.moves.zip(
160 | pred_bboxes, pred_labels, pred_scores,
161 | gt_bboxes, gt_labels, gt_difficults):
162 |
163 | if gt_difficult is None:
164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)
165 |
166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
167 | pred_mask_l = pred_label == l
168 | pred_bbox_l = pred_bbox[pred_mask_l]
169 | pred_score_l = pred_score[pred_mask_l]
170 | # sort by score
171 | order = pred_score_l.argsort()[::-1]
172 | pred_bbox_l = pred_bbox_l[order]
173 | pred_score_l = pred_score_l[order]
174 |
175 | gt_mask_l = gt_label == l
176 | gt_bbox_l = gt_bbox[gt_mask_l]
177 | gt_difficult_l = gt_difficult[gt_mask_l]
178 |
179 | n_pos[l] += np.logical_not(gt_difficult_l).sum()
180 | score[l].extend(pred_score_l)
181 |
182 | if len(pred_bbox_l) == 0:
183 | continue
184 | if len(gt_bbox_l) == 0:
185 | match[l].extend((0,) * pred_bbox_l.shape[0])
186 | continue
187 |
188 | # VOC evaluation follows integer typed bounding boxes.
189 | pred_bbox_l = pred_bbox_l.copy()
190 | pred_bbox_l[:, 2:] += 1
191 | gt_bbox_l = gt_bbox_l.copy()
192 | gt_bbox_l[:, 2:] += 1
193 |
194 | iou = bbox_iou(pred_bbox_l, gt_bbox_l)
195 | gt_index = iou.argmax(axis=1)
196 | # set -1 if there is no matching ground truth
197 | gt_index[iou.max(axis=1) < iou_thresh] = -1
198 | del iou
199 |
200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
201 | for gt_idx in gt_index:
202 | if gt_idx >= 0:
203 | if gt_difficult_l[gt_idx]:
204 | match[l].append(-1)
205 | else:
206 | if not selec[gt_idx]:
207 | match[l].append(1)
208 | else:
209 | match[l].append(0)
210 | selec[gt_idx] = True
211 | else:
212 | match[l].append(0)
213 |
214 | for iter_ in (
215 | pred_bboxes, pred_labels, pred_scores,
216 | gt_bboxes, gt_labels, gt_difficults):
217 | if next(iter_, None) is not None:
218 | raise ValueError('Length of input iterables need to be same.')
219 |
220 | n_fg_class = max(n_pos.keys()) + 1
221 | prec = [None] * n_fg_class
222 | rec = [None] * n_fg_class
223 |
224 | for l in n_pos.keys():
225 | score_l = np.array(score[l])
226 | match_l = np.array(match[l], dtype=np.int8)
227 |
228 | order = score_l.argsort()[::-1]
229 | match_l = match_l[order]
230 |
231 | tp = np.cumsum(match_l == 1)
232 | fp = np.cumsum(match_l == 0)
233 |
234 | # If an element of fp + tp is 0,
235 | # the corresponding element of prec[l] is nan.
236 | prec[l] = tp / (fp + tp)
237 | # If n_pos[l] is 0, rec[l] is None.
238 | if n_pos[l] > 0:
239 | rec[l] = tp / n_pos[l]
240 |
241 | return prec, rec
242 |
243 |
244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False):
245 | """Calculate average precisions based on evaluation code of PASCAL VOC.
246 |
247 | This function calculates average precisions
248 | from given precisions and recalls.
249 | The code is based on the evaluation code used in PASCAL VOC Challenge.
250 |
251 | Args:
252 | prec (list of numpy.array): A list of arrays.
253 | :obj:`prec[l]` indicates precision for class :math:`l`.
254 | If :obj:`prec[l]` is :obj:`None`, this function returns
255 | :obj:`numpy.nan` for class :math:`l`.
256 | rec (list of numpy.array): A list of arrays.
257 | :obj:`rec[l]` indicates recall for class :math:`l`.
258 | If :obj:`rec[l]` is :obj:`None`, this function returns
259 | :obj:`numpy.nan` for class :math:`l`.
260 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
261 | for calculating average precision. The default value is
262 | :obj:`False`.
263 |
264 | Returns:
265 | ~numpy.ndarray:
266 | This function returns an array of average precisions.
267 | The :math:`l`-th value corresponds to the average precision
268 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
269 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
270 |
271 | """
272 |
273 | n_fg_class = len(prec)
274 | ap = np.empty(n_fg_class)
275 | for l in six.moves.range(n_fg_class):
276 | if prec[l] is None or rec[l] is None:
277 | ap[l] = np.nan
278 | continue
279 |
280 | if use_07_metric:
281 | # 11 point metric
282 | ap[l] = 0
283 | for t in np.arange(0., 1.1, 0.1):
284 | if np.sum(rec[l] >= t) == 0:
285 | p = 0
286 | else:
287 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
288 | ap[l] += p / 11
289 | else:
290 | # correct AP calculation
291 | # first append sentinel values at the end
292 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
293 | mrec = np.concatenate(([0], rec[l], [1]))
294 |
295 | mpre = np.maximum.accumulate(mpre[::-1])[::-1]
296 |
297 | # to calculate area under PR curve, look for points
298 | # where X axis (recall) changes value
299 | i = np.where(mrec[1:] != mrec[:-1])[0]
300 |
301 | # and sum (\Delta recall) * prec
302 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
303 |
304 | return ap
305 |
--------------------------------------------------------------------------------
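
A minimal usage sketch for eval_detection_voc above (not part of the repository), run on
a toy one-image "dataset" with a single class; all values are illustrative.

    import numpy as np
    from lib.eval_tool import eval_detection_voc

    pred_bboxes = [np.array([[10., 10., 100., 100.], [40., 40., 80., 90.]])]
    pred_labels = [np.array([0, 0])]
    pred_scores = [np.array([0.9, 0.6])]
    gt_bboxes = [np.array([[12., 8., 98., 105.]])]
    gt_labels = [np.array([0])]

    result = eval_detection_voc(pred_bboxes, pred_labels, pred_scores,
                                gt_bboxes, gt_labels, use_07_metric=False)
    print(result['ap'], result['map'])  # -> [1.] 1.0 for this toy input
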
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | from lib.nms.non_maximum_suppression import non_maximum_suppression
--------------------------------------------------------------------------------
/lib/nms/_nms_gpu_post.pyx:
--------------------------------------------------------------------------------
1 | cimport numpy as np
2 | from libc.stdint cimport uint64_t
3 |
4 | import numpy as np
5 |
6 | def _nms_gpu_post(np.ndarray[np.uint64_t, ndim=1] mask,
7 | int n_bbox,
8 | int threads_per_block,
9 | int col_blocks
10 | ):
11 | cdef:
12 | int i, j, nblock, index
13 | uint64_t inblock
14 | int n_selection = 0
15 | uint64_t one_ull = 1
16 | np.ndarray[np.int32_t, ndim=1] selection
17 | np.ndarray[np.uint64_t, ndim=1] remv
18 |
19 | selection = np.zeros((n_bbox,), dtype=np.int32)
20 | remv = np.zeros((col_blocks,), dtype=np.uint64)
21 |
22 | for i in range(n_bbox):
23 | nblock = i // threads_per_block
24 | inblock = i % threads_per_block
25 |
26 | if not (remv[nblock] & one_ull << inblock):
27 | selection[n_selection] = i
28 | n_selection += 1
29 |
30 | index = i * col_blocks
31 | for j in range(nblock, col_blocks):
32 | remv[j] |= mask[index + j]
33 | return selection, n_selection
34 |
--------------------------------------------------------------------------------
/lib/nms/_nms_gpu_post_py.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 |
4 | def _nms_gpu_post( mask,
5 | n_bbox,
6 | threads_per_block,
7 | col_blocks
8 | ):
9 | n_selection = 0
10 | one_ull = np.array([1],dtype=np.uint64)
11 | selection = np.zeros((n_bbox,), dtype=np.int32)
12 | remv = np.zeros((col_blocks,), dtype=np.uint64)
13 |
14 | for i in range(n_bbox):
15 | nblock = i // threads_per_block
16 | inblock = i % threads_per_block
17 |
18 | if not (remv[nblock] & one_ull << inblock):
19 | selection[n_selection] = i
20 | n_selection += 1
21 |
22 | index = i * col_blocks
23 | for j in range(nblock, col_blocks):
24 | remv[j] |= mask[index + j]
25 | return selection, n_selection
26 |
--------------------------------------------------------------------------------
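
A small decoding sketch (not part of the repository) for the bitmask layout that both
_nms_gpu_post implementations consume: mask[i * col_blocks + j] is a 64-bit word whose
bit k is set when box i suppresses box j * threads_per_block + k. The values below are
illustrative.

    import numpy as np
    from lib.nms._nms_gpu_post_py import _nms_gpu_post

    # Three boxes in a single 64-bit column block.
    # Box 0 suppresses box 1 (bit 1 of its mask word); boxes 1 and 2 suppress nothing.
    mask = np.array([0b010, 0, 0], dtype=np.uint64)
    selection, n_selection = _nms_gpu_post(mask, n_bbox=3,
                                           threads_per_block=64, col_blocks=1)
    print(selection[:n_selection])  # -> [0 2]; box 1 was suppressed by box 0
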
/lib/nms/build.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from distutils.extension import Extension
3 | from Cython.Distutils import build_ext
4 |
5 | ext_modules = [Extension("_nms_gpu_post", ["_nms_gpu_post.pyx"])]
6 | setup(
7 | name="Hello pyx",
8 | cmdclass={'build_ext': build_ext},
9 | ext_modules=ext_modules
10 | )
11 |
--------------------------------------------------------------------------------
/lib/nms/non_maximum_suppression.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import numpy as np
3 | import cupy as cp
4 | try:
5 | from ._nms_gpu_post import _nms_gpu_post
6 | except ImportError:
7 | import warnings
8 | warnings.warn('''
9 | the pure Python code for non_maximum_suppression is about 2x slower.
10 | It is strongly recommended to build the Cython code:
11 | `cd lib/nms/; python build.py build_ext --inplace''')
12 | from ._nms_gpu_post_py import _nms_gpu_post
13 |
14 |
15 | @cp.util.memoize(for_each_device=True)
16 | def _load_kernel(kernel_name, code, options=()):
17 | cp.cuda.runtime.free(0)
18 | assert isinstance(options, tuple)
19 | kernel_code = cp.cuda.compile_with_cache(code, options=options)
20 | return kernel_code.get_function(kernel_name)
21 |
22 |
23 | def non_maximum_suppression(bbox, thresh, score=None,
24 | limit=None):
25 | """Suppress bounding boxes according to their IoUs.
26 |
27 | This method checks each bounding box sequentially and selects the bounding
28 | box if the Intersection over Unions (IoUs) between the bounding box and the
29 | previously selected bounding boxes is less than :obj:`thresh`. This method
30 | is mainly used as postprocessing of object detection.
31 | The bounding boxes are selected from ones with higher scores.
32 | If :obj:`score` is not provided as an argument, the bounding box
33 | is ordered by its index in ascending order.
34 |
35 | The bounding boxes are expected to be packed into a two dimensional
36 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
37 | bounding boxes in the image. The second axis represents attributes of
38 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
39 | where the four attributes are coordinates of the top left and the
40 | bottom right vertices.
41 |
42 | :obj:`score` is a float array of shape :math:`(R,)`. Each score indicates
43 | confidence of prediction.
44 |
45 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
46 | an input. Please note that both :obj:`bbox` and :obj:`score` need to be
47 | the same type.
48 | The type of the output is the same as the input.
49 |
50 | Args:
51 | bbox (array): Bounding boxes to be transformed. The shape is
52 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
53 | thresh (float): Threshold of IoUs.
54 | score (array): An array of confidences whose shape is :math:`(R,)`.
55 | limit (int): The upper bound of the number of the output bounding
56 | boxes. If it is not specified, this method selects as many
57 | bounding boxes as possible.
58 |
59 | Returns:
60 | array:
61 | An array with indices of bounding boxes that are selected. \
62 | They are sorted by the scores of bounding boxes in descending \
63 | order. \
64 | The shape of this array is :math:`(K,)` and its dtype is\
65 | :obj:`numpy.int32`. Note that :math:`K \\leq R`.
66 |
67 | """
68 |
69 | return _non_maximum_suppression_gpu(bbox, thresh, score, limit)
70 |
71 |
72 | def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None):
73 | if len(bbox) == 0:
74 | return cp.zeros((0,), dtype=np.int32)
75 |
76 | n_bbox = bbox.shape[0]
77 |
78 | if score is not None:
79 | order = score.argsort()[::-1].astype(np.int32)
80 | else:
81 | order = cp.arange(n_bbox, dtype=np.int32)
82 |
83 | sorted_bbox = bbox[order, :]
84 | selec, n_selec = _call_nms_kernel(
85 | sorted_bbox, thresh)
86 | selec = selec[:n_selec]
87 | selec = order[selec]
88 | if limit is not None:
89 | selec = selec[:limit]
90 | return cp.asnumpy(selec)
91 |
92 |
93 | _nms_gpu_code = '''
94 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
95 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
96 |
97 | __device__
98 | inline float devIoU(float const *const bbox_a, float const *const bbox_b) {
99 | float top = max(bbox_a[0], bbox_b[0]);
100 | float bottom = min(bbox_a[2], bbox_b[2]);
101 | float left = max(bbox_a[1], bbox_b[1]);
102 | float right = min(bbox_a[3], bbox_b[3]);
103 | float height = max(bottom - top, 0.f);
104 | float width = max(right - left, 0.f);
105 | float area_i = height * width;
106 | float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]);
107 | float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]);
108 | return area_i / (area_a + area_b - area_i);
109 | }
110 |
111 | extern "C"
112 | __global__
113 | void nms_kernel(const int n_bbox, const float thresh,
114 | const float *dev_bbox,
115 | unsigned long long *dev_mask) {
116 | const int row_start = blockIdx.y;
117 | const int col_start = blockIdx.x;
118 |
119 | const int row_size =
120 | min(n_bbox - row_start * threadsPerBlock, threadsPerBlock);
121 | const int col_size =
122 | min(n_bbox - col_start * threadsPerBlock, threadsPerBlock);
123 |
124 | __shared__ float block_bbox[threadsPerBlock * 4];
125 | if (threadIdx.x < col_size) {
126 | block_bbox[threadIdx.x * 4 + 0] =
127 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
128 | block_bbox[threadIdx.x * 4 + 1] =
129 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
130 | block_bbox[threadIdx.x * 4 + 2] =
131 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
132 | block_bbox[threadIdx.x * 4 + 3] =
133 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
134 | }
135 | __syncthreads();
136 |
137 | if (threadIdx.x < row_size) {
138 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
139 | const float *cur_box = dev_bbox + cur_box_idx * 4;
140 | int i = 0;
141 | unsigned long long t = 0;
142 | int start = 0;
143 | if (row_start == col_start) {
144 | start = threadIdx.x + 1;
145 | }
146 | for (i = start; i < col_size; i++) {
147 | if (devIoU(cur_box, block_bbox + i * 4) >= thresh) {
148 | t |= 1ULL << i;
149 | }
150 | }
151 | const int col_blocks = DIVUP(n_bbox, threadsPerBlock);
152 | dev_mask[cur_box_idx * col_blocks + col_start] = t;
153 | }
154 | }
155 | '''
156 |
157 |
158 | def _call_nms_kernel(bbox, thresh):
159 | # PyTorch does not support unsigned long Tensor.
160 | # It doesn't matter, since an ndarray is returned in the end.
161 | # So I'll keep it unmodified.
162 | n_bbox = bbox.shape[0]
163 | threads_per_block = 64
164 | col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
165 | blocks = (col_blocks, col_blocks, 1)
166 | threads = (threads_per_block, 1, 1)
167 |
168 | mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
169 | bbox = cp.ascontiguousarray(bbox, dtype=np.float32)  # NOTE: make the array contiguous
170 | kern = _load_kernel('nms_kernel', _nms_gpu_code)
171 | kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
172 | bbox, mask_dev))
173 |
174 | mask_host = mask_dev.get()
175 | selection, n_selec = _nms_gpu_post(
176 | mask_host, n_bbox, threads_per_block, col_blocks)
177 | return selection, n_selec
178 |
--------------------------------------------------------------------------------
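
A minimal usage sketch for non_maximum_suppression above (not part of the repository).
It requires CuPy and a CUDA device, since this implementation always runs the GPU
kernel; the box values are illustrative.

    import numpy as np
    import cupy as cp
    from lib.nms import non_maximum_suppression

    bbox = cp.asarray(np.array([[  0.,   0., 100., 100.],
                                [  5.,   5., 100., 100.],   # IoU with box 0 is about 0.9
                                [200., 200., 300., 300.]], dtype=np.float32))
    score = cp.asarray(np.array([0.9, 0.8, 0.7], dtype=np.float32))
    keep = non_maximum_suppression(bbox, thresh=0.7, score=score)
    print(keep)  # -> [0 2]; box 1 is suppressed by the higher-scoring box 0
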
/lib/relation_tool.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def RankEmbedding(rank_dim=128,feat_dim=1024,wave_len=1000):
4 | rank_range = torch.arange(0, rank_dim).cuda().float()
5 |
6 | feat_range = torch.arange(feat_dim / 2).cuda()
7 | dim_mat = feat_range / (feat_dim / 2)
8 | dim_mat = 1. / (torch.pow(wave_len, dim_mat))
9 |
10 | dim_mat = dim_mat.view(1, -1)
11 | rank_mat = rank_range.view(-1, 1)
12 |
13 | mul_mat = rank_mat * dim_mat
14 | sin_mat = torch.sin(mul_mat)
15 | cos_mat = torch.cos(mul_mat)
16 | embedding = torch.cat((sin_mat, cos_mat), -1)
17 |
18 | return embedding
19 | def PositionalEmbedding(f_g, dim_g=64, wave_len=1000):
20 | x_min, y_min, x_max, y_max = torch.chunk(f_g, 4, dim=1)
21 |
22 | cx = (x_min + x_max) * 0.5
23 | cy = (y_min + y_max) * 0.5
24 | w = (x_max - x_min) + 1.
25 | h = (y_max - y_min) + 1.
26 |
27 | delta_x = cx - cx.view(1, -1)
28 | delta_x = torch.clamp(torch.abs(delta_x / w), min=1e-3)
29 | delta_x = torch.log(delta_x)
30 |
31 | delta_y = cy - cy.view(1, -1)
32 | delta_y = torch.clamp(torch.abs(delta_y / h), min=1e-3)
33 | delta_y = torch.log(delta_y)
34 |
35 | delta_w = torch.log(w / w.view(1, -1))
36 | delta_h = torch.log(h / h.view(1, -1))
37 | size = delta_h.size()
38 |
39 | delta_x = delta_x.view(size[0], size[1], 1)
40 | delta_y = delta_y.view(size[0], size[1], 1)
41 | delta_w = delta_w.view(size[0], size[1], 1)
42 | delta_h = delta_h.view(size[0], size[1], 1)
43 |
44 | position_mat = torch.cat((delta_x, delta_y, delta_w, delta_h), -1)
45 |
46 | feat_range = torch.arange(dim_g / 8).cuda()
47 | dim_mat = feat_range / (dim_g / 8)
48 | dim_mat = 1. / (torch.pow(wave_len, dim_mat))
49 |
50 | dim_mat = dim_mat.view(1, 1, 1, -1)
51 | position_mat = position_mat.view(size[0], size[1], 4, -1)
52 | position_mat = 100. * position_mat
53 |
54 | mul_mat = position_mat * dim_mat
55 | mul_mat = mul_mat.view(size[0], size[1], -1)
56 | sin_mat = torch.sin(mul_mat)
57 | cos_mat = torch.cos(mul_mat)
58 | embedding = torch.cat((sin_mat, cos_mat), -1)
59 |
60 | return embedding
--------------------------------------------------------------------------------
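
A minimal shape-check sketch for the two embeddings above (not part of the repository);
it needs a CUDA device because both functions call .cuda(). Box rows follow the
(x_min, y_min, x_max, y_max) order that PositionalEmbedding unpacks, and the values are
illustrative.

    import torch
    from lib.relation_tool import PositionalEmbedding, RankEmbedding

    rois = torch.Tensor([[ 10., 20., 110., 220.],
                         [ 15., 25., 120., 230.],
                         [300., 40., 380., 160.]]).cuda()
    geo_emb = PositionalEmbedding(rois, dim_g=64)   # pairwise geometric embedding
    print(geo_emb.shape)                            # torch.Size([3, 3, 64])

    rank_emb = RankEmbedding(rank_dim=128, feat_dim=1024)
    print(rank_emb.shape)                           # torch.Size([128, 1024])
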
/lib/roi_cupy.py:
--------------------------------------------------------------------------------
1 | kernel_forward = '''
2 | extern "C"
3 | __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois,
4 | float* top_data, int* argmax_data,
5 | const double spatial_scale,const int channels,const int height,
6 | const int width, const int pooled_height,
7 | const int pooled_width,const int NN
8 | ){
9 |
10 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
11 | if(idx>=NN)
12 | return;
13 | const int pw = idx % pooled_width;
14 | const int ph = (idx / pooled_width) % pooled_height;
15 | const int c = (idx / pooled_width / pooled_height) % channels;
16 | int num = idx / pooled_width / pooled_height / channels;
17 | const int roi_batch_ind = bottom_rois[num * 5 + 0];
18 | const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
19 | const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
20 | const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
21 | const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);
22 | // Force malformed ROIs to be 1x1
23 | const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
24 | const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
25 | const float bin_size_h = static_cast<float>(roi_height)
26 | / static_cast<float>(pooled_height);
27 | const float bin_size_w = static_cast<float>(roi_width)
28 | / static_cast<float>(pooled_width);
29 |
30 | int hstart = static_cast<int>(floor(static_cast<float>(ph)
31 | * bin_size_h));
32 | int wstart = static_cast<int>(floor(static_cast<float>(pw)
33 | * bin_size_w));
34 | int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
35 | * bin_size_h));
36 | int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
37 | * bin_size_w));
38 |
39 | // Add roi offsets and clip to input boundaries
40 | hstart = min(max(hstart + roi_start_h, 0), height);
41 | hend = min(max(hend + roi_start_h, 0), height);
42 | wstart = min(max(wstart + roi_start_w, 0), width);
43 | wend = min(max(wend + roi_start_w, 0), width);
44 | bool is_empty = (hend <= hstart) || (wend <= wstart);
45 |
46 | // Define an empty pooling region to be zero
47 | float maxval = is_empty ? 0 : -1E+37;
48 | // If nothing is pooled, argmax=-1 causes nothing to be backprop'd
49 | int maxidx = -1;
50 | const int data_offset = (roi_batch_ind * channels + c) * height * width;
51 | for (int h = hstart; h < hend; ++h) {
52 | for (int w = wstart; w < wend; ++w) {
53 | int bottom_index = h * width + w;
54 | if (bottom_data[data_offset + bottom_index] > maxval) {
55 | maxval = bottom_data[data_offset + bottom_index];
56 | maxidx = bottom_index;
57 | }
58 | }
59 | }
60 | top_data[idx]=maxval;
61 | argmax_data[idx]=maxidx;
62 | }
63 | '''
64 | kernel_backward = '''
65 | extern "C"
66 | __global__ void roi_backward(const float* const top_diff,
67 | const int* const argmax_data,const float* const bottom_rois,
68 | float* bottom_diff, const int num_rois,
69 | const double spatial_scale, int channels,
70 | int height, int width, int pooled_height,
71 | int pooled_width,const int NN)
72 | {
73 |
74 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
75 | // Important: use >= instead of >
76 | if(idx>=NN)
77 | return;
78 | int w = idx % width;
79 | int h = (idx / width) % height;
80 | int c = (idx/ (width * height)) % channels;
81 | int num = idx / (width * height * channels);
82 |
83 | float gradient = 0;
84 | // Accumulate gradient over all ROIs that pooled this element
85 | for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
86 | // Skip if ROI's batch index doesn't match num
87 | if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
88 | continue;
89 | }
90 |
91 | int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
92 | * spatial_scale);
93 | int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
94 | * spatial_scale);
95 | int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
96 | * spatial_scale);
97 | int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
98 | * spatial_scale);
99 |
100 | // Skip if ROI doesn't include (h, w)
101 | const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
102 | h >= roi_start_h && h <= roi_end_h);
103 | if (!in_roi) {
104 | continue;
105 | }
106 |
107 | int offset = (roi_n * channels + c) * pooled_height
108 | * pooled_width;
109 |
110 | // Compute feasible set of pooled units that could have pooled
111 | // this bottom unit
112 |
113 | // Force malformed ROIs to be 1x1
114 | int roi_width = max(roi_end_w - roi_start_w + 1, 1);
115 | int roi_height = max(roi_end_h - roi_start_h + 1, 1);
116 |
117 | float bin_size_h = static_cast<float>(roi_height)
118 | / static_cast<float>(pooled_height);
119 | float bin_size_w = static_cast<float>(roi_width)
120 | / static_cast<float>(pooled_width);
121 |
122 | int phstart = floor(static_cast<float>(h - roi_start_h)
123 | / bin_size_h);
124 | int phend = ceil(static_cast<float>(h - roi_start_h + 1)
125 | / bin_size_h);
126 | int pwstart = floor(static_cast<float>(w - roi_start_w)
127 | / bin_size_w);
128 | int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
129 | / bin_size_w);
130 |
131 | phstart = min(max(phstart, 0), pooled_height);
132 | phend = min(max(phend, 0), pooled_height);
133 | pwstart = min(max(pwstart, 0), pooled_width);
134 | pwend = min(max(pwend, 0), pooled_width);
135 | for (int ph = phstart; ph < phend; ++ph) {
136 | for (int pw = pwstart; pw < pwend; ++pw) {
137 | int index_ = ph * pooled_width + pw + offset;
138 | if (argmax_data[index_] == (h * width + w)) {
139 | gradient += top_diff[index_];
140 | }
141 | }
142 | }
143 | }
144 | bottom_diff[idx] = gradient;
145 | }
146 | '''
147 |
--------------------------------------------------------------------------------
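
A minimal launch sketch for the forward kernel above (not part of the repository). The
repository itself compiles these strings through load_kernel in model.py; the
cupy.RawKernel call, shapes and values below are illustrative assumptions only.

    import numpy as np
    import cupy as cp
    from lib.roi_cupy import kernel_forward

    C, H, W, PH, PW = 1, 8, 8, 2, 2
    features = cp.arange(C * H * W, dtype=cp.float32).reshape(1, C, H, W)
    # One RoI: (batch_index, x_min, y_min, x_max, y_max) in input-image coordinates.
    rois = cp.asarray([[0., 0., 0., 64., 64.]], dtype=cp.float32)
    top = cp.zeros((1, C, PH, PW), dtype=cp.float32)
    argmax = cp.zeros((1, C, PH, PW), dtype=cp.int32)

    total = top.size  # NN in the kernel: one thread per output element
    kern = cp.RawKernel(kernel_forward, 'roi_forward')
    kern(((total + 1023) // 1024,), (1024,),
         (features, rois, top, argmax,
          np.float64(1. / 16), np.int32(C), np.int32(H), np.int32(W),
          np.int32(PH), np.int32(PW), np.int32(total)))
    print(top)  # max-pooled value of each 2x2 output bin
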
/lib/vis_tool.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import numpy as np
4 | import matplotlib
5 | import torch as t
6 | import visdom
7 |
8 | matplotlib.use('Agg')
9 | from matplotlib import pyplot as plot
10 |
11 | # from data.voc_dataset import VOC_BBOX_LABEL_NAMES
12 |
13 |
14 | VOC_BBOX_LABEL_NAMES = (
15 | 'fly',
16 | 'bike',
17 | 'bird',
18 | 'boat',
19 | 'pin',
20 | 'bus',
21 | 'c',
22 | 'cat',
23 | 'chair',
24 | 'cow',
25 | 'table',
26 | 'dog',
27 | 'horse',
28 | 'moto',
29 | 'p',
30 | 'plant',
31 | 'shep',
32 | 'sofa',
33 | 'train',
34 | 'tv',
35 | )
36 |
37 |
38 | def vis_image(img, ax=None):
39 | """Visualize a color image.
40 |
41 | Args:
42 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
43 | This is in RGB format and the range of its value is
44 | :math:`[0, 255]`.
45 | ax (matplotlib.axes.Axis): The visualization is displayed on this
46 | axis. If this is :obj:`None` (default), a new axis is created.
47 |
48 | Returns:
49 | ~matplotlib.axes.Axes:
50 | Returns the Axes object with the plot for further tweaking.
51 |
52 | """
53 |
54 | if ax is None:
55 | fig = plot.figure()
56 | ax = fig.add_subplot(1, 1, 1)
57 | # CHW -> HWC
58 | img = img.transpose((1, 2, 0))
59 |
60 | ax.imshow(img.astype(np.uint8))
61 | return ax
62 |
63 |
64 | def vis_bbox(img, bbox, label=None, score=None, ax=None):
65 | """Visualize bounding boxes inside image.
66 |
67 | Args:
68 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
69 | This is in RGB format and the range of its value is
70 | :math:`[0, 255]`.
71 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
72 | :math:`R` is the number of bounding boxes in the image.
73 | Each element is organized
74 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
75 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
76 | The values correspond to id for label names stored in
77 | :obj:`label_names`. This is optional.
78 | score (~numpy.ndarray): A float array of shape :math:`(R,)`.
79 | Each value indicates how confident the prediction is.
80 | This is optional.
81 | label_names: not an argument of this function; the label names are always
82 | taken from :obj:`VOC_BBOX_LABEL_NAMES` in this module (plus a trailing 'bg').
83 | ax (matplotlib.axes.Axis): The visualization is displayed on this
84 | axis. If this is :obj:`None` (default), a new axis is created.
85 |
86 | Returns:
87 | ~matplotlib.axes.Axes:
88 | Returns the Axes object with the plot for further tweaking.
89 |
90 | """
91 |
92 | label_names = list(VOC_BBOX_LABEL_NAMES) + ['bg']
93 | # add for index `-1`
94 | if label is not None and not len(bbox) == len(label):
95 | raise ValueError('The length of label must be same as that of bbox')
96 | if score is not None and not len(bbox) == len(score):
97 | raise ValueError('The length of score must be same as that of bbox')
98 |
99 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None
100 | ax = vis_image(img, ax=ax)
101 |
102 | # If there is no bounding box to display, visualize the image and exit.
103 | if len(bbox) == 0:
104 | return ax
105 |
106 | for i, bb in enumerate(bbox):
107 | xy = (bb[1], bb[0])
108 | height = bb[2] - bb[0]
109 | width = bb[3] - bb[1]
110 | ax.add_patch(plot.Rectangle(
111 | xy, width, height, fill=False, edgecolor='red', linewidth=2))
112 |
113 | caption = list()
114 |
115 | if label is not None and label_names is not None:
116 | lb = label[i]
117 | if not (-1 <= lb < len(label_names)):  # modified here to allow the background index (-1)
118 | raise ValueError('No corresponding name is given')
119 | caption.append(label_names[lb])
120 | if score is not None:
121 | sc = score[i]
122 | caption.append('{:.2f}'.format(sc))
123 |
124 | if len(caption) > 0:
125 | ax.text(bb[1], bb[0],
126 | ': '.join(caption),
127 | style='italic',
128 | bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 0})
129 | return ax
130 |
131 |
132 | def fig2data(fig):
133 | """
134 | Convert a Matplotlib figure to a numpy array with RGBA
135 | channels and return it.
136 |
137 | @param fig: a matplotlib figure
138 | @return a numpy 3D array of RGBA values
139 | """
140 | # draw the renderer
141 | fig.canvas.draw()
142 |
143 | # Get the RGBA buffer from the figure
144 | w, h = fig.canvas.get_width_height()
145 | buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
146 | buf.shape = (w, h, 4)
147 |
148 | # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode
149 | buf = np.roll(buf, 3, axis=2)
150 | return buf.reshape(h, w, 4)
151 |
152 |
153 | def fig4vis(fig):
154 | """
155 | convert figure to ndarray
156 | """
157 | ax = fig.get_figure()
158 | img_data = fig2data(ax).astype(np.int32)
159 | plot.close()
160 | # HWC->CHW
161 | return img_data[:, :, :3].transpose((2, 0, 1)) / 255.
162 |
163 |
164 | def visdom_bbox(*args, **kwargs):
165 | fig = vis_bbox(*args, **kwargs)
166 | data = fig4vis(fig)
167 | return data
168 |
169 |
170 | class Visualizer(object):
171 | """
172 | Wrapper for visdom.
173 | You can still access native visdom functions via
174 | self.line, self.scatter, self._send, etc.,
175 | thanks to the implementation of `__getattr__`.
176 | """
177 |
178 | def __init__(self, env='default', **kwargs):
179 | self.vis = visdom.Visdom(env=env, **kwargs)
180 | self._vis_kw = kwargs
181 |
182 | # e.g. ('loss', 23): the 23rd recorded value of 'loss'
183 | self.index = {}
184 | self.log_text = ''
185 |
186 | def reinit(self, env='default', **kwargs):
187 | """
188 | change the config of visdom
189 | """
190 | self.vis = visdom.Visdom(env=env, **kwargs)
191 | return self
192 |
193 | def plot_many(self, d):
194 | """
195 | plot multi values
196 | @params d: dict (name,value) i.e. ('loss',0.11)
197 | """
198 | for k, v in d.items():
199 | if v is not None:
200 | self.plot(k, v)
201 |
202 | def img_many(self, d):
203 | for k, v in d.items():
204 | self.img(k, v)
205 |
206 | def plot(self, name, y, **kwargs):
207 | """
208 | self.plot('loss',1.00)
209 | """
210 | x = self.index.get(name, 0)
211 | self.vis.line(Y=np.array([y]), X=np.array([x]),
212 | win=name,
213 | opts=dict(title=name),
214 | update=None if x == 0 else 'append',
215 | **kwargs
216 | )
217 | self.index[name] = x + 1
218 |
219 | def img(self, name, img_, **kwargs):
220 | """
221 | self.img('input_img',t.Tensor(64,64))
222 | self.img('input_imgs',t.Tensor(3,64,64))
223 | self.img('input_imgs',t.Tensor(100,1,64,64))
224 | self.img('input_imgs',t.Tensor(100,3,64,64),nrows=10)
225 | do NOT call self.img('input_imgs', t.Tensor(100, 64, 64), nrows=10)
226 | """
227 | self.vis.images(t.Tensor(img_).cpu().numpy(),
228 | win=name,
229 | opts=dict(title=name),
230 | **kwargs
231 | )
232 |
233 | def log(self, info, win='log_text'):
234 | """
235 | self.log({'loss':1,'lr':0.0001})
236 | """
237 | self.log_text += ('[{time}] {info} <br>'.format(
238 | time=time.strftime('%m%d_%H%M%S'), \
239 | info=info))
240 | self.vis.text(self.log_text, win)
241 |
242 | def __getattr__(self, name):
243 | return getattr(self.vis, name)
244 |
245 | def state_dict(self):
246 | return {
247 | 'index': self.index,
248 | 'vis_kw': self._vis_kw,
249 | 'log_text': self.log_text,
250 | 'env': self.vis.env
251 | }
252 |
253 | def load_state_dict(self, d):
254 | self.vis = visdom.Visdom(env=d.get('env', self.vis.env), **(d.get('vis_kw', dict())))
255 | self.log_text = d.get('log_text', '')
256 | self.index = d.get('index', dict())
257 | return self
258 |
--------------------------------------------------------------------------------
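
A minimal usage sketch for the Visualizer wrapper above (not part of the repository); it
assumes a visdom server is already running with default settings, and the values are
illustrative.

    import numpy as np
    from lib.vis_tool import Visualizer, visdom_bbox

    vis = Visualizer(env='frcnn-demo')
    for loss in [0.9, 0.7, 0.55]:
        vis.plot('total_loss', loss)    # each call appends one point to the curve

    img = np.random.randint(0, 255, (3, 224, 224)).astype(np.float32)
    bbox = np.array([[30., 40., 150., 200.]])
    label = np.array([11])              # index into VOC_BBOX_LABEL_NAMES ('dog')
    vis.img('pred_img', visdom_bbox(img, bbox, label))
    vis.log({'epoch': 1, 'loss': 0.55})
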
/losses.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from torch import nn
4 | import torch as t
5 | from torch.autograd import Variable
6 | import lib.array_tool as at
7 | from torch.nn import functional as F
8 | from config import opt
9 | from lib.bbox_tools import bbox_iou
10 | from lib.array_tool import tonumpy
11 | def _smooth_l1_loss(x, t, in_weight, sigma):
12 | sigma2 = sigma ** 2
13 | diff = in_weight * (x - t)
14 | abs_diff = diff.abs()
15 | flag = (abs_diff.data < (1. / sigma2)).float()
16 | flag = Variable(flag)
17 | y = (flag * (sigma2 / 2.) * (diff ** 2) +
18 | (1 - flag) * (abs_diff - 0.5 / sigma2))
19 | return y.sum()
20 |
21 |
22 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
23 | in_weight = t.zeros(gt_loc.shape).cuda()
24 | # Localization loss is calculated only for positive rois.
25 | # NOTE: unlike the original implementation,
26 | # we don't need inside_weight and outside_weight; they can be derived from gt_label.
27 | in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
28 | loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma)
29 | # Normalize by the total number of negative and positive rois.
30 | loc_loss /= (gt_label >= 0).sum().float() # ignore gt_label==-1 for rpn_loss
31 | return loc_loss
32 |
33 | class RPNLoss(nn.Module):
34 | def __init__(self):
35 | super(RPNLoss, self).__init__()
36 | self.rpn_sigma = opt.rpn_sigma
37 |
38 | def forward(self, gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores):
39 | # Since batch size is one, convert variables to singular form
40 | rpn_score = rpn_scores[0]
41 | rpn_loc = rpn_locs[0]
42 |
43 |
44 | # ------------------ RPN losses -------------------#
45 |
46 | gt_rpn_label = at.tovariable(gt_rpn_label).long()
47 | gt_rpn_loc = at.tovariable(gt_rpn_loc)
48 | rpn_loc_loss = _fast_rcnn_loc_loss(
49 | rpn_loc,
50 | gt_rpn_loc,
51 | gt_rpn_label.data,
52 | self.rpn_sigma)
53 |
54 | # NOTE: default value of ignore_index is -100 ...
55 | rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
56 | return [rpn_loc_loss, rpn_cls_loss]
57 |
58 | class ROILoss(nn.Module):
59 | def __init__(self):
60 | super(ROILoss, self).__init__()
61 | self.roi_sigma = opt.roi_sigma
62 | def forward(self,gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score):
63 | n_sample = roi_cls_loc.shape[0]
64 | roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
65 | gt_roi_label = at.tovariable(gt_roi_label).long()
66 | gt_roi_loc = at.tovariable(gt_roi_loc)
67 | roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), at.totensor(gt_roi_label).long()]
68 |
69 | roi_loc_loss = _fast_rcnn_loc_loss(
70 | roi_loc.contiguous(),
71 | gt_roi_loc,
72 | gt_roi_label.data,
73 | self.roi_sigma)
74 |
75 | roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
76 | return [roi_loc_loss,roi_cls_loss]
77 |
78 | class RelationNetworksLoss(nn.Module):
79 | def __init__(self):
80 | super(RelationNetworksLoss, self).__init__()
81 |
82 | def forward(self, gt_bboxes, gt_labels, nms_scores, sorted_labels, sorted_cls_bboxes):
83 | if nms_scores is None:
84 | return [1.]
85 | sorted_score, prob_argsort = t.sort(nms_scores, descending=True)
86 | sorted_cls_bboxes = sorted_cls_bboxes[prob_argsort]
87 | sorted_labels = sorted_labels[prob_argsort]
88 | sorted_labels = tonumpy(sorted_labels)
89 | gt_labels = tonumpy(gt_labels)
90 |
91 | nms_gt = t.zeros_like(sorted_score)
92 |
93 | eps = 1e-8
94 |
95 | iou = bbox_iou(tonumpy(gt_bboxes[0]), tonumpy(sorted_cls_bboxes))
96 | for gt_idx in range(len(iou)):
97 | accept_iou = np.reshape(np.argwhere(iou[gt_idx] > 0.5),-1)
98 | accept_label = np.reshape(np.argwhere(sorted_labels[accept_iou] == gt_labels[0][gt_idx]),-1)
99 |
100 | if not(len(accept_label)==0):
101 | nms_gt[accept_iou[accept_label[0]]] = 1.
102 |
103 | loss = nms_gt * (sorted_score+ eps).log() + (1 - nms_gt) * (1-sorted_score + eps).log()
104 | loss = -loss.mean()
105 | return [loss]
106 |
--------------------------------------------------------------------------------
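
A quick numeric check of _smooth_l1_loss above (not part of the repository). Per
element, with difference d = x - t and weight 1, the loss is 0.5 * sigma**2 * d**2 when
|d| < 1 / sigma**2 and |d| - 0.5 / sigma**2 otherwise; the numbers below are
illustrative and run on CPU.

    import torch as t
    from losses import _smooth_l1_loss

    x = t.Tensor([[0.5, 2.0]])        # predicted offsets
    target = t.Tensor([[0.0, 0.0]])   # regression targets
    in_weight = t.Tensor([[1., 1.]])  # only weighted elements contribute
    loss = _smooth_l1_loss(x, target, in_weight, sigma=1.)
    print(float(loss))  # 0.5 * 0.5**2 + (2.0 - 0.5) = 1.625
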
/model.py:
--------------------------------------------------------------------------------
1 | import time
2 | import math
3 | import torch.utils.model_zoo as model_zoo
4 | import six
5 |
6 | from torch.nn import functional as F
7 | from losses import ROILoss, RPNLoss, RelationNetworksLoss
8 | from lib.nms import non_maximum_suppression
9 | from collections import namedtuple
10 | from string import Template
11 | import lib.array_tool as at
12 | from config import opt
13 | from data.dataset import preprocess, VGGpreprocess
14 | from lib.bbox_tools import loc2bbox
15 |
16 | import torch as t
17 | from torch.autograd import Function
18 |
19 | from lib.roi_cupy import kernel_backward, kernel_forward
20 | from lib.creator_tool import ProposalCreator, ProposalTargetCreator, AnchorTargetCreator
21 | from lib.relation_tool import PositionalEmbedding, RankEmbedding
22 | from torchvision.models import vgg16_bn,squeezenet1_1
23 |
24 |
25 |
26 |
27 | import torch
28 | import torch.nn as nn
29 | import numpy as np
30 | import cupy as cp
31 |
32 |
33 | Stream = namedtuple('Stream', ['ptr'])
34 |
35 | @cp.util.memoize(for_each_device=True)
36 | def load_kernel(kernel_name, code, **kwargs):
37 | cp.cuda.runtime.free(0)
38 | code = Template(code).substitute(**kwargs)
39 | kernel_code = cp.cuda.compile_with_cache(code)
40 | return kernel_code.get_function(kernel_name)
41 |
42 | CUDA_NUM_THREADS = 1024
43 |
44 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
45 | return (N + K - 1) // K
46 |
47 | class SqueezeFRCN(nn.Module):
48 | feat_stride = 16  # the SqueezeNet feature extractor downsamples the input 16x
49 | def __init__(self, num_classes):
50 | super(SqueezeFRCN, self).__init__()
51 | self.loc_normalize_mean = (0., 0., 0., 0.)
52 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
53 | self.n_class = num_classes +1
54 | self.training = False
55 |
56 | model = squeezenet1_1(pretrained=True)
57 | self.feature_extractor = model.features
58 |
59 | # freeze
60 | for layer in self.feature_extractor[:5]:
61 | for p in layer.parameters():
62 | p.requires_grad = False
63 |
64 | self.rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=self.feat_stride)
65 | self.roi_head = RoIHead(n_class=self.n_class, roi_size=7, spatial_scale=(1. / self.feat_stride),
66 | in_channels=512, fc_features=512, n_relations=0)
67 |
68 | self.proposal_target_creator = ProposalTargetCreator()
69 | self.anchor_target_creator = AnchorTargetCreator()
70 |
71 | self.roiLoss = ROILoss()
72 | self.rpnLoss = RPNLoss()
73 |
74 | def forward(self,inputs,scale = 1.):
75 | if self.training:
76 | img_batch, bboxes, labels, _ = inputs
77 | else:
78 | img_batch = inputs
79 |
80 | _, _, H, W = img_batch.shape
81 | img_size = (H, W)
82 | start = time.time()
83 | features = self.feature_extractor(img_batch)
84 | rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features, img_size, scale)
85 | if self.training:
86 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
87 | at.tonumpy(bboxes[0]),
88 | anchor,
89 | img_size)
90 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
91 | rois,
92 | at.tonumpy(bboxes[0]),
93 | at.tonumpy(labels[0]),
94 | self.loc_normalize_mean,
95 | self.loc_normalize_std)
96 | sample_roi_index = t.zeros(len(sample_roi))
97 |
98 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index)
99 |
100 | return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \
101 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes
102 | else:
103 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices)
104 |
105 | return roi_cls_loc, roi_score, rois, roi_indices, appearance_features, img_size
106 | def get_loss(self,inputs,isLearnNMS):
107 | gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \
108 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs)
109 | if(isLearnNMS):
110 | rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores)
111 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score)
112 | nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score,
113 | appearance_features, img_size)
114 | nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes)
115 | losses = rpn_loss+roi_loss+nms_loss
116 | losses = [sum(losses)]+losses
117 | return losses
118 | else:
119 | rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores)
120 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score)
121 | losses = rpn_loss + roi_loss
122 | losses = [sum(losses)]+losses
123 | return losses
124 | def predict(self, imgs, sizes=None, visualize=False):
125 | if visualize:
126 | self.use_preset(isTraining=False, preset='visualize')
127 | prepared_imgs = list()
128 | for img in imgs:
129 | size = img.shape[1:]
130 | img = VGGpreprocess(at.tonumpy(img))
131 | prepared_imgs.append(img)
132 | else:
133 | self.use_preset(isTraining=False, preset='evaluate')
134 | prepared_imgs = imgs
135 |
136 | bboxes = list()
137 | labels = list()
138 | scores = list()
139 | for img in prepared_imgs:
140 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
141 | size = img.shape[2:]
142 | scale = np.array(1.)
143 | roi_cls_loc, roi_scores, rois, _,_ ,_ = self(img, scale=scale)
144 | # We are assuming that batch size is 1.
145 | roi_score = roi_scores.data
146 | roi_cls_loc = roi_cls_loc.data
147 |
148 | roi = at.totensor(rois)
149 |
150 | # Convert predictions to bounding boxes in image coordinates.
151 | # Bounding boxes are scaled to the scale of the input images.
152 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \
153 | repeat(self.n_class)[None]
154 | std = t.Tensor(self.loc_normalize_std).cuda(). \
155 | repeat(self.n_class)[None]
156 |
157 | roi_cls_loc = (roi_cls_loc * std + mean)
158 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
159 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
160 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
161 | at.tonumpy(roi_cls_loc).reshape((-1, 4)))
162 | cls_bbox = at.totensor(cls_bbox)
163 | cls_bbox = cls_bbox.view(-1, self.n_class * 4)
164 | # clip bounding box
165 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
166 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
167 |
168 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))
169 |
170 | raw_cls_bbox = at.tonumpy(cls_bbox)
171 | raw_prob = at.tonumpy(prob)
172 |
173 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
174 | bboxes.append(bbox)
175 | labels.append(label)
176 | scores.append(score)
177 |
178 | return bboxes, labels, scores
179 |
180 | def _suppress(self, raw_cls_bbox, raw_prob):
181 | bbox = list()
182 | label = list()
183 | score = list()
184 | # skip cls_id = 0 because it is the background class
185 | for l in range(1, self.n_class):
186 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
187 | prob_l = raw_prob[:, l]
188 | mask = prob_l > self.score_thresh
189 | cls_bbox_l = cls_bbox_l[mask]
190 | prob_l = prob_l[mask]
191 | keep = non_maximum_suppression(
192 | cp.array(cls_bbox_l), self.nms_thresh, prob_l)
193 | keep = cp.asnumpy(keep)
194 | bbox.append(cls_bbox_l[keep])
195 | # The labels are in [0, self.n_class - 2].
196 | label.append((l - 1) * np.ones((len(keep),)))
197 | score.append(prob_l[keep])
198 | bbox = np.concatenate(bbox, axis=0).astype(np.float32)
199 | label = np.concatenate(label, axis=0).astype(np.int32)
200 | score = np.concatenate(score, axis=0).astype(np.float32)
201 | return bbox, label, score
202 | def freeze_bn(self):
203 | '''Freeze BatchNorm layers.'''
204 | for layer in self.modules():
205 | if isinstance(layer, nn.BatchNorm2d):
206 | layer.eval()
207 | def use_preset(self,isTraining,preset='visualize'):
208 | if preset == 'visualize':
209 | self.nms_thresh = 0.3
210 | self.score_thresh = 0.7
211 | elif preset == 'evaluate':
212 | self.nms_thresh = 0.3
213 | self.score_thresh = 0.05
214 | self.training=isTraining
215 | def get_optimizer(self):
216 | """
217 | Return the optimizer. It could be overridden if you want to specify a
218 | special optimizer.
219 | """
220 | lr = opt.lr
221 | params = []
222 | for key, value in dict(self.named_parameters()).items():
223 | if value.requires_grad:
224 | if 'bias' in key:
225 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
226 | else:
227 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}]
228 | if(opt.use_adam):
229 | optimizer = t.optim.Adam(params)
230 | else:
231 | optimizer = t.optim.SGD(params,momentum = 0.9)
232 | return optimizer
233 | class VGGFRCN(nn.Module):
234 | feat_stride = 16 # downsample 16x for output of conv5 in vgg16
235 | def __init__(self, num_classes):
236 | super(VGGFRCN, self).__init__()
237 | self.loc_normalize_mean = (0., 0., 0., 0.)
238 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
239 | self.n_class = num_classes+1
240 | self.training = False
241 | model = vgg16_bn(pretrained=True)
242 | self.feature_extractor = model.features[:43]
243 | # freeze the first four conv layers of the VGG-16 backbone (features[:14])
244 | for layer in self.feature_extractor[:14]:
245 | for p in layer.parameters():
246 | p.requires_grad = False
247 |
248 | classifier = model.classifier
249 | del classifier[6]
250 | del classifier[5]
251 | del classifier[2]
252 | classifier = nn.Sequential(*classifier)
253 |
254 | self.rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=self.feat_stride)
255 |
256 | self.roi_head = RoIHead(n_class=self.n_class, roi_size=7, spatial_scale=(1. / self.feat_stride), n_relations=0,
257 | in_channels=512, fc_features=4096, classifier = classifier)
258 |
259 | self.proposal_target_creator = ProposalTargetCreator()
260 | self.anchor_target_creator = AnchorTargetCreator()
261 |
262 | self.roiLoss = ROILoss()
263 | self.rpnLoss = RPNLoss()
264 | self.freeze_bn()
265 | def forward(self,inputs, scale=1.):
266 | if self.training:
267 | img_batch, bboxes, labels, _ = inputs
268 | else:
269 | img_batch = inputs
270 |
271 | _, _, H, W = img_batch.shape
272 | img_size = (H, W)
273 |
274 | features = self.feature_extractor(img_batch)
275 | rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features, img_size, scale)
276 |
277 | if self.training:
278 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
279 | at.tonumpy(bboxes[0]),
280 | anchor,
281 | img_size)
282 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
283 | rois,
284 | at.tonumpy(bboxes[0]),
285 | at.tonumpy(labels[0]),
286 | self.loc_normalize_mean,
287 | self.loc_normalize_std)
288 | sample_roi_index = t.zeros(len(sample_roi))
289 |
290 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index)
291 |
292 | return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \
293 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes
294 |
295 | else:
296 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices)
297 |
298 | return roi_cls_loc, roi_score, rois, roi_indices, appearance_features, img_size
299 |
300 | def get_loss(self,inputs,isLearnNMS):
301 | gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \
302 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs)
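# Editor's note: the isLearnNMS branch below uses self.duplicate_remover and
# self.nmsLoss, neither of which is created in VGGFRCN.__init__ above (they only
# appear in the commented-out ResFRCN). Unless those modules are attached, call
# get_loss with isLearnNMS=False.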
303 | if(isLearnNMS):
304 | rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores)
305 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score)
306 | nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score,
307 | appearance_features, img_size)
308 | nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes)
309 | losses = rpn_loss+roi_loss+nms_loss
310 | losses = [sum(losses)]+losses
311 | return losses
312 | else:
313 | rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores)
314 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score)
315 | losses = rpn_loss + roi_loss
316 | losses = [sum(losses)]+losses
317 | return losses
318 | def predict(self, imgs, sizes=None, visualize=False):
319 | if visualize:
320 | self.use_preset(isTraining=False, preset='visualize')
321 | prepared_imgs = list()
322 | for img in imgs:
323 | size = img.shape[1:]
324 | img = VGGpreprocess(at.tonumpy(img))
325 | prepared_imgs.append(img)
326 | else:
327 | self.use_preset(isTraining=False, preset='evaluate')
328 | prepared_imgs = imgs
329 |
330 | bboxes = list()
331 | labels = list()
332 | scores = list()
333 | for img in prepared_imgs:
334 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
335 | size = img.shape[2:]
336 | scale = np.array(1.)
337 | roi_cls_loc, roi_scores, rois, _,_ ,_ = self(img, scale=scale)
338 | # We are assuming that batch size is 1.
339 | roi_score = roi_scores.data
340 | roi_cls_loc = roi_cls_loc.data
341 |
342 | roi = at.totensor(rois)
343 |
344 | # Convert predictions to bounding boxes in image coordinates.
345 | # Bounding boxes are scaled to the scale of the input images.
346 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \
347 | repeat(self.n_class)[None]
348 | std = t.Tensor(self.loc_normalize_std).cuda(). \
349 | repeat(self.n_class)[None]
350 |
351 | roi_cls_loc = (roi_cls_loc * std + mean)
352 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
353 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
354 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
355 | at.tonumpy(roi_cls_loc).reshape((-1, 4)))
356 | cls_bbox = at.totensor(cls_bbox)
357 | cls_bbox = cls_bbox.view(-1, self.n_class * 4)
358 | # clip bounding box
359 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
360 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
361 |
362 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))
363 |
364 | raw_cls_bbox = at.tonumpy(cls_bbox)
365 | raw_prob = at.tonumpy(prob)
366 |
367 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
368 | bboxes.append(bbox)
369 | labels.append(label)
370 | scores.append(score)
371 |
372 | return bboxes, labels, scores
373 |
374 | def _suppress(self, raw_cls_bbox, raw_prob):
375 | bbox = list()
376 | label = list()
377 | score = list()
378 | # skip cls_id = 0 because it is the background class
379 | for l in range(1, self.n_class):
380 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
381 | prob_l = raw_prob[:, l]
382 | mask = prob_l > self.score_thresh
383 | cls_bbox_l = cls_bbox_l[mask]
384 | prob_l = prob_l[mask]
385 | keep = non_maximum_suppression(
386 | cp.array(cls_bbox_l), self.nms_thresh, prob_l)
387 | keep = cp.asnumpy(keep)
388 | bbox.append(cls_bbox_l[keep])
389 | # The labels are in [0, self.n_class - 2].
390 | label.append((l - 1) * np.ones((len(keep),)))
391 | score.append(prob_l[keep])
392 | bbox = np.concatenate(bbox, axis=0).astype(np.float32)
393 | label = np.concatenate(label, axis=0).astype(np.int32)
394 | score = np.concatenate(score, axis=0).astype(np.float32)
395 | return bbox, label, score
396 | def freeze_bn(self):
397 | '''Freeze BatchNorm layers.'''
398 | for layer in self.modules():
399 | if isinstance(layer, nn.BatchNorm2d):
400 | layer.eval()
401 | def use_preset(self,isTraining,preset='visualize'):
402 | if preset == 'visualize':
403 | self.nms_thresh = 0.3
404 | self.score_thresh = 0.7
405 | elif preset == 'evaluate':
406 | self.nms_thresh = 0.3
407 | self.score_thresh = 0.05
408 | self.training=isTraining
409 | def get_optimizer(self):
410 | """
411 | Return the optimizer. It can be overridden if you want to specify
412 | a special optimizer.
413 | """
414 | lr = opt.lr
415 | params = []
416 | for key, value in dict(self.named_parameters()).items():
417 | if value.requires_grad:
418 | if 'bias' in key:
419 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
420 | else:
421 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}]
422 | if(opt.use_adam):
423 | optimizer = t.optim.Adam(params)
424 | else:
425 | optimizer = t.optim.SGD(params,momentum = 0.9)
426 | return optimizer
427 |
428 | # class ResFRCN(nn.Module):
429 | # feat_stride = 16 # downsample 32x for output of convolution resnet
430 | # def __init__(self, num_classes, block, layers):
431 | # self.training=False
432 | # self.inplanes = 64
433 | # self.loc_normalize_mean = (0., 0., 0., 0.)
434 | # self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
435 | # self.n_class = num_classes+1
436 | #
437 | # super(ResFRCN, self).__init__()
438 | # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
439 | # self.bn1 = nn.BatchNorm2d(64)
440 | # self.relu = nn.ReLU(inplace=True)
441 | # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
442 | # self.layer1 = self._make_layer(block, 64, layers[0])
443 | # self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
444 | # self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
445 | # self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
446 | #
447 | # if block == BasicBlock:
448 | # fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels,
449 | # self.layer4[layers[3]-1].conv2.out_channels]
450 | # self.conv2 = nn.Conv2d(self.layer4[layers[3]-1].conv2.out_channels, 512, kernel_size=1, stride=1, bias=False)
451 | # elif block == Bottleneck:
452 | # fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels,
453 | # self.layer4[layers[3]-1].conv3.out_channels]
454 | # self.conv2 = nn.Conv2d(self.layer4[layers[3]-1].conv3.out_channels, 512, kernel_size=1, stride=1, bias=False)
455 | #
456 | # #self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2],feature_size = 512)
457 | #
458 | # self.rpn = RegionProposalNetwork(in_channels=512,mid_channels=512,feat_stride = self.feat_stride)
459 | # self.roi_head = RoIHead(n_class = num_classes+1,roi_size=7,spatial_scale=(1. / self.feat_stride),
460 | # in_channels=512,fc_features = 1024, n_relations= 16)
461 | # self.duplicate_remover = DuplicationRemovalNetwork(n_relations=16,appearance_feature_dim=1024,
462 | # num_classes=num_classes)
463 | # self.proposal_target_creator = ProposalTargetCreator()
464 | # self.anchor_target_creator = AnchorTargetCreator()
465 | #
466 | # self.roiLoss = ROILoss()
467 | # self.rpnLoss = RPNLoss()
468 | # self.nmsLoss = RelationNetworksLoss()
469 | # for m in self.modules():
470 | # if isinstance(m, nn.Conv2d):
471 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
472 | # m.weight.data.normal_(0, math.sqrt(2. / n))
473 | # elif isinstance(m, nn.BatchNorm2d):
474 | # m.weight.data.fill_(1)
475 | # m.bias.data.zero_()
476 | #
477 | #
478 | # self.freeze_bn()
479 | # def use_preset(self,isTraining,preset='visualize'):
480 | # if preset == 'visualize':
481 | # self.nms_thresh = 0.3
482 | # self.score_thresh = 0.7
483 | # elif preset == 'evaluate':
484 | # self.nms_thresh = 0.3
485 | # self.score_thresh = 0.5
486 | # self.training=isTraining
487 | # def _make_layer(self, block, planes, blocks, stride=1):
488 | # downsample = None
489 | # if stride != 1 or self.inplanes != planes * block.expansion:
490 | # downsample = nn.Sequential(
491 | # nn.Conv2d(self.inplanes, planes * block.expansion,
492 | # kernel_size=1, stride=stride, bias=False),
493 | # nn.BatchNorm2d(planes * block.expansion),
494 | # )
495 | # layers = []
496 | # layers.append(block(self.inplanes, planes, stride, downsample))
497 | # self.inplanes = planes * block.expansion
498 | # for i in range(1, blocks):
499 | # layers.append(block(self.inplanes, planes))
500 | # return nn.Sequential(*layers)
501 | # def freeze_bn(self):
502 | # '''Freeze BatchNorm layers.'''
503 | # for layer in self.modules():
504 | # if isinstance(layer, nn.BatchNorm2d):
505 | # layer.eval()
506 | # def forward(self, inputs, scale=1.):
507 | # if self.training:
508 | # img_batch, bboxes, labels, _ = inputs
509 | # else:
510 | # img_batch = inputs
511 | #
512 | # _, _, H, W = img_batch.shape
513 | # img_size = (H, W)
514 | # x = self.conv1(img_batch)
515 | # x = self.bn1(x)
516 | # x = self.relu(x)
517 | # x = self.maxpool(x)
518 | # x1 = self.layer1(x)
519 | # x2 = self.layer2(x1)
520 | # x3 = self.layer3(x2)
521 | # x4 = self.layer4(x3)
522 | #
523 | # #features = self.fpn([x2, x3, x4])
524 | # features = self.conv2(x4)
525 | # rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features,img_size,scale)
526 | #
527 | # if self.training:
528 | # gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
529 | # at.tonumpy(bboxes[0]),
530 | # anchor,
531 | # img_size)
532 | # sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
533 | # rois,
534 | # at.tonumpy(bboxes[0]),
535 | # at.tonumpy(labels[0]),
536 | # self.loc_normalize_mean,
537 | # self.loc_normalize_std)
538 | # sample_roi_index = t.zeros(len(sample_roi))
539 | #
540 | # roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index)
541 | #
542 | # return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores,\
543 | # sample_roi, roi_cls_loc, roi_score, appearance_features, img_size,labels, bboxes
544 | #
545 | # else:
546 | # roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices)
547 | #
548 | # return roi_cls_loc,roi_score, rois, roi_indices, appearance_features, img_size
549 | #
550 | # def _suppress(self, raw_cls_bbox, raw_prob):
551 | # bbox = list()
552 | # label = list()
553 | # score = list()
554 | # # skip cls_id = 0 because it is the background class
555 | # for l in range(1, self.n_class):
556 | # cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
557 | # prob_l = raw_prob[:, l]
558 | # mask = prob_l > self.score_thresh
559 | # cls_bbox_l = cls_bbox_l[mask]
560 | # prob_l = prob_l[mask]
561 | # keep = non_maximum_suppression(
562 | # cp.array(cls_bbox_l), self.nms_thresh, prob_l)
563 | # keep = cp.asnumpy(keep)
564 | # bbox.append(cls_bbox_l[keep])
565 | # # The labels are in [0, self.n_class - 2].
566 | # label.append((l - 1) * np.ones((len(keep),)))
567 | # score.append(prob_l[keep])
568 | # bbox = np.concatenate(bbox, axis=0).astype(np.float32)
569 | # label = np.concatenate(label, axis=0).astype(np.int32)
570 | # score = np.concatenate(score, axis=0).astype(np.float32)
571 | # return bbox, label, score
572 | # def predict(self, imgs, sizes=None, visualize=False):
573 | # if visualize:
574 | # self.use_preset(isTraining=False, preset='visualize')
575 | # prepared_imgs = list()
576 | # sizes = list()
577 | # for img in imgs:
578 | # size = img.shape[1:]
579 | # img = preprocess(at.tonumpy(img))
580 | # prepared_imgs.append(img)
581 | # sizes.append(size)
582 | # else:
583 | # self.use_preset(isTraining=False, preset='evaluate')
584 | # prepared_imgs = imgs
585 | # bboxes = list()
586 | # labels = list()
587 | # scores = list()
588 | # for img, size in zip(prepared_imgs, sizes):
589 | # img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
590 | # scale = img.shape[3] / size[1]
591 | # roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
592 | # # We are assuming that batch size is 1.
593 | # roi_score = roi_scores.data
594 | # roi_cls_loc = roi_cls_loc.data
595 | # if visualize:
596 | # roi = at.totensor(rois) / scale
597 | # else:
598 | # roi = at.totensor(rois) / scale.cuda().float()
599 | #
600 | # # Convert predictions to bounding boxes in image coordinates.
601 | # # Bounding boxes are scaled to the scale of the input images.
602 | # mean = t.Tensor(self.loc_normalize_mean).cuda(). \
603 | # repeat(self.n_class)[None]
604 | # std = t.Tensor(self.loc_normalize_std).cuda(). \
605 | # repeat(self.n_class)[None]
606 | #
607 | # roi_cls_loc = (roi_cls_loc * std + mean)
608 | # roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
609 | # roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
610 | # cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
611 | # at.tonumpy(roi_cls_loc).reshape((-1, 4)))
612 | # cls_bbox = at.totensor(cls_bbox)
613 | # cls_bbox = cls_bbox.view(-1, self.n_class * 4)
614 | # # clip bounding box
615 | # cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
616 | # cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
617 | #
618 | # prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))
619 | #
620 | # raw_cls_bbox = at.tonumpy(cls_bbox)
621 | # raw_prob = at.tonumpy(prob)
622 | #
623 | # bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
624 | # bboxes.append(bbox)
625 | # labels.append(label)
626 | # scores.append(score)
627 | #
628 | # # self.use_preset('evaluate')
629 | # # self.train()
630 | # return bboxes, labels, scores
631 | #
632 | # def get_loss(self,inputs,isLearnNMS):
633 | # gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \
634 | # sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs)
635 | # if(isLearnNMS):
636 | # rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores)
637 | # roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score)
638 | # nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score,
639 | # appearance_features, img_size)
640 | # nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes)
641 | # losses = rpn_loss+roi_loss+nms_loss
642 | # losses = [sum(losses)]+losses
643 | # return losses
644 | # else:
645 | # rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores)
646 | # roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score)
647 | # losses = rpn_loss + roi_loss
648 | # losses = [sum(losses)]+losses
649 | # return losses
650 |
651 | class RegionProposalNetwork(nn.Module):
652 | """Region Proposal Network introduced in Faster R-CNN.
653 |
654 | This is the Region Proposal Network introduced in Faster R-CNN [#]_.
655 | It takes features extracted from images and proposes
656 | class-agnostic bounding boxes around "objects".
657 |
658 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
659 | Faster R-CNN: Towards Real-Time Object Detection with \
660 | Region Proposal Networks. NIPS 2015.
661 |
662 | Args:
663 | in_channels (int): The channel size of input.
664 | mid_channels (int): The channel size of the intermediate tensor.
665 | ratios (list of floats): This is ratios of width to height of
666 | the anchors.
667 | anchor_scales (list of numbers): This is areas of anchors.
668 | Those areas will be the product of the square of an element in
669 | :obj:`anchor_scales` and the original area of the reference
670 | window.
671 | feat_stride (int): Stride size after extracting features from an
672 | image.
673 | initialW (callable): Initial weight value. If :obj:`None` then this
674 | function uses Gaussian distribution scaled by 0.1 to
675 | initialize weight.
676 | May also be a callable that takes an array and edits its values.
677 | proposal_creator_params (dict): Keyword parameters for
678 | :class:`model.utils.creator_tools.ProposalCreator`.
679 |
680 | .. seealso::
681 | :class:`~model.utils.creator_tools.ProposalCreator`
682 |
683 | """
684 |
685 | def __init__(
686 | self, in_channels=256, mid_channels=256, ratios=[0.5, 1, 2],
687 | anchor_scales=[8, 16, 32], feat_stride=32,
688 | proposal_creator_params=dict(),
689 | ):
690 | super(RegionProposalNetwork, self).__init__()
691 | self.anchor_base = self.generate_anchor_base(
692 | anchor_scales=anchor_scales, ratios=ratios)
693 | self.feat_stride = feat_stride
694 | self.proposal_layer = ProposalCreator(self, **proposal_creator_params)
695 | n_anchor = self.anchor_base.shape[0]
696 |
697 | self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
698 | self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
699 | self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
700 |
701 | def forward(self, x, img_size, scale=1.):
702 | """Forward Region Proposal Network.
703 |
704 | Here are notations.
705 |
706 | * :math:`N` is batch size.
707 | * :math:`C` channel size of the input.
708 | * :math:`H` and :math:`W` are height and width of the input feature.
709 | * :math:`A` is number of anchors assigned to each pixel.
710 |
711 | Args:
712 | x (~torch.autograd.Variable): The features extracted from images.
713 | Its shape is :math:`(N, C, H, W)`.
714 | img_size (tuple of ints): A tuple :obj:`height, width`,
715 | which contains image size after scaling.
716 | scale (float): The amount of scaling done to the input images after
717 | reading them from files.
718 |
719 | Returns:
720 | (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array):
721 |
722 | This is a tuple of five following values.
723 |
724 | * **rpn_locs**: Predicted bounding box offsets and scales for \
725 | anchors. Its shape is :math:`(N, H W A, 4)`.
726 | * **rpn_scores**: Predicted foreground scores for \
727 | anchors. Its shape is :math:`(N, H W A, 2)`.
728 | * **rois**: A bounding box array containing coordinates of \
729 | proposal boxes. This is a concatenation of bounding box \
730 | arrays from multiple images in the batch. \
731 | Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \
732 | bounding boxes from the :math:`i` th image, \
733 | :math:`R' = \\sum _{i=1} ^ N R_i`.
734 | * **roi_indices**: An array containing indices of the images \
735 | to which the RoIs correspond. Its shape is :math:`(R',)`.
736 | * **anchor**: Coordinates of enumerated shifted anchors. \
737 | Its shape is :math:`(H W A, 4)`.
738 |
739 | """
740 | n, _, hh, ww = x.shape
741 | anchor = self._enumerate_shifted_anchor_torch(
742 | np.array(self.anchor_base),
743 | self.feat_stride, hh, ww)
744 | n_anchor = anchor.shape[0] // (hh * ww)
745 | h = F.relu(self.conv1(x))
746 |
747 | rpn_locs = self.loc(h)
748 | # NOTE: permute() returns a non-contiguous view, so contiguous() is needed
749 | # before the subsequent view().
750 | rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
751 |
752 | rpn_scores = self.score(h)
753 | rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
754 | rpn_fg_scores = \
755 | rpn_scores.view(n, hh, ww, n_anchor, 2)[:, :, :, :, 1].contiguous()
756 | rpn_fg_scores = rpn_fg_scores.view(n, -1)
757 | rpn_scores = rpn_scores.view(n, -1, 2)
758 |
759 | rois = list()
760 | roi_indices = list()
761 |
762 | for i in range(n):
763 | roi = self.proposal_layer(
764 | rpn_locs[i].cpu().data.numpy(),
765 | rpn_fg_scores[i].cpu().data.numpy(),
766 | anchor, img_size,
767 | scale=scale)
768 | batch_index = i * np.ones((len(roi),), dtype=np.int32)
769 | rois.append(roi)
770 | roi_indices.append(batch_index)
771 |
772 | rois = np.concatenate(rois, axis=0)
773 | roi_indices = np.concatenate(roi_indices, axis=0)
774 | return rpn_locs, rpn_scores, rois, roi_indices, anchor
775 |
776 | def generate_anchor_base(self,base_size=16, ratios=[0.5, 1, 2],
777 | anchor_scales=[8, 16, 32]):
778 | """Generate anchor base windows by enumerating aspect ratio and scales.
779 |
780 | Generate anchors that are scaled and modified to the given aspect ratios.
781 | Area of a scaled anchor is preserved when modifying to the given aspect
782 | ratio.
783 |
784 | :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this
785 | function.
786 | The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor
787 | generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`.
788 |
789 | For example, if the scale is :math:`8` and the ratio is :math:`0.25`,
790 | the width and the height of the base window will be stretched by :math:`8`.
791 | For modifying the anchor to the given aspect ratio,
792 | the height is halved and the width is doubled.
793 |
794 | Args:
795 | base_size (number): The width and the height of the reference window.
796 | ratios (list of floats): This is ratios of width to height of
797 | the anchors.
798 | anchor_scales (list of numbers): This is areas of anchors.
799 | Those areas will be the product of the square of an element in
800 | :obj:`anchor_scales` and the original area of the reference
801 | window.
802 |
803 | Returns:
804 | ~numpy.ndarray:
805 | An array of shape :math:`(R, 4)`.
806 | Each element is a set of coordinates of a bounding box.
807 | The second axis corresponds to
808 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box.
809 |
810 | """
811 | py = base_size / 2.
812 | px = base_size / 2.
813 |
814 | anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
815 | dtype=np.float32)
816 | for i in six.moves.range(len(ratios)):
817 | for j in six.moves.range(len(anchor_scales)):
818 | h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
819 | w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
820 |
821 | index = i * len(anchor_scales) + j
822 | anchor_base[index, 0] = py - h / 2.
823 | anchor_base[index, 1] = px - w / 2.
824 | anchor_base[index, 2] = py + h / 2.
825 | anchor_base[index, 3] = px + w / 2.
826 | return anchor_base
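# Worked example (editor's note, values rounded): with base_size=16, ratio=0.5 and
# anchor_scale=8 the loop above produces
#   h = 16 * 8 * sqrt(0.5) ~  90.5,   w = 16 * 8 * sqrt(2.0) ~ 181.0
# so, centred on (py, px) = (8, 8), the anchor is approximately
#   (y_min, x_min, y_max, x_max) = (-37.3, -82.5, 53.3, 98.5).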
827 |
828 | def _enumerate_shifted_anchor_torch(self,anchor_base, feat_stride, height, width):
829 | # Enumerate all shifted anchors:
830 | #
831 | # add A anchors (1, A, 4) to
832 | # cell K shifts (K, 1, 4) to get
833 | # shift anchors (K, A, 4)
834 | # reshape to (K*A, 4) shifted anchors
835 | # return (K*A, 4)
836 |
837 | # !TODO: add support for torch.CudaTensor
838 | # xp = cuda.get_array_module(anchor_base)
839 | shift_y = t.arange(0, height * feat_stride, feat_stride)
840 | shift_x = t.arange(0, width * feat_stride, feat_stride)
841 | shift_x, shift_y = np.meshgrid(shift_x, shift_y)
842 | shift = np.stack((shift_y.ravel(), shift_x.ravel(),
843 | shift_y.ravel(), shift_x.ravel()), axis=1)
844 |
845 | A = anchor_base.shape[0]
846 | K = shift.shape[0]
847 | anchor = anchor_base.reshape((1, A, 4)) + \
848 | shift.reshape((1, K, 4)).transpose((1, 0, 2))
849 | anchor = anchor.reshape((K * A, 4)).astype(np.float32)
850 | return anchor
851 |
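# Minimal usage sketch for the RPN (editor's note; assumes a CUDA device and the
# compiled cupy NMS extension, shapes are illustrative only):
#   rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=16).cuda()
#   feats = t.randn(1, 512, 38, 50).cuda()      # e.g. a 608x800 image downsampled 16x
#   locs, scores, rois, roi_indices, anchors = rpn(feats, img_size=(608, 800), scale=1.)
#   # locs: (1, 38*50*9, 4), scores: (1, 38*50*9, 2), rois: (R', 4) in (y_min, x_min, y_max, x_max)
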
852 | class RoI(Function):
853 | """
854 | NOTE: only CUDA-compatible.
855 | """
856 |
857 | def __init__(self, outh, outw, spatial_scale):
858 | self.forward_fn = load_kernel('roi_forward', kernel_forward)
859 | self.backward_fn = load_kernel('roi_backward', kernel_backward)
860 | self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale
861 |
862 | def forward(self, x, rois):
863 | # NOTE: MAKE SURE input is contiguous too
864 | x = x.contiguous()
865 | rois = rois.contiguous()
866 | self.in_size = B, C, H, W = x.size() ## 1, 128, heights/32, width/32
867 | self.N = N = rois.size(0) ## 128
868 | output = t.zeros(N, C, self.outh, self.outw).cuda() ## 128,128,7,7
869 | self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
870 | self.rois = rois
871 | args = [x.data_ptr(), rois.data_ptr(),
872 | output.data_ptr(),
873 | self.argmax_data.data_ptr(),
874 | self.spatial_scale, C, H, W,
875 | self.outh, self.outw,
876 | output.numel()]
877 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
878 | self.forward_fn(args=args,
879 | block=(CUDA_NUM_THREADS, 1, 1),
880 | grid=(GET_BLOCKS(output.numel()), 1, 1),
881 | stream=stream)
882 | return output
883 |
884 | def backward(self, grad_output):
885 | # NOTE: grad_output must be contiguous before its data pointer is handed
886 | # to the CUDA kernel.
887 | grad_output = grad_output.contiguous()
888 | B, C, H, W = self.in_size
889 | grad_input = t.zeros(self.in_size).cuda()
890 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
891 | args = [grad_output.data_ptr(),
892 | self.argmax_data.data_ptr(),
893 | self.rois.data_ptr(),
894 | grad_input.data_ptr(),
895 | self.N, self.spatial_scale, C, H, W, self.outh, self.outw,
896 | grad_input.numel()]
897 | self.backward_fn(args=args,
898 | block=(CUDA_NUM_THREADS, 1, 1),
899 | grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
900 | stream=stream
901 | )
902 | return grad_input, None
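# Editor's note: RoI uses the legacy, stateful torch.autograd.Function interface
# (instance __init__ plus forward/backward); PyTorch >= 1.0 expects @staticmethod
# forward/backward with a ctx argument. A roughly equivalent built-in is
# torchvision.ops.roi_pool(features, boxes, output_size=(outh, outw), spatial_scale=spatial_scale),
# but note that torchvision expects boxes as (batch_idx, x_min, y_min, x_max, y_max).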
903 | class RoIPooling2D(nn.Module):
904 |
905 | def __init__(self, outh, outw, spatial_scale):
906 | super(RoIPooling2D, self).__init__()
907 | self.RoI = RoI(outh, outw, spatial_scale)
908 |
909 | def forward(self, x, rois):
910 | return self.RoI(x, rois)
911 |
912 | class DuplicationRemovalNetwork(nn.Module):
913 | def __init__(self,n_relations = 16, appearance_feature_dim=1024,num_classes=20,d_f=128):
914 | super(DuplicationRemovalNetwork, self).__init__()
915 | self.loc_normalize_mean = (0., 0., 0., 0.)
916 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
917 | self.key_feature_dim = int(appearance_feature_dim/n_relations)
918 | self.geo_feature_dim = int(appearance_feature_dim/n_relations)
919 | self.appearance_feature_dim=appearance_feature_dim
920 | self.n_class = num_classes+1
921 |
922 | self.nms_rank_fc = nn.Linear(appearance_feature_dim, d_f, bias=True)
923 | self.roi_feat_embedding_fc = nn.Linear(appearance_feature_dim,d_f,bias=True)
924 | self.relation_module = RelationModule(n_relations=n_relations,appearance_feature_dim=d_f,
925 | key_feature_dim=64,
926 | geo_feature_dim=64,isDuplication=True)
927 |
928 | self.nms_logit_fc = nn.Linear(appearance_feature_dim,1,bias=True)
929 | self.sigmoid = nn.Sigmoid()
930 | def forward(self,sample_roi,roi_cls_loc, roi_score, appearance_features,size):
931 | N = sample_roi.shape[0]
932 | roi_score = roi_score.data
933 | roi_cls_loc = roi_cls_loc.data
934 | roi = at.totensor(sample_roi)
935 |
936 |
937 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \
938 | repeat(self.n_class)[None]
939 | std = t.Tensor(self.loc_normalize_std).cuda(). \
940 | repeat(self.n_class)[None]
941 |
942 | roi_cls_loc = (roi_cls_loc * std + mean)
943 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
944 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
945 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
946 | at.tonumpy(roi_cls_loc).reshape((-1, 4)))
947 | cls_bbox = at.totensor(cls_bbox)
948 | cls_bbox = cls_bbox.view(-1, self.n_class , 4)
949 | # clip bounding box
950 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
951 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
952 |
953 | prob = F.softmax(at.tovariable(roi_score), dim=1)
954 |
955 | prob,prob_argmax = torch.max(prob,dim=-1)
956 | cls_bbox = cls_bbox[np.arange(start=0,stop=N),prob_argmax]
957 |
958 | nonzero_idx=torch.nonzero(prob_argmax)
959 |
960 | if(nonzero_idx.size()[0]==0):
961 | return None,None,None
962 | else:
963 | nonzero_idx = nonzero_idx[:, 0]
964 | prob_argmax = prob_argmax[nonzero_idx]
965 | prob = prob[nonzero_idx]
966 | cls_bbox = cls_bbox[nonzero_idx]
967 | appearance_features_nobg = appearance_features[nonzero_idx]
968 | sorted_score,prob_argsort = torch.sort(prob,descending=True)
969 |
970 | sorted_prob = prob[prob_argsort]
971 | sorted_cls_bboxes = cls_bbox[prob_argsort]
972 | sorted_labels = prob_argmax[prob_argsort]
973 | sorted_features = appearance_features_nobg[prob_argsort]
974 |
975 | nms_rank_embedding = RankEmbedding(sorted_prob.size()[0],self.appearance_feature_dim)
976 | nms_rank = self.nms_rank_fc(nms_rank_embedding)
977 | roi_feat_embedding = self.roi_feat_embedding_fc(sorted_features)
978 | nms_embedding_feat = nms_rank + roi_feat_embedding
979 | position_embedding = PositionalEmbedding(sorted_cls_bboxes,dim_g = self.geo_feature_dim)
980 | nms_logit = self.relation_module([sorted_features, nms_embedding_feat,position_embedding])
981 | nms_logit = self.nms_logit_fc(nms_logit)
982 | s1 = self.sigmoid(nms_logit).view(-1)
983 | nms_scores = s1 * sorted_prob
984 |
985 | return nms_scores, sorted_labels-1, sorted_cls_bboxes
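# Editor's note: this is the learned NMS head of Relation Networks. Boxes whose
# argmax class is background (index 0) are dropped, the rest are sorted by class
# score, and the final detection score is s = sigmoid(relation_logit) * class_score;
# labels are shifted by -1 so that background is excluded from the output.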
986 | class RelationModule(nn.Module):
987 | def __init__(self,n_relations = 16, appearance_feature_dim=1024,key_feature_dim = 64, geo_feature_dim = 64, isDuplication = False):
988 | super(RelationModule, self).__init__()
989 | self.isDuplication=isDuplication
990 | self.Nr = n_relations
991 | self.dim_g = geo_feature_dim
992 | self.relation = nn.ModuleList()
993 | for N in range(self.Nr):
994 | self.relation.append(RelationUnit(appearance_feature_dim, key_feature_dim, geo_feature_dim))
995 | def forward(self, input_data ):
996 | if(self.isDuplication):
997 | f_a, embedding_f_a, position_embedding =input_data
998 | else:
999 | f_a, position_embedding = input_data
1000 | isFirst=True
1001 | for N in range(self.Nr):
1002 | if(isFirst):
1003 | if(self.isDuplication):
1004 | concat = self.relation[N](embedding_f_a,position_embedding)
1005 | else:
1006 | concat = self.relation[N](f_a,position_embedding)
1007 | isFirst=False
1008 | else:
1009 | if(self.isDuplication):
1010 | concat = torch.cat((concat, self.relation[N](embedding_f_a, position_embedding)), -1)
1011 | else:
1012 | concat = torch.cat((concat, self.relation[N](f_a, position_embedding)), -1)
1013 | return concat+f_a
1014 | class RelationUnit(nn.Module):
1015 | def __init__(self, appearance_feature_dim=1024,key_feature_dim = 64, geo_feature_dim = 64):
1016 | super(RelationUnit, self).__init__()
1017 | self.dim_g = geo_feature_dim
1018 | self.dim_k = key_feature_dim
1019 | self.WG = nn.Linear(geo_feature_dim, 1, bias=True)
1020 | self.WK = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True)
1021 | self.WQ = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True)
1022 | self.WV = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True)
1023 | self.relu = nn.ReLU(inplace=True)
1024 |
1025 |
1026 | def forward(self, f_a, position_embedding):
1027 | N,_ = f_a.size()
1028 |
1029 | position_embedding = position_embedding.view(-1,self.dim_g)
1030 |
1031 | w_g = self.relu(self.WG(position_embedding))
1032 | w_k = self.WK(f_a)
1033 | w_k = w_k.view(N,1,self.dim_k)
1034 |
1035 | w_q = self.WQ(f_a)
1036 | w_q = w_q.view(1,N,self.dim_k)
1037 |
1038 | scaled_dot = torch.sum((w_k*w_q),-1 )
1039 | scaled_dot = scaled_dot / np.sqrt(self.dim_k)
1040 |
1041 | w_g = w_g.view(N,N)
1042 | w_a = scaled_dot.view(N,N)
1043 |
1044 | w_mn = torch.log(torch.clamp(w_g, min = 1e-6)) + w_a
1045 | w_mn = torch.nn.Softmax(dim=1)(w_mn)
1046 |
1047 | w_v = self.WV(f_a)
1048 |
1049 | w_mn = w_mn.view(N,N,1)
1050 | w_v = w_v.view(1,N,-1)  # fix: index values by the attended box; with view(N,1,-1) the softmax weights sum out and the relation output collapses to W_V*f_a
1051 |
1052 | output = w_mn*w_v
1053 |
1054 | output = torch.sum(output,-2)
1055 | return output
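# Editor's note: the block above implements the relation weight of Hu et al.,
# "Relation Networks for Object Detection" (CVPR 2018):
#   w_a[m,n] = <W_K f_m, W_Q f_n> / sqrt(d_k)
#   w[m,n]   = softmax_n( log(max(w_g[m,n], 1e-6)) + w_a[m,n] )
#            = w_g[m,n] * exp(w_a[m,n]) / sum_k w_g[m,k] * exp(w_a[m,k])
#   output_m = sum_n w[m,n] * (W_V f_n)
# The log/clamp trick folds the geometric weight w_g into a single softmax.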
1056 |
1057 | class RoIHead(nn.Module):
1058 | """Faster R-CNN Head for VGG-16 based implementation.
1059 | This class is used as a head for Faster R-CNN.
1060 | This outputs class-wise localizations and classification based on feature
1061 | maps in the given RoIs.
1062 |
1063 | Args:
1064 | n_class (int): The number of classes possibly including the background.
1065 | roi_size (int): Height and width of the feature maps after RoI-pooling.
1066 | spatial_scale (float): Factor by which RoI coordinates are scaled to map them onto the feature map (typically 1 / feat_stride).
1067 | classifier (nn.Module): Optional two-layer linear classifier ported from VGG-16; if None, one is built internally.
1068 |
1069 | """
1070 |
1071 | def __init__(self, n_class, roi_size, spatial_scale,
1072 | in_channels = 128,fc_features = 1024, n_relations = 0 , classifier = None):
1073 | # n_class includes the background
1074 | super(RoIHead, self).__init__()
1075 | if classifier is None:
1076 | self.n_relations=n_relations
1077 | fully_connected1 = nn.Linear(7*7*in_channels, fc_features)
1078 | relu1 = nn.ReLU(inplace=True)
1079 |
1080 | fully_connected2 = nn.Linear(fc_features, fc_features)
1081 | relu2 = nn.ReLU(inplace=True)
1082 | if(n_relations>0):
1083 | self.dim_g = int(fc_features/n_relations)
1084 | relation1= RelationModule(n_relations = n_relations, appearance_feature_dim=fc_features,
1085 | key_feature_dim = self.dim_g, geo_feature_dim = self.dim_g)
1086 |
1087 | relation2 = RelationModule(n_relations=n_relations, appearance_feature_dim=fc_features,
1088 | key_feature_dim=self.dim_g, geo_feature_dim=self.dim_g)
1089 | self.classifier = nn.Sequential(fully_connected1, relu1, relation1,
1090 | fully_connected2, relu2, relation2)
1091 | else:
1092 | self.classifier = nn.Sequential(fully_connected1, relu1,
1093 | fully_connected2, relu2)
1094 | else :
1095 | self.classifier = classifier
1096 |
1097 | self.cls_loc = nn.Linear(fc_features, n_class * 4)
1098 | self.score = nn.Linear(fc_features, n_class)
1099 | normal_init(self.cls_loc, 0, 0.001)
1100 | normal_init(self.score, 0, 0.01)
1101 | self.n_class = n_class
1102 | self.roi_size = roi_size
1103 | self.spatial_scale = spatial_scale
1104 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale)
1105 |
1106 | def forward(self, x, rois, roi_indices):
1107 | """Forward the chain.
1108 |
1109 | We assume that there are :math:`N` batches.
1110 |
1111 | Args:
1112 | x (Variable): 4D image variable.
1113 | rois (Tensor): A bounding box array containing coordinates of
1114 | proposal boxes. This is a concatenation of bounding box
1115 | arrays from multiple images in the batch.
1116 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
1117 | RoIs from the :math:`i` th image,
1118 | :math:`R' = \\sum _{i=1} ^ N R_i`.
1119 | roi_indices (Tensor): An array containing indices of the images
1120 | to which the bounding boxes correspond. Its shape is :math:`(R',)`.
1121 |
1122 | """
1123 | # in case roi_indices is ndarray
1124 | roi_indices = at.totensor(roi_indices).float()
1125 | rois = at.totensor(rois).float()
1126 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
1127 | # NOTE: important: yx->xy
1128 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
1129 | indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous())
1130 | if(self.n_relations>0):
1131 | position_embedding = PositionalEmbedding(indices_and_rois[:, 1:],dim_g = self.dim_g)
1132 |
1133 | pool = self.roi(x, indices_and_rois)
1134 |
1135 | pool = pool.view(pool.size(0), -1)
1136 |
1137 | fc7 = self.classifier(pool)
1138 | roi_cls_locs = self.cls_loc(fc7)
1139 | roi_scores = self.score(fc7)
1140 | return roi_cls_locs, roi_scores, fc7
1141 |
1142 | class VGG16RoIHead(nn.Module):
1143 | """Faster R-CNN Head for VGG-16 based implementation.
1144 | This class is used as a head for Faster R-CNN.
1145 | This outputs class-wise localizations and classification based on feature
1146 | maps in the given RoIs.
1147 |
1148 | Args:
1149 | n_class (int): The number of classes possibly including the background.
1150 | roi_size (int): Height and width of the feature maps after RoI-pooling.
1151 | spatial_scale (float): Factor by which RoI coordinates are scaled to map them onto the feature map (typically 1 / feat_stride).
1152 | classifier (nn.Module): Two-layer linear classifier ported from VGG-16.
1153 | """
1154 |
1155 | def __init__(self, n_class, roi_size, spatial_scale,
1156 | classifier):
1157 | # n_class includes the background
1158 | super(VGG16RoIHead, self).__init__()
1159 |
1160 | self.classifier = classifier
1161 | self.cls_loc = nn.Linear(4096, n_class * 4)
1162 | self.score = nn.Linear(4096, n_class)
1163 |
1164 | normal_init(self.cls_loc, 0, 0.001)
1165 | normal_init(self.score, 0, 0.01)
1166 |
1167 | self.n_class = n_class
1168 | self.roi_size = roi_size
1169 | self.spatial_scale = spatial_scale
1170 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale)
1171 |
1172 | def forward(self, x, rois, roi_indices):
1173 | """Forward the chain.
1174 |
1175 | We assume that there are :math:`N` batches.
1176 |
1177 | Args:
1178 | x (Variable): 4D image variable.
1179 | rois (Tensor): A bounding box array containing coordinates of
1180 | proposal boxes. This is a concatenation of bounding box
1181 | arrays from multiple images in the batch.
1182 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
1183 | RoIs from the :math:`i` th image,
1184 | :math:`R' = \\sum _{i=1} ^ N R_i`.
1185 | roi_indices (Tensor): An array containing indices of the images
1186 | to which the bounding boxes correspond. Its shape is :math:`(R',)`.
1187 |
1188 | """
1189 | # in case roi_indices is ndarray
1190 | roi_indices = at.totensor(roi_indices).float()
1191 | rois = at.totensor(rois).float()
1192 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
1193 | # NOTE: important: yx->xy
1194 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
1195 | indices_and_rois = xy_indices_and_rois.contiguous()
1196 |
1197 | pool = self.roi(x, indices_and_rois)
1198 | pool = pool.view(pool.size(0), -1)
1199 | fc7 = self.classifier(pool)
1200 | roi_cls_locs = self.cls_loc(fc7)
1201 | roi_scores = self.score(fc7)
1202 | return roi_cls_locs, roi_scores, fc7
1203 |
1204 | def normal_init(m, mean, stddev, truncated=False):
1205 | """
1206 | weight initializer: truncated normal and random normal.
1207 | """
1208 | # x is a parameter
1209 | if truncated:
1210 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation
1211 | else:
1212 | m.weight.data.normal_(mean, stddev)
1213 | m.bias.data.zero_()
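# End-to-end usage sketch (editor's note; assumes a CUDA device, the compiled cupy
# NMS extension, and placeholder tensors img_batch, gt_bboxes, gt_labels, scale):
#   net = VGGFRCN(num_classes=20).cuda()
#   net.use_preset(isTraining=True)
#   losses = net.get_loss([img_batch, gt_bboxes, gt_labels, scale], isLearnNMS=False)
#   losses[0].backward()                      # losses[0] is the summed total loss
#   boxes, labels, scores = net.predict([chw_image], visualize=True)   # inference on one CHW image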
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import numpy as np
4 | from torch.utils import data as data_
5 | import model
6 |
7 | from trainer import Trainer
8 | import torch
9 | import torch.optim as optim
10 | from data.dataset import Dataset, TestDataset
11 | from config import opt
12 | import cv2,time
13 |
14 | def run_train(train_verbose=False):
15 | dataset = Dataset(opt)
16 | dataloader = data_.DataLoader(dataset, \
17 | batch_size=opt.batch_size, \
18 | shuffle=True, \
19 | # pin_memory=True,
20 | num_workers=opt.num_workers)
21 |
22 | testset = TestDataset(opt)
23 | test_dataloader = data_.DataLoader(testset,
24 | batch_size=opt.batch_size,
25 | num_workers=opt.num_workers,
26 | shuffle=False#, \
27 | #pin_memory=True
28 | )
29 |
30 | resnet = model.resnet101(20,True).cuda()
31 |
32 | optimizer = optim.Adam(resnet.parameters(), lr=opt.lr)
33 |
34 | loss_hist = collections.deque(maxlen=500)
35 | epoch_loss_hist = []
36 | resnet_trainer = Trainer(resnet,optimizer,model_name=opt.model_name)
37 |
38 | freeze_num = 8  # number of leading child modules of the pretrained model to keep frozen
39 | best_loss = 10
40 | best_loss_epoch_num = -1
41 | num_bad_epochs = 0
42 | max_bad_epochs = 5
43 | resnet_trainer.model_freeze(freeze_num=freeze_num)
44 |
45 | for epoch_num in range(opt.epoch):
46 | resnet_trainer.train_mode(freeze_num)
47 | train_start_time = time.time()
48 | train_epoch_loss = []
49 | start = time.time()
50 | for iter_num, data in enumerate(dataloader):
51 | curr_loss = resnet_trainer.train_step(data)
52 | loss_hist.append(float(curr_loss))
53 | train_epoch_loss.append(float(curr_loss))
54 |
55 | if (train_verbose):
56 | print('Epoch: {} | Iteration: {} | loss: {:1.5f} | Running loss: {:1.5f} | Iter time: {:1.5f} | Train'
57 | ' time: {:1.5f}'.format(epoch_num, iter_num, float(curr_loss), np.mean(loss_hist),
58 | time.time()-start, time.time()-train_start_time))
59 | start = time.time()
60 |
61 | del curr_loss
62 | print('train epoch time :', time.time() - train_start_time)
63 | print('Epoch: {} | epoch train loss: {:1.5f}'.format(
64 | epoch_num, np.mean(train_epoch_loss)))
65 |
66 | vali_start_time = time.time()
67 |
68 | #resnet_trainer.eval_mode()
69 | vali_epoch_loss = []
70 | for iter_num, data in enumerate(test_dataloader):
71 | curr_loss = resnet_trainer.get_loss(data)
72 | vali_epoch_loss.append(float(curr_loss))
73 |
74 | del curr_loss
75 |
76 | epoch_loss_hist.append(np.mean(vali_epoch_loss))
77 |
78 | # vali_eval_result = resnet_trainer.run_eval(test_dataloader)
79 | # print(vali_eval_result)
80 | print('vali epoch time :', time.time() - vali_start_time)
81 | print('Epoch: {} | epoch vali loss: {:1.5f}'.format(
82 | epoch_num, np.mean(vali_epoch_loss)))
83 |
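# Plateau handling (editor's note): if the validation loss fails to improve for more
# than max_bad_epochs consecutive epochs, reload the best checkpoint, cut the
# learning rate by 10x, and freeze more of the backbone before continuing.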
84 | if (best_loss < np.mean(vali_epoch_loss)):
85 | num_bad_epochs += 1
86 | else:
87 | best_loss = np.mean(vali_epoch_loss)
88 | best_loss_epoch_num = epoch_num
89 | num_bad_epochs = 0
90 | resnet_trainer.model_save(epoch_num)
91 | if (num_bad_epochs > max_bad_epochs):
92 | freeze_num=11
93 | num_bad_epochs = 0
94 | resnet_trainer.model_load(best_loss_epoch_num)
95 | resnet_trainer.reduce_lr(factor=0.1, verbose=True)
96 | resnet_trainer.model_freeze(freeze_num=freeze_num)
97 |
98 | print('best epoch num', best_loss_epoch_num)
99 | print('----------------------------------------')
100 |
101 | print(epoch_loss_hist)
102 |
103 |
104 | if __name__ == "__main__":
105 | run_train(train_verbose = True)
--------------------------------------------------------------------------------
/train_history.txt:
--------------------------------------------------------------------------------
1 | resnet18_pyramid_no_relation
2 | Epoch: 36 | epoch train loss: 0.44957
3 | Epoch: 36 | epoch vali loss: 0.79966
4 | best epoch num 36
5 |
6 | -----------------------------------------------
7 | resnet101_pyramid_no_relation
8 | Epoch: 53 | epoch train loss: 0.29215
9 | Epoch: 53 | epoch vali loss: 0.76078
10 | best epoch num 53
11 |
12 | --------------------------------------------
13 | resnet101_pyramid_no_relation_e2e
14 | Epoch: 14 | epoch train loss: 3.83095
15 | Epoch: 14 | epoch vali loss: 3.70913
16 | best epoch num 14
17 |
18 | ---------------------------------------------
19 | resnet101_pyramid_no_relation_e2e
20 | Epoch: 6 | epoch train loss: 0.49311
21 | Epoch: 6 | epoch vali loss: 0.98503
22 | best epoch num 6
23 |
24 | --------------------------------------------
25 | resnet101_pyramid_no_relation_e2e
26 | Epoch: 7 | epoch train loss: 0.52024
27 | Epoch: 7 | epoch vali loss: 0.94327
28 | best epoch num 7
29 |
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from evaluate import eval
3 | from config import opt
4 |
5 | class Trainer():
6 | def __init__(self, my_model, optimizer, model_name):
7 | self.my_model=my_model
8 | self.optimizer=optimizer
9 | self.model_name = model_name
10 | self.my_model.train()
11 | self.my_model.use_preset(isTraining=True)
12 | self.my_model.freeze_bn()
13 |
14 | def train_mode(self,freeze_num):
15 | self.my_model.train()
16 | self.my_model.use_preset(isTraining=True)
17 | self.my_model.freeze_bn()
18 | self.model_freeze(freeze_num)
19 |
20 | def eval_mode(self):
21 | self.my_model.eval()
22 | self.my_model.use_preset(isTraining=False, preset='evaluate')
23 | for child in self.my_model.children():
24 | for param in child.parameters():
25 | param.requires_grad = False
26 | def train_step(self, data):
27 | self.optimizer.zero_grad()
28 | losses = self.my_model.get_loss(
29 | [data[0].cuda().float(), data[1].cuda().float(), data[2].cuda().float(), data[3].cuda().float()],opt.isLearnNMS)
30 | if losses[0]==0.:
31 | return 1.
32 | losses[0].backward()
33 | torch.nn.utils.clip_grad_norm_(self.my_model.parameters(), 0.1)
34 |
35 | self.optimizer.step()
36 |
37 | curr_loss = losses[0].item()
38 | return curr_loss
39 | def get_loss(self, data):
40 | losses = self.my_model.get_loss(
41 | [data[0].cuda().float(), data[2].cuda().float(), data[3].cuda().float(), data[4].cuda().float()],opt.isLearnNMS)
42 | if losses[0]==0.:
43 | return 1.
44 | curr_loss = losses[0].item()
45 | return curr_loss
46 |
47 | def model_save(self,epoch_num):
48 | torch.save(self.my_model.state_dict(), 'Weights/'+self.model_name+'_{}.pt'.format(epoch_num))
49 |
50 | def model_load(self,epoch_num):
51 | self.my_model.load_state_dict(torch.load('Weights/'+self.model_name+'_{}.pt'.format(epoch_num)),False)
52 |
53 | def reduce_lr(self,factor=0.1,verbose=True):
54 | for i, param_group in enumerate(self.optimizer.param_groups):
55 | old_lr = float(param_group['lr'])
56 | new_lr = old_lr * factor
57 | param_group['lr'] = new_lr
58 | if verbose:
59 | print('reducing learning rate'
60 | ' of group {} to {:.4e}.'.format( i, new_lr))
61 |
62 | def model_freeze(self,freeze_num):
63 | if(freeze_num == -1):
64 | return
65 | child_count = 0
66 | for child in self.my_model.children():
67 | if(child_count < freeze_num):
68 | for param in child.parameters():
69 | param.requires_grad = False
70 | else:
71 | for param in child.parameters():
72 | param.requires_grad = True
73 | child_count+=1
74 | self.my_model.freeze_bn()
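# Editor's note: freeze_num counts the model's immediate child modules in definition
# order; the first freeze_num children get requires_grad=False, the rest are trainable,
# and freeze_num == -1 leaves all parameters untouched. BatchNorm layers are always
# kept in eval mode via freeze_bn().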
75 |
76 | def run_eval(self, data_loader,test_num=1000000):
77 | return eval(data_loader, self.my_model, test_num)
--------------------------------------------------------------------------------