├── cfgs
│   ├── voc.json
│   ├── __init__.py
│   └── config.py
├── lib
│   ├── __init__.py
│   ├── bbox
│   │   ├── __init__.py
│   │   ├── box_transform.py
│   │   └── bbox.py
│   ├── det_ops
│   │   ├── __init__.py
│   │   ├── anchors.py
│   │   ├── anchor_target.py
│   │   └── loss.py
│   └── nms
│       ├── .gitignore
│       ├── __init__.py
│       ├── Makefile
│       ├── gpu_nms.hpp
│       ├── nms_wrapper.py
│       ├── gpu_nms.pyx
│       ├── cpu_nms.pyx
│       ├── setup.py
│       ├── cpu_soft_nms.pyx
│       └── nms_kernel.cu
├── models
│   ├── __init__.py
│   ├── fpn.py
│   ├── retina.py
│   └── resnet.py
├── utils
│   ├── __init__.py
│   ├── logger.py
│   └── visualization.py
├── datasets
│   ├── __init__.py
│   ├── utils.py
│   ├── icdar15.py
│   ├── synthtext.py
│   └── minibatch.py
├── evaluation
│   ├── __init__.py
│   └── evaluation.py
├── .DS_Store
├── images
│   ├── .DS_Store
│   ├── icdar.png
│   └── synth.png
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── RetinaNet-Text-Detection.iml
│   └── workspace.xml
├── README.md
├── LICENSE
├── train.py
└── test.py
/cfgs/voc.json:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cfgs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lib/bbox/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lib/det_ops/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lib/nms/.gitignore:
--------------------------------------------------------------------------------
1 | *.cpp
2 |
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | from .nms_wrapper import nms, soft_nms
2 |
3 | __all__ = ['nms', 'soft_nms']
4 |
--------------------------------------------------------------------------------
/images/icdar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/icdar.png
--------------------------------------------------------------------------------
/images/synth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/synth.png
--------------------------------------------------------------------------------
/lib/nms/Makefile:
--------------------------------------------------------------------------------
1 |
2 | all:
3 | echo "Compiling nms kernels..."
4 | python setup.py build_ext --inplace
5 |
6 | clean:
7 | rm -f *.so
8 |
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.hpp:
--------------------------------------------------------------------------------
1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
2 | int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
3 | size_t nms_Malloc();
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## RetinaNet-Text-Detection
2 |
3 | **Work in progress (Fork of [wondervictor/RetinaNet](https://github.com/wondervictor/RetinaNet))**
4 |
5 | RetinaNet for text detection, implemented in pure PyTorch.
6 |
7 |
8 | ### Results
9 |
10 | * ICDAR
11 |
12 | 
13 |
14 | * SynthText
15 |
16 | 
17 |
18 |
19 | ### License
20 | 
21 | This project is released under the **MIT License**.
22 |
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Training Logger
4 |
5 | """
6 | import torch
7 |
8 |
9 | class Logger:
10 |     """Training logger placeholder; the checkpoint helpers below are used instead."""
11 |     def __init__(self):
12 |         pass
13 |
14 |
15 | def save_checkpoints(model, optimizer, epoch, iteration, path):
16 |
17 | state_dict = {
18 | "model": model.state_dict(),
19 | "optimizer": optimizer.state_dict(),
20 | "epoch": epoch,
21 | "iteration": iteration
22 | }
23 |
24 | torch.save(state_dict, path)
25 |
26 |
27 | def load_checkpoints(path):
28 | state_dict = torch.load(path)
29 |
30 | return state_dict['model'], state_dict['optimizer'], state_dict['epoch'], state_dict['iteration']
31 |
32 |
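33 | # Illustrative usage (a sketch; `model`, `optimizer`, and the path are
34 | # placeholders, not names defined in this file):
35 | #
36 | #   save_checkpoints(model, optimizer, epoch=3, iteration=1200,
37 | #                    path='checkpoints/retina_e3.pth')
38 | #   m_state, o_state, epoch, it = load_checkpoints('checkpoints/retina_e3.pth')
39 | #   model.load_state_dict(m_state)
40 | #   optimizer.load_state_dict(o_state)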
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Vic Chan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/utils/visualization.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | """
4 |
5 | import cv2
6 | import json
7 | import random
8 | import argparse
9 |
10 |
11 | def show_img(im_path, boxes):
12 |
13 | img = cv2.imread(im_path)
14 | for bb in boxes:
15 | if bb[4] < 0.3:
16 | continue
17 | img = cv2.rectangle(img, (int(bb[0]), int(bb[1])), (int(bb[2]), int(bb[3])), (0, 255, 0), 1)
18 | img = cv2.putText(img, '{}:{:.2f}'.format(bb[5], bb[4]), (int(bb[0]), int(bb[1])+10), cv2.FONT_HERSHEY_SIMPLEX,
19 | 0.5, (0, 255, 0), 1)
20 | cv2.imshow('img', img)
21 |
22 | cv2.waitKey(0)
23 | cv2.destroyAllWindows()
24 |
25 |
26 | def main():
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument('-d', '--dt', default='', type=str)
29 | args = parser.parse_args()
30 | with open(args.dt, 'r') as f:
31 | lines = f.readlines()
32 | lines = [json.loads(x.rstrip('\n')) for x in lines]
33 | random.shuffle(lines)
34 | dt = dict()
35 | for l in lines:
36 | name = l['image_id']
37 | res = l['result']
38 | _boxes = []
39 | for bb in res:
40 | _boxes.append(bb['bbox']+[bb['prob'], bb['class']])
41 | dt[name] = _boxes
42 |
43 | for k in dt.keys():
44 | show_img('/public_datasets/SynthText/'+k, dt[k])
45 |
46 |
47 | if __name__ == '__main__':
48 |
49 | main()
50 |
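51 | # Expected prediction format (inferred from main; values illustrative):
52 | #   {"image_id": "img_1.jpg",
53 | #    "result": [{"bbox": [x1, y1, x2, y2], "prob": 0.92, "class": "text"}]}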
--------------------------------------------------------------------------------
/lib/nms/nms_wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from .gpu_nms import gpu_nms
5 | from .cpu_nms import cpu_nms
6 | from .cpu_soft_nms import cpu_soft_nms
7 |
8 |
9 | def nms(dets, thresh, device_id=None):
10 | """Dispatch to either CPU or GPU NMS implementations."""
11 |
12 |     dets_np = dets
13 |     if isinstance(dets, torch.Tensor):
14 |         if dets.is_cuda:
15 |             device_id = dets.get_device()
16 |         dets_np = dets.detach().cpu().numpy()
17 | 
18 |     if dets_np.shape[0] == 0:
19 |         inds = []
20 |     else:
21 |         inds = (gpu_nms(dets_np, thresh, device_id=device_id)
22 |                 if device_id is not None else cpu_nms(dets_np, thresh))
23 | 
24 |     # `dets` is left untouched above, so this type check works as intended
25 |     if isinstance(dets, torch.Tensor):
26 |         return dets.new_tensor(inds, dtype=torch.long)
27 |     return np.array(inds, dtype=np.int64)
28 |
29 |
30 | def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
31 | if isinstance(dets, torch.Tensor):
32 | _dets = dets.detach().cpu().numpy()
33 | else:
34 | _dets = dets.copy()
35 | assert isinstance(_dets, np.ndarray)
36 |
37 | new_dets, inds = cpu_soft_nms(
38 | _dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
39 |
40 | if isinstance(dets, torch.Tensor):
41 | return dets.new_tensor(
42 | inds, dtype=torch.long), dets.new_tensor(new_dets)
43 | else:
44 | return np.array(
45 |             inds, dtype=np.int64), np.array(
46 | new_dets, dtype=np.float32)
47 |
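48 | 
49 | # Illustrative usage (rows are [x1, y1, x2, y2, score], float32):
50 | #   dets = np.array([[10, 10, 60, 60, 0.9],
51 | #                    [12, 12, 62, 62, 0.8],
52 | #                    [100, 100, 150, 150, 0.7]], dtype=np.float32)
53 | #   nms(dets, thresh=0.5)  # -> array([0, 2]); box 1 overlaps box 0 (IoU ~0.86)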
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 | assert sizeof(int) == sizeof(np.int32_t)
12 |
13 | cdef extern from "gpu_nms.hpp":
14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
15 | size_t nms_Malloc() nogil
16 |
17 | memory_pool = {}
18 |
19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh,
20 | np.int32_t device_id=0):
21 | cdef int boxes_num = dets.shape[0]
22 | cdef int boxes_dim = dets.shape[1]
23 | cdef int num_out
24 | cdef size_t base
25 | cdef np.ndarray[np.int32_t, ndim=1] \
26 | keep = np.zeros(boxes_num, dtype=np.int32)
27 | cdef np.ndarray[np.float32_t, ndim=1] \
28 | scores = dets[:, 4]
29 | cdef np.ndarray[np.int_t, ndim=1] \
30 | order = scores.argsort()[::-1]
31 | cdef np.ndarray[np.float32_t, ndim=2] \
32 | sorted_dets = dets[order, :]
33 | cdef float cthresh = thresh
34 | if device_id not in memory_pool:
35 | with nogil:
36 | base = nms_Malloc()
37 | memory_pool[device_id] = base
38 | # print "malloc", base
39 | base = memory_pool[device_id]
40 | with nogil:
41 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
42 | keep = keep[:num_out]
43 | return list(order[keep])
44 |
--------------------------------------------------------------------------------
/datasets/utils.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Dataset utils
4 |
5 | """
6 | import cv2
7 | import numpy as np
8 | from PIL import Image
9 |
10 |
11 | def flip_pil_img_and_boxes(img, boxes=None):
12 | """ Flip PIL Images and Boxes
13 | Args:
14 | img: PIL Image
15 | boxes: [N, 4]
16 | """
17 | assert isinstance(img, Image.Image), "img should be PIL.Image"
18 | w, h = img.size
19 | flip_img = img.transpose(Image.FLIP_LEFT_RIGHT)
20 | if boxes is not None:
21 | flip_boxes = boxes.copy()
22 | flip_boxes[:, 0] = w - boxes[:, 2] - 1
23 | flip_boxes[:, 2] = w - boxes[:, 0] - 1
24 | return flip_img, flip_boxes
25 | else:
26 | return flip_img
27 |
28 |
29 | def flip_img_boxes(img, boxes=None):
30 |
31 | h, w, c = img.shape
32 | flip_img = cv2.flip(img, 1)
33 | if boxes is not None:
34 | flip_boxes = boxes.copy()
35 | for i in range(flip_boxes.shape[0]):
36 | flip_boxes[i, 0] = w - boxes[i, 2] - 1
37 | flip_boxes[i, 2] = w - boxes[i, 0] - 1
38 | return flip_img, flip_boxes
39 | else:
40 | return flip_img
41 |
42 |
43 | def normalize_image(img):
44 | img = img / 255.0
45 | mean = np.array([.485, .456, .406])
46 | std = np.array([.229, .224, .225])
47 | img = (img - mean) / std
48 | return img
49 |
50 |
51 | def get_im_scale(h, w, target_size, max_size):
52 | img_min_size = min(h, w)
53 | img_max_size = max(h, w)
54 | scale = target_size / img_min_size
55 | if scale * img_max_size > max_size:
56 | scale = max_size / img_max_size
57 |
58 | return int(round(h*scale)), int(round(w*scale)), scale
59 |
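60 | # Example (illustrative): a 480x640 image with target_size=600 and
61 | # max_size=1000 scales by 600/480 = 1.25; 640 * 1.25 = 800 <= 1000, so
62 | # get_im_scale(480, 640, 600, 1000) -> (600, 800, 1.25)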
--------------------------------------------------------------------------------
/lib/det_ops/anchors.py:
--------------------------------------------------------------------------------
1 | """
2 | Generate Anchors
3 | """
4 |
5 | import math
6 | import torch
7 |
8 |
9 | def mesh_grid(x, y):
10 | """ mesh grid
11 |
12 | """
13 | a = torch.arange(0, x)
14 | b = torch.arange(0, y)
15 | xx = a.repeat(y).view(-1, 1)
16 | yy = b.view(-1, 1).repeat(1, x).view(-1, 1)
17 |
18 | return torch.cat([xx, yy], dim=1).float()
19 |
20 |
21 | def compute_anchor_whs(num_layers, areas, aspect_ratios, sizes):
22 | anchors = []
23 | for i in range(len(areas)):
24 | area = areas[i]
25 | for ar in aspect_ratios:
26 | h = math.sqrt(area / ar)
27 | w = h * ar
28 | for s in sizes:
29 | anchor_h = h * s
30 | anchor_w = w * s
31 | anchors.append([anchor_w, anchor_h])
32 | # M * K * 2
33 | # Faster R-CNN: 1*K*2 (1x9x2)
34 | # FPN: 5*K*2 (5x3x2)
35 | # RetinaNet: 5*K*2 (5*9*2)
36 | return torch.Tensor(anchors).view(num_layers, -1, 2)
37 |
38 |
39 | def generate_anchors(anchor_whs, input_size, strides):
40 | """ generate anchors
41 | """
42 | boxes = []
43 | num_strides = len(strides)
44 | num_anchors = anchor_whs.shape[1]
45 |
46 | for i in range(num_strides):
47 | stride = strides[i]
48 | feature_size = input_size / stride
49 | fmw, fmh = int(math.ceil(feature_size[0])), int(math.ceil(feature_size[1]))
50 | xy = mesh_grid(fmh, fmw) + 0.5 # shift to center
51 | xy = (xy * stride).view(fmh, fmw, 1, 2).expand(fmh, fmw, num_anchors, 2)
52 | wh = anchor_whs[i].view(1, 1, num_anchors, 2).expand(fmh, fmw, num_anchors, 2)
53 | box = torch.cat([xy, wh], dim=3)
54 | boxes.append(box.view(-1, 4))
55 | boxes = torch.cat(boxes, 0)
56 |     # boxes: [sum over levels of (H_l * W_l * num_anchors), 4], in (cx, cy, w, h)
57 | return boxes
58 |
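59 | 
60 | if __name__ == '__main__':
61 |     # Illustrative sanity check (not part of the original file): standard
62 |     # RetinaNet settings give 9 anchors per location over 5 pyramid levels.
63 |     whs = compute_anchor_whs(5, [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2],
64 |                              [0.5, 1, 2], [1, 2 ** (1 / 3), 2 ** (2 / 3)])
65 |     boxes = generate_anchors(whs, torch.Tensor([640., 640.]), [8, 16, 32, 64, 128])
66 |     # (80*80 + 40*40 + 20*20 + 10*10 + 5*5) * 9 anchors in (cx, cy, w, h) form
67 |     print(boxes.shape)  # torch.Size([76725, 4])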
--------------------------------------------------------------------------------
/models/fpn.py:
--------------------------------------------------------------------------------
1 | """
2 | Feature Pyramid Network for Object Detection
3 |
4 | """
5 |
6 | import torch
7 | import torch.nn.functional as F
8 | from torchvision.models import resnet
9 | import torch.nn as nn
10 | from .resnet import ResNet50Stages
11 |
12 |
13 | class FPN50(nn.Module):
14 |
15 | def __init__(self, pretrained_path):
16 | super(FPN50, self).__init__()
17 | self.backbone = ResNet50Stages(pretrained_path)
18 |
19 | self.lateral_layer1 = nn.Conv2d(2048, 256, 1)
20 | self.lateral_layer2 = nn.Conv2d(1024, 256, 1)
21 | self.lateral_layer3 = nn.Conv2d(512, 256, 1)
22 |
23 | self.conv6 = nn.Conv2d(2048, 256, 3, padding=1, stride=2)
24 | self.conv7 = nn.Conv2d(256, 256, 3, padding=1, stride=2)
25 | self.relu = nn.ReLU(inplace=True)
26 |
27 | self._weight_initialize()
28 |
29 | def _weight_initialize(self):
30 |
31 | self.lateral_layer1.weight.data.normal_(std=0.01)
32 | self.lateral_layer1.bias.data.fill_(0.0)
33 |
34 | self.lateral_layer2.weight.data.normal_(std=0.01)
35 | self.lateral_layer2.bias.data.fill_(0.0)
36 |
37 | self.lateral_layer3.weight.data.normal_(std=0.01)
38 | self.lateral_layer3.bias.data.fill_(0.0)
39 |
40 | self.conv6.weight.data.normal_(std=0.01)
41 | self.conv6.bias.data.fill_(0.0)
42 |
43 | self.conv7.weight.data.normal_(std=0.01)
44 | self.conv7.bias.data.fill_(0.0)
45 |
46 | def upsample_add(self, x, y):
47 | _, _, h, w = y.size()
48 |         return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + y
49 |
50 | def forward(self, x):
51 | c3, c4, c5 = self.backbone(x)
52 | # c2: 64*4=256 c3: 128*4=512 c4: 256*4=1024 c5: 512*4=2048
53 |
54 | p5 = self.lateral_layer1(c5)
55 |
56 | p4 = self.lateral_layer2(c4)
57 | p4 = self.upsample_add(p5, p4)
58 |
59 | p3 = self.lateral_layer3(c3)
60 | p3 = self.upsample_add(p4, p3)
61 |
62 | p6 = self.conv6(c5)
63 | p7 = self.conv7(self.relu(p6))
64 |
65 | return p3, p4, p5, p6, p7
66 |
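67 | # Shape sketch (illustrative): for a [B, 3, 512, 512] input, p3..p7 carry
68 | # strides 8, 16, 32, 64, 128, i.e. spatial sizes 64, 32, 16, 8, 4, each
69 | # with 256 channels.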
--------------------------------------------------------------------------------
/lib/bbox/box_transform.py:
--------------------------------------------------------------------------------
1 | """
2 | BBox transform
3 | """
4 |
5 | import torch
6 |
7 |
8 | def bbox_transform(boxes, gtboxes):
9 | """ Bounding Box Transform
10 | from groundtruth boxes and proposal boxes to deltas
11 |
12 | Args:
13 |         boxes: [N, 4] torch.Tensor, anchors in center form (xywh)
14 |         gtboxes: [N, 4] torch.Tensor, ground truth in corner form (xyxy)
15 | Return:
16 | delta: [N, 4] torch.Tensor
17 | """
18 | gt_w = gtboxes[:, 2] - gtboxes[:, 0] + 1
19 | gt_h = gtboxes[:, 3] - gtboxes[:, 1] + 1
20 | # center
21 | gt_x = gtboxes[:, 0] + 0.5 * gt_w
22 | gt_y = gtboxes[:, 1] + 0.5 * gt_h
23 |
24 | # Anchors [x,y,w,h]
25 | anchor_x = boxes[:, 0]
26 | anchor_y = boxes[:, 1]
27 | anchor_w = boxes[:, 2]
28 | anchor_h = boxes[:, 3]
29 | # anchor_w = boxes[:, 2] - boxes[:, 0] + 1
30 | # anchor_h = boxes[:, 3] - boxes[:, 1] + 1
31 | # # center
32 | # anchor_x = boxes[:, 0] + 0.5 * anchor_w
33 | # anchor_y = boxes[:, 1] + 0.5 * anchor_h
34 |
35 | delta_x = (gt_x - anchor_x) / anchor_w
36 | delta_y = (gt_y - anchor_y) / anchor_h
37 | delta_w = torch.log(gt_w / anchor_w)
38 | delta_h = torch.log(gt_h / anchor_h)
39 |
40 | # [N, 4]
41 | return torch.stack([delta_x, delta_y, delta_w, delta_h]).transpose(0, 1)
42 |
43 |
44 | def bbox_transform_inv(boxes, delta):
45 | """ Inverse Bounding Box Transform
46 | from deltas and proposal boxes to predicted boxes
47 | Args:
48 | boxes: [N, 4] torch.Tensor (xywh)
49 |         delta: [N, 4] torch.Tensor (dx, dy, dw, dh)
50 | Return:
51 | pred: [N, 4] torch.Tensor (xyxy)
52 | """
53 | pred_boxes = torch.zeros_like(boxes)
54 | pred_x = boxes[:, 0] + boxes[:, 2] * delta[:, 0]
55 | pred_y = boxes[:, 1] + boxes[:, 3] * delta[:, 1]
56 | pred_w = boxes[:, 2] * torch.exp(delta[:, 2])
57 | pred_h = boxes[:, 3] * torch.exp(delta[:, 3])
58 |
59 | pred_boxes[:, 0] = pred_x - 0.5 * pred_w
60 | pred_boxes[:, 1] = pred_y - 0.5 * pred_h
61 | pred_boxes[:, 2] = pred_x + 0.5 * pred_w
62 | pred_boxes[:, 3] = pred_y + 0.5 * pred_h
63 |
64 | return pred_boxes
65 |
66 |
67 | if __name__ == '__main__':
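68 | 
69 |     # Round-trip sanity check (illustrative): encode a gt box against an
70 |     # anchor, decode it, and recover the box up to the +1 pixel convention.
71 |     anchor = torch.Tensor([[50., 50., 100., 100.]])  # center-form xywh
72 |     gt = torch.Tensor([[20., 30., 120., 140.]])      # corner-form xyxy
73 |     delta = bbox_transform(anchor, gt)
74 |     decoded = bbox_transform_inv(anchor, delta)
75 |     print(decoded)  # ~[[20., 30., 121., 141.]]; x2/y2 carry the +1 term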
70 |
--------------------------------------------------------------------------------
/cfgs/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Config File
3 | """
4 |
5 |
6 | config = {
7 |
8 | "synth_baseline": {
9 | # lr and general config
10 | 'base_lr': 1e-2,
11 | "lr_decay": [60000, 80000],
12 | "workers": 8,
13 | "num_classes": 21,
14 | "weight_decay": 1e-4,
15 | "epochs": 200,
16 |
17 | "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth',
18 | "data_dir": "/public_datasets/SynthText",
19 |
20 | # anchor config
21 | "positive_anchor_threshold": 0.5,
22 | "negative_anchor_threshold": 0.4,
23 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)],
24 | "aspect_ratios": [1, 3, 5],
25 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2],
26 | "strides": [8, 16, 32, 64, 128],
27 | "base_size": 8,
28 |
29 | # dataset
30 | "image_scales": [600],
31 | "max_image_size": 1000,
32 |
33 | # test config
34 | "pre_nms_boxes": 1000,
35 | "test_nms": 0.5,
36 | "test_max_boxes": 300,
37 | "cls_thresh": 0.05,
38 |
39 | # log
40 | "logdir": "log",
41 | "tb_dump_dir": "",
42 | "model_dump_dir": "",
43 | },
44 |
45 | "icdar_baseline": {
46 | # lr and general config
47 | 'base_lr': 1e-2,
48 | "lr_decay": [60000, 80000],
49 | "workers": 8,
50 | "num_classes": 21,
51 | "weight_decay": 1e-4,
52 | "epochs": 200,
53 |
54 | "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth',
55 | "data_dir": "/public_datasets/Text/icdar2015/",
56 |
57 | # anchor config
58 | "positive_anchor_threshold": 0.5,
59 | "negative_anchor_threshold": 0.4,
60 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)],
61 | "aspect_ratios": [1, 3, 5],
62 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2],
63 | "strides": [8, 16, 32, 64, 128],
64 | "base_size": 8,
65 |
66 | # dataset
67 | "image_scales": [600],
68 | "max_image_size": 1000,
69 |
70 | # test config
71 | "pre_nms_boxes": 1000,
72 | "test_nms": 0.5,
73 | "test_max_boxes": 300,
74 | "cls_thresh": 0.05,
75 |
76 | # log
77 | "logdir": "log",
78 | "tb_dump_dir": "",
79 | "model_dump_dir": "",
80 | }
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/lib/nms/cpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
12 | return a if a >= b else b
13 |
14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
15 | return a if a <= b else b
16 |
17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh):
18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
23 |
24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
26 |
27 | cdef int ndets = dets.shape[0]
28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \
29 |         np.zeros((ndets), dtype=np.int_)
30 |
31 | # nominal indices
32 | cdef int _i, _j
33 | # sorted indices
34 | cdef int i, j
35 | # temp variables for box i's (the box currently under consideration)
36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea
37 | # variables for computing overlap with box j (lower scoring box)
38 | cdef np.float32_t xx1, yy1, xx2, yy2
39 | cdef np.float32_t w, h
40 | cdef np.float32_t inter, ovr
41 |
42 | keep = []
43 | for _i in range(ndets):
44 | i = order[_i]
45 | if suppressed[i] == 1:
46 | continue
47 | keep.append(i)
48 | ix1 = x1[i]
49 | iy1 = y1[i]
50 | ix2 = x2[i]
51 | iy2 = y2[i]
52 | iarea = areas[i]
53 | for _j in range(_i + 1, ndets):
54 | j = order[_j]
55 | if suppressed[j] == 1:
56 | continue
57 | xx1 = max(ix1, x1[j])
58 | yy1 = max(iy1, y1[j])
59 | xx2 = min(ix2, x2[j])
60 | yy2 = min(iy2, y2[j])
61 | w = max(0.0, xx2 - xx1 + 1)
62 | h = max(0.0, yy2 - yy1 + 1)
63 | inter = w * h
64 | ovr = inter / (iarea + areas[j] - inter)
65 | if ovr >= thresh:
66 | suppressed[j] = 1
67 |
68 | return keep
69 |
--------------------------------------------------------------------------------
/lib/det_ops/anchor_target.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Anchor Layer
4 |
5 |
6 | """
7 |
8 | import torch
9 | import torch.nn.functional as F
10 | import sys
11 | sys.path.append('../')  # make `lib` importable when run as a script
12 | from lib.det_ops.anchors import compute_anchor_whs, generate_anchors
13 | from lib.bbox import bbox, box_transform
14 | from IPython import embed
15 |
16 |
17 | class AnchorLayer:
18 | """ Anchor Layer
19 | """
20 | def __init__(self, strides, areas, aspect_ratios, sizes):
21 | self.aspect_ratios = aspect_ratios
22 | self.areas = areas
23 | self.strides = strides
24 | self.sizes = sizes
25 |
26 | self._anchor_sizes = self._compute_anchor_size()
27 |
28 | # self._num_anchors = len(self.scales) * len(self.aspect_ratios)
29 |
30 | def _compute_anchor_size(self):
31 | return compute_anchor_whs(len(self.strides), self.areas, self.aspect_ratios, self.sizes)
32 |
33 | def _generate_anchors(self, input_size):
34 | boxes = generate_anchors(self._anchor_sizes, input_size, self.strides)
35 | return boxes
36 |
37 | def assign(self, gt_boxes, labels, input_size, neg_thresh=0.4, pos_thresh=0.5):
38 | """ assign groundtruth box to anchor box
39 |
40 | """
41 | anchor_boxes = self._generate_anchors(input_size)
42 | if labels.shape[0] == 0:
43 | return torch.LongTensor([0]*anchor_boxes.shape[0]), torch.zeros_like(anchor_boxes)
44 | # M * N
45 | xyxy_anchors = bbox.xywh2xyxy(anchor_boxes)
46 | ious = bbox.box_overlaps(xyxy_anchors, gt_boxes)
47 | max_ious, max_inds = ious.max(1)
48 | # M * 4
49 | matched_boxes = gt_boxes[max_inds]
50 | box_targets = box_transform.bbox_transform(anchor_boxes, matched_boxes)
51 |
52 | cls_targets = labels[max_inds]
53 | # negative
54 | cls_targets[max_ious < neg_thresh] = 0
55 | # ignore
56 | cls_targets[(max_ious > neg_thresh) & (max_ious < pos_thresh)] = -1
57 | return cls_targets, box_targets
58 |
59 |
60 | if __name__ == '__main__':
61 | # RetinaNet settings
62 | strides = [8, 16, 32, 64, 128]
63 | aspect_ratios = [0.5, 1, 2]
64 | sizes = [1, 2**(1/3), 2**(2/3)]
65 | areas = [32**2, 64**2, 128**2, 256**2, 512**2]
66 | anchor_layer = AnchorLayer(strides=strides, areas=areas, aspect_ratios=aspect_ratios, sizes=sizes)
67 |
68 | boxes = torch.Tensor([[10, 20, 44, 56], [50, 34, 260, 340],
69 | [70, 80, 190, 410], [360, 270, 500, 600]])
70 | labels = torch.LongTensor([3, 1, 1, 4])
71 | cls_target, box_target = anchor_layer.assign(boxes, labels, torch.FloatTensor([600, 600]), 0.4, 0.5)
72 |
73 | embed()
74 |
--------------------------------------------------------------------------------
/lib/nms/setup.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | from distutils.core import setup, Extension
3 |
4 | import numpy as np
5 | from Cython.Build import cythonize
6 | from Cython.Distutils import build_ext
7 |
8 | # extensions
9 | ext_args = dict(
10 | include_dirs=[np.get_include()],
11 | language='c++',
12 | extra_compile_args={
13 | 'cc': ['-Wno-unused-function', '-Wno-write-strings'],
14 | 'nvcc': ['-c', '--compiler-options', '-fPIC'],
15 | },
16 | )
17 |
18 | extensions = [
19 | Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
20 | Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
21 | Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
22 | ]
23 |
24 |
25 | def customize_compiler_for_nvcc(self):
26 | """inject deep into distutils to customize how the dispatch
27 | to cc/nvcc works.
28 | If you subclass UnixCCompiler, it's not trivial to get your subclass
29 | injected in, and still have the right customizations (i.e.
30 | distutils.sysconfig.customize_compiler) run on it. So instead of going
31 |     the OO route, I have this. Note, it's kind of like a weird functional
32 |     subclassing going on."""
33 | 
34 |     # tell the compiler it can process .cu
35 |     self.src_extensions.append('.cu')
36 | 
37 |     # save references to the default compiler_so and _compile methods
38 | default_compiler_so = self.compiler_so
39 | super = self._compile
40 |
41 | # now redefine the _compile method. This gets executed for each
42 | # object but distutils doesn't have the ability to change compilers
43 | # based on source extension: we add it.
44 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
45 | if osp.splitext(src)[1] == '.cu':
46 | # use the cuda for .cu files
47 | self.set_executable('compiler_so', 'nvcc')
48 | # use only a subset of the extra_postargs, which are 1-1 translated
49 | # from the extra_compile_args in the Extension class
50 | postargs = extra_postargs['nvcc']
51 | else:
52 | postargs = extra_postargs['cc']
53 |
54 | super(obj, src, ext, cc_args, postargs, pp_opts)
55 | # reset the default compiler_so, which we might have changed for cuda
56 | self.compiler_so = default_compiler_so
57 |
58 | # inject our redefined _compile method into the class
59 | self._compile = _compile
60 |
61 |
62 | # run the customize_compiler
63 | class custom_build_ext(build_ext):
64 |
65 | def build_extensions(self):
66 | customize_compiler_for_nvcc(self.compiler)
67 | build_ext.build_extensions(self)
68 |
69 |
70 | setup(
71 | name='nms',
72 | cmdclass={'build_ext': custom_build_ext},
73 | ext_modules=cythonize(extensions),
74 | )
75 |
--------------------------------------------------------------------------------
/datasets/icdar15.py:
--------------------------------------------------------------------------------
1 | """
2 | ICDAR2015 for Text Detection
3 | """
4 |
5 | import os
6 | import cv2
7 | import json
8 | import torch
9 | import random
10 | import numpy as np
11 | from PIL import Image
12 | from scipy.io import loadmat
13 | from torch.utils.data import Dataset
14 | from torchvision.transforms import transforms
15 | from datasets.utils import normalize_image, get_im_scale
16 |
17 |
18 | CLASSES = ('text',)
19 | NUM_CLASSES = 2
20 |
21 |
22 | class ICDAR15(Dataset):
23 |
24 | def __init__(self, dataroot, config, imageset='train'):
25 | assert imageset == 'train' or imageset == 'val' or imageset == 'all'
26 | self._imageset = imageset
27 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset))
28 | self._base_dir = os.path.join(dataroot, '{}_images'.format(imageset))
29 | self.name = 'ICDAR15'
30 | self.config = config
31 | self.annotations = self._read_annotations()
32 |
33 | def _read_annotations(self):
34 | # im path -> annotations
35 | with open(self._annotation_file, 'r') as f:
36 | lines = f.readlines()
37 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines))
38 | return lines
39 |
40 | def __len__(self):
41 | return len(self.annotations)
42 |
43 | def __getitem__(self, idx):
44 | annotation = self.annotations[idx]
45 | im_name = annotation['im_name']
46 | gt_boxes = annotation['gtboxes']
47 | try:
48 | img = Image.open(os.path.join(self._base_dir, im_name))
49 |         except OSError:  # unreadable image: fall back to a random valid sample
50 |             idx = random.randint(0, len(self) - 1)
51 | annotation = self.annotations[idx]
52 | im_name = annotation['im_name']
53 | gt_boxes = annotation['gtboxes']
54 | img = Image.open(os.path.join(self._base_dir, im_name))
55 |
56 | if self._imageset == 'val':
57 | # testing or validation mode, original scale
58 | img = np.array(img).astype('float32')
59 | h, w = img.shape[:2]
60 |             resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['image_scales'][0],
61 |                                                      max_size=self.config['max_image_size'])
62 | img = cv2.resize(img, (resize_w, resize_h))
63 | img = normalize_image(img)
64 | img = img.transpose(2, 0, 1)
65 | img = torch.Tensor(img)
66 | return img, im_name, scale, (h, w)
67 |
68 | img = np.array(img).astype('float32')
69 | labels = np.ones(len(gt_boxes), dtype=np.int32)
70 | labels = torch.LongTensor(labels)
71 | boxes = np.array(gt_boxes, dtype=np.float32)
72 |         # img is [H, W, C] here; the minibatch collate resizes and transposes it
73 |
74 | return img, labels, boxes
75 |
76 |
77 |
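78 | # Annotation format (inferred from _read_annotations/__getitem__): one JSON
79 | # object per line in the .odgt file, e.g. (values illustrative):
80 | #   {"im_name": "img_1.jpg", "gtboxes": [[x1, y1, x2, y2], ...]}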
--------------------------------------------------------------------------------
/datasets/synthtext.py:
--------------------------------------------------------------------------------
1 | """
2 | Synth800K for Text Detection
3 | """
4 |
5 | import os
6 | import cv2
7 | import json
8 | import torch
9 | import random
10 | import numpy as np
11 | from PIL import Image
12 | from scipy.io import loadmat
13 | from torch.utils.data import Dataset
14 | from torchvision.transforms import transforms
15 | from datasets.utils import normalize_image, get_im_scale
16 |
17 |
18 | CLASSES = ('text',)
19 | NUM_CLASSES = 2
20 | SYNTHTEXT_ROOT = '/public_datasets/SynthText'
21 |
22 |
23 | class SynthText(Dataset):
24 |
25 |     def __init__(self, dataroot, config, imageset='train'):
26 | assert imageset == 'train' or imageset == 'val' or imageset == 'all'
27 | self._imageset = imageset
28 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset))
29 | self._base_dir = dataroot
30 | self.config = config
31 | self.name = 'SynthText80K'
32 | self.annotations = self._read_annotations()
33 |
34 | def _read_annotations(self):
35 | # im path -> annotations
36 | with open(self._annotation_file, 'r') as f:
37 | lines = f.readlines()
38 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines))
39 | return lines
40 |
41 | def __len__(self):
42 | return len(self.annotations)
43 |
44 | def __getitem__(self, idx):
45 | annotation = self.annotations[idx]
46 | im_name = annotation['im_name']
47 | gt_boxes = annotation['gtboxes']
48 | try:
49 | img = Image.open(os.path.join(self._base_dir, im_name))
50 |         except OSError:  # unreadable image: fall back to a random valid sample
51 |             idx = random.randint(0, len(self) - 1)
52 | annotation = self.annotations[idx]
53 | im_name = annotation['im_name']
54 | gt_boxes = annotation['gtboxes']
55 | img = Image.open(os.path.join(self._base_dir, im_name))
56 |
57 | if self._imageset == 'val':
58 | # testing or validation mode, original scale
59 | img = np.array(img).astype('float32')
60 | h, w = img.shape[:2]
61 |             resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['image_scales'][0],
62 |                                                      max_size=self.config['max_image_size'])
63 | img = cv2.resize(img, (resize_w, resize_h))
64 | img = normalize_image(img)
65 | img = img.transpose(2, 0, 1)
66 | img = torch.Tensor(img)
67 | return img, im_name, scale, (h, w)
68 |
69 | img = np.array(img).astype('float32')
70 | labels = np.ones(len(gt_boxes), dtype=np.int32)
71 | labels = torch.LongTensor(labels)
72 | boxes = np.array(gt_boxes, dtype=np.float32)
73 |         # img is [H, W, C] here; the minibatch collate resizes and transposes it
74 |
75 | return img, labels, boxes
76 |
77 |
--------------------------------------------------------------------------------
/lib/det_ops/loss.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Loss functions for Detection
4 |
5 | """
6 | import math
7 | import numpy as np
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from IPython import embed
12 | __all__ = ['SmoothL1Loss', 'SoftmaxCrossEntropy', 'SigmoidCrossEntropy', 'SigmoidFocalLoss']
13 |
14 |
15 | SoftmaxCrossEntropy = nn.CrossEntropyLoss
16 | SigmoidCrossEntropy = nn.BCEWithLogitsLoss
17 |
18 |
19 | class SigmoidFocalLoss(nn.Module):
20 | """ Focal Loss
21 |
22 | Args:
23 |
24 | Input:
25 | pred: []
26 | target: []
27 | Output:
28 |
29 | """
30 | def __init__(self, background=0, gamma=2, alpha=0.25):
31 | super(SigmoidFocalLoss, self).__init__()
32 | self.gamma = gamma
33 | self.alpha = alpha
34 | self.background = background
35 |
36 | def forward(self, pred, target):
37 |         # pred.shape = [B, N, C]
38 | # target.shape = [B, N]
39 | B, N, C = pred.size()
40 | pred_sigmoid = pred.sigmoid()
41 | # # ignore: [B, N]
42 | # keep_mask = target > -1
43 | # # [B, N], ignore and background shared 0 index
44 | # keep_mask = keep_mask.long()
45 | mask = (target > -1)
46 | new_target = target * mask.long()
47 | new_target = new_target.reshape((new_target.size()[0], new_target.size()[1], 1))
48 |         onehot_target = torch.zeros((B, N, C + 1), device=pred.device)
49 | onehot_target.scatter_(2, new_target, 1.0)
50 | onehot = onehot_target[:, :, 1:].float()
51 | mask = mask.unsqueeze(2).float()
52 |
53 | # pred_sigmoid = pred_sigmoid.clamp(min=1e-6, max=1-1e-6)
54 | # bce = -(onehot * torch.log(pred_sigmoid) + (1-onehot)*torch.log(1-pred_sigmoid))
55 |
56 | weight = self.alpha*((1-pred_sigmoid).pow(self.gamma))*onehot \
57 | + (1-self.alpha)*(pred_sigmoid.pow(self.gamma))*(1-onehot)
58 | weight = weight * mask
59 | avg_factor = torch.sum(target > 0, dim=1).float()
60 | # embed()
61 | loss = F.binary_cross_entropy_with_logits(pred, onehot, weight, reduction='none').sum(dim=1).sum(dim=1)
62 | loss = loss.div_(avg_factor.clamp(min=1.0)).mean()
63 |
64 | return loss
65 |
66 |
67 | class SmoothL1Loss(nn.Module):
68 |
69 | def __init__(self):
70 | super(SmoothL1Loss, self).__init__()
71 | self.smooth_l1 = nn.SmoothL1Loss(reduction='none')
72 |
73 | def forward(self, offset, target, cls_target):
74 | # ignore background and ignore label
75 | # offset B*N*4
76 | # target B*N*4
77 | # cls: B*N*C
78 | # bg_mask = cls_target == 0
79 | # ig_mask = cls_target == -1
80 |
81 | mask = cls_target > 0 # ig_mask * bg_mask
82 | mask = mask.float()
83 | loss_raw = (self.smooth_l1(offset, target).sum(2)) * mask
84 | loss = loss_raw.sum(dim=1).div_(mask.sum(dim=1).clamp(min=1.0)).mean()
85 | return loss
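86 | 
87 | # Focal-weight sanity check (illustrative): with gamma=2, alpha=0.25, an
88 | # easy positive (p=0.9) gets weight 0.25 * 0.1**2 = 0.0025, while a hard
89 | # positive (p=0.1) gets 0.25 * 0.9**2 = 0.2025, an 81x larger contribution.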
--------------------------------------------------------------------------------
/models/retina.py:
--------------------------------------------------------------------------------
1 | """
2 | RetinaNet Model
3 | backbone: resnet50 + FPN
4 | """
5 |
6 | import torch
7 | import torch.nn as nn
8 | import numpy as np
9 | from models import fpn
10 |
11 |
12 | class RetinaNetHead(nn.Module):
13 |
14 | def __init__(self, num_classes, num_anchors):
15 | super(RetinaNetHead, self).__init__()
16 | self.num_classes = num_classes
17 |
18 | self.cls_branch = nn.Sequential(
19 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
20 | nn.ReLU(inplace=True),
21 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
22 | nn.ReLU(inplace=True),
23 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
24 | nn.ReLU(inplace=True),
25 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
26 | nn.ReLU(inplace=True)
27 | )
28 | self.cls_score = nn.Conv2d(256, out_channels=num_classes*num_anchors, kernel_size=3, stride=1, padding=1)
29 |
30 | self.bbox_branch = nn.Sequential(
31 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
32 | nn.ReLU(inplace=True),
33 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
34 | nn.ReLU(inplace=True),
35 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
36 | nn.ReLU(inplace=True),
37 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1),
38 | nn.ReLU(inplace=True),
39 | nn.Conv2d(256, out_channels=num_anchors*4, kernel_size=3, stride=1, padding=1)
40 | )
41 | self._initialize_weights()
42 |
43 | def _initialize_weights(self):
44 | for m in self.cls_branch.modules():
45 | if isinstance(m, nn.Conv2d):
46 | m.weight.data.normal_(0, 0.01)
47 | m.bias.data.fill_(0)
48 |
49 | for m in self.bbox_branch.modules():
50 | if isinstance(m, nn.Conv2d):
51 | m.weight.data.normal_(0, 0.01)
52 | m.bias.data.fill_(0)
53 |
54 | self.cls_score.weight.data.normal_(0, 0.01)
55 | pi = 0.01
56 | self.cls_score.bias.data.fill_(-np.log((1 - pi) / pi))
57 |
58 | def forward(self, x):
59 | bbox_output = self.bbox_branch(x)
60 | bbox_output = bbox_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, 4)
61 | cls_output = self.cls_score(self.cls_branch(x))
62 | cls_output = cls_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, self.num_classes)
63 | return cls_output, bbox_output
64 |
65 |
66 | class RetinaNet(nn.Module):
67 |
68 | def __init__(self, num_classes, num_anchors, pretrained_path):
69 | super(RetinaNet, self).__init__()
70 | self.fpn = fpn.FPN50(pretrained_path)
71 | self.head = RetinaNetHead(num_classes, num_anchors)
72 |
73 | def forward(self, x):
74 | # [P3, P4, P5, P6, P7]
75 | # stride: [8, 16, 32, 64, 128]
76 | feature_pyramids = self.fpn(x)
77 | cls_outputs = []
78 | bbox_outputs = []
79 | for fp in feature_pyramids:
80 | cls_output, bbox_output = self.head(fp)
81 | cls_outputs.append(cls_output)
82 | bbox_outputs.append(bbox_output)
83 |
84 | cls_outputs = torch.cat(cls_outputs, dim=1)
85 | bbox_outputs = torch.cat(bbox_outputs, dim=1)
86 |
87 | return cls_outputs, bbox_outputs
88 |
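89 | # Output shape sketch (illustrative): with num_classes=1 (text only),
90 | # num_anchors=9, and a [B, 3, 512, 512] input, cls_outputs has shape
91 | # [B, 49104, 1] and bbox_outputs [B, 49104, 4], where
92 | # 49104 = (64*64 + 32*32 + 16*16 + 8*8 + 4*4) * 9.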
--------------------------------------------------------------------------------
/datasets/minibatch.py:
--------------------------------------------------------------------------------
1 | """
2 | Create Mini Batch
3 | """
4 | import cv2
5 | import torch
6 | import random
7 | import numpy as np
8 | from datasets.utils import flip_img_boxes
9 | from lib.det_ops import anchor_target
10 | from datasets.utils import normalize_image, get_im_scale
11 |
12 |
13 | def create_minibatch_func(config):
14 | aspect_ratios = config['aspect_ratios']
15 | anchor_sizes = config['anchor_sizes']
16 | anchor_areas = config['anchor_areas']
17 | strides = config['strides']
18 |
19 | anchor_layer = anchor_target.AnchorLayer(aspect_ratios=aspect_ratios,
20 | sizes=anchor_sizes,
21 | areas=anchor_areas,
22 | strides=strides)
23 |
24 | def collate_minibatch(batch):
25 | # (img, labels, boxes)
26 | # img: [H, W, C]
27 | # labels: [N]
28 | # boxes: [N, 4]
29 | batch_size = len(batch)
30 | max_size = config['max_image_size']
31 |         # [batch_size] random scale index per image
32 |         target_size_inds = np.random.randint(
33 |             0, high=len(config['image_scales']), size=batch_size
34 |         )
35 | 
36 |         image_shapes = np.zeros((batch_size, 2), dtype=np.int64)
37 |         image_scales = np.zeros(batch_size, dtype=np.float64)
38 | batch_height = 0
39 | batch_width = 0
40 | for i in range(batch_size):
41 | h, w = batch[i][0].shape[:2]
42 | target_size = config['image_scales'][target_size_inds[i]]
43 | h_, w_, s_ = get_im_scale(h, w, target_size, max_size)
44 | image_shapes[i, 0] = h_
45 | image_shapes[i, 1] = w_
46 | image_scales[i] = s_
47 | batch_height = max(h_, batch_height)
48 | batch_width = max(w_, batch_width)
49 |
50 | # pad images to support the last stride
51 | max_stride = strides[-1]
52 | batch_width = int(np.ceil(batch_width/max_stride)*max_stride)
53 | batch_height = int(np.ceil(batch_height/max_stride)*max_stride)
54 |
55 | labels = []
56 | gtboxes = []
57 | batch_images = torch.zeros((batch_size, 3, batch_height, batch_width))
58 | input_size = np.array([batch_height, batch_width])
59 | for i in range(batch_size):
60 | img, label, boxes = batch[i]
61 | boxes = boxes.astype('float32')
62 | h, w = image_shapes[i]
63 | scale = image_scales[i]
64 | img = cv2.resize(img, (w, h))
65 |
66 |             # cv2.resize takes (W, H); scale the gt boxes to the resized image
67 | boxes = boxes * scale
68 | if random.random() < 0.5:
69 | img, boxes = flip_img_boxes(img, boxes)
70 |
71 | # transform or data augmentation
72 | img = normalize_image(img)
73 | img = img.transpose(2, 0, 1)
74 | img = torch.Tensor(img)
75 | # assign anchors
76 | boxes = torch.Tensor(boxes)
77 | label, boxes = anchor_layer.assign(boxes, label, input_size=input_size,
78 | neg_thresh=config['negative_anchor_threshold'],
79 | pos_thresh=config['positive_anchor_threshold'])
80 |
81 | labels.append(label.unsqueeze(0))
82 | gtboxes.append(boxes.unsqueeze(0))
83 | # print(img.shape, batch_images.shape)
84 | batch_images[i, :, :h, :w] = img
85 |
86 | labels = torch.cat(labels, dim=0)
87 | gtboxes = torch.cat(gtboxes, dim=0)
88 | return batch_images, labels, gtboxes
89 |
90 | return collate_minibatch
91 |
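92 | # Illustrative wiring (a sketch; `dataset` is any of the datasets above):
93 | #
94 | #   from torch.utils.data import DataLoader
95 | #   from cfgs.config import config
96 | #   collate_fn = create_minibatch_func(config['synth_baseline'])
97 | #   loader = DataLoader(dataset, batch_size=8, shuffle=True,
98 | #                       num_workers=config['synth_baseline']['workers'],
99 | #                       collate_fn=collate_fn)
100 | #   images, cls_targets, box_targets = next(iter(loader))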
--------------------------------------------------------------------------------
/evaluation/evaluation.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | """
4 | import json
5 | import argparse
6 | import numpy as np
7 | from IPython import embed
8 |
9 |
10 | def calculate_ap(recall, precision):
11 | mrec = np.concatenate(([0.], recall, [1.]))
12 | mpre = np.concatenate(([0.], precision, [0.]))
13 |
14 | # compute the precision envelope
15 | for i in range(mpre.size - 1, 0, -1):
16 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
17 |
18 | # to calculate area under PR curve, look for points
19 | # where X axis (recall) changes value
20 | i = np.where(mrec[1:] != mrec[:-1])[0]
21 |
22 | # and sum (\Delta recall) * prec
23 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
24 |
25 | return ap
26 |
27 |
28 | def eval_ap(predict_path, gt_path, iou_thresh):
29 |
30 | with open(predict_path, 'r') as f:
31 | lines = f.readlines()
32 | predictions = [json.loads(x.rstrip('\n')) for x in lines]
33 |
34 | with open(gt_path, 'r') as f:
35 | lines = f.readlines()
36 | gt = [json.loads(x.rstrip('\n')) for x in lines]
37 |
38 | predict_boxes = []
39 | for p in predictions:
40 | im_name = p['image_id']
41 | boxes = p['result']
42 | for bb in boxes:
43 | bb['im_name'] = im_name
44 | predict_boxes.append(bb)
45 |
46 | gt_boxes = dict()
47 | npos = 0
48 | for g in gt:
49 | gt_boxes[g['im_name']] = {'box': np.array(g['gtboxes']),
50 | 'flag': np.zeros(len(g['gtboxes']), dtype=int)}
51 | npos += len(g['gtboxes'])
52 |
53 | # sort
54 | predict_boxes = sorted(predict_boxes, key=lambda x: x['prob'], reverse=True)
55 | tp = np.zeros(len(predict_boxes))
56 | fp = np.zeros(len(predict_boxes))
57 | for i in range(len(predict_boxes)):
58 | box = predict_boxes[i]
59 | im_name = box['im_name']
60 | _gt_boxes = gt_boxes[im_name]['box']
61 | bb = box['bbox']
62 | bb = np.array(bb)
63 |
64 | if len(_gt_boxes) > 0:
65 |
66 | ixmin = np.maximum(_gt_boxes[:, 0], bb[0])
67 | iymin = np.maximum(_gt_boxes[:, 1], bb[1])
68 | ixmax = np.minimum(_gt_boxes[:, 2], bb[2])
69 | iymax = np.minimum(_gt_boxes[:, 3], bb[3])
70 | iw = np.maximum(ixmax - ixmin + 1., 0.)
71 | ih = np.maximum(iymax - iymin + 1., 0.)
72 | inters = iw * ih
73 |
74 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
75 | (_gt_boxes[:, 2] - _gt_boxes[:, 0] + 1.) *
76 | (_gt_boxes[:, 3] - _gt_boxes[:, 1] + 1.) - inters)
77 |
78 | overlaps = inters / uni
79 | ovmax = np.max(overlaps)
80 | jmax = np.argmax(overlaps)
81 |
82 | if ovmax > iou_thresh:
83 | if gt_boxes[im_name]['flag'][jmax] > 0:
84 | fp[i] = 1
85 | else:
86 | tp[i] = 1
87 | gt_boxes[im_name]['flag'][jmax] = 1
88 | else:
89 | fp[i] = 1
90 |
91 | fp = np.cumsum(fp)
92 | tp = np.cumsum(tp)
93 | recall = tp / float(npos)
94 | # avoid divide by zero in case the first detection matches a difficult
95 | # ground truth
96 | precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
97 |
98 | ap = calculate_ap(recall, precision)
99 |
100 | return ap
101 |
102 |
103 | def main():
104 |
105 | parser = argparse.ArgumentParser()
106 | parser.add_argument('-p', '--predict', type=str, default='', required=True)
107 | parser.add_argument('-g', '--gt', type=str, default='/public_datasets/SynthText/val.odgt')
108 | parser.add_argument('-t', '--thresh', type=float, default=0.5)
109 |
110 | args = parser.parse_args()
111 |
112 | ap = eval_ap(args.predict, args.gt, args.thresh)
113 |
114 | print("eval finished, ap={:.3f}".format(ap))
115 |
116 |
117 | if __name__ == '__main__':
118 |
119 | main()
120 |
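121 | # Worked example for calculate_ap (illustrative): recall [0.5, 1.0] with
122 | # precision [1.0, 0.5] gives AP = 0.5*1.0 + 0.5*0.5 = 0.75, i.e.
123 | # calculate_ap(np.array([.5, 1.]), np.array([1., .5])) == 0.75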
--------------------------------------------------------------------------------
/lib/bbox/bbox.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Bounding Box
4 |
5 |
6 | """
7 |
8 | import torch
9 | import numpy as np
10 | from IPython import embed
11 |
12 | def np_xywh2xyxy(boxes):
13 | # [x1,y1,w,h]
14 | boxes = np.hstack(
15 | (boxes[:, 0:2], boxes[:, 0:2] + np.maximum(0, boxes[:, 2:4] - 1))
16 | )
17 |
18 | return boxes
19 |
20 |
21 | def clip_boxes(boxes, image_height, image_width):
22 |     boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=image_width - 1)
23 |     boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=image_height - 1)
24 |     boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=image_width - 1)
25 |     boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=image_height - 1)
26 | return boxes
27 |
28 |
29 | def filter_boxes(boxes):
30 |
31 | keep = []
32 | for i in range(boxes.shape[0]):
33 | if boxes[i, 0] < boxes[i, 2] and boxes[i, 1] < boxes[i, 3]:
34 | keep.append(i)
35 | boxes = boxes[keep]
36 | return boxes
37 |
38 |
39 | def xywh2xyxy(boxes):
40 | """ xywh -> xyxy
41 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height)
42 | Args:
43 | boxes: torch.FloatTensor[N,4]
44 | """
45 |
46 | x1 = boxes[:, 0] - 0.5 * boxes[:, 2]
47 | y1 = boxes[:, 1] - 0.5 * boxes[:, 3]
48 | x2 = boxes[:, 0] + 0.5 * boxes[:, 2]
49 | y2 = boxes[:, 1] + 0.5 * boxes[:, 3]
50 |
51 | return torch.stack([x1, y1, x2, y2]).transpose(0, 1)
52 |
53 |
54 | def xyxy2xywh(boxes):
55 | """ xyxy -> xywh
56 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height)
57 | Args:
58 | boxes: torch.FloatTensor[N,4]
59 | """
60 | w = boxes[:, 2] - boxes[:, 0] + 1
61 | h = boxes[:, 3] - boxes[:, 1] + 1
62 | # center
63 | x = boxes[:, 0] + 0.5 * w
64 | y = boxes[:, 1] + 0.5 * h
65 |
66 | return torch.stack([x, y, w, h]).transpose(0, 1)
67 |
68 |
69 | def box_overlaps(box1, box2):
70 | """ Box IoU(Insertion over Union)(xmin,ymin,xmax,ymax)
71 | Args:
72 | box1: torch.FloatTensor[N, 4],
73 | box2: torch.FloatTensor[M, 4]
74 | mode: box representation format
75 | """
76 | # N = box1.size()[0]
77 | # M = box2.size()[0]
78 |
79 | # NxMx2
80 | lo = torch.max(box1[:, None, :2], box2[:, :2])
81 | hi = torch.min(box1[:, None, 2:], box2[:, 2:])
82 |
83 | inner_rect = (hi - lo + 1).clamp(0)
84 |     # NxM intersection areas
85 | inner = inner_rect[:, :, 0] * inner_rect[:, :, 1]
86 |
87 | area1 = (box1[:, 2]-box1[:, 0]+1)*(box1[:, 3]-box1[:, 1]+1)
88 | area2 = (box2[:, 2]-box2[:, 0]+1)*(box2[:, 3]-box2[:, 1]+1)
89 |
90 | iou = inner / (area1[:, None] + area2 - inner)
91 |
92 | return iou
93 |
94 |
95 | def box_nms(boxes, scores, threshold):
96 | """Non maximum suppression.
97 | Args:
98 | boxes: (tensor) bounding boxes, sized [N,4].
99 | scores: (tensor) bbox scores, sized [N,].
100 | threshold: (float) overlap threshold.
101 | Returns:
102 | keep: (tensor) selected indices.
103 | Reference:
104 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py
105 | """
106 | x1 = boxes[:, 0]
107 | y1 = boxes[:, 1]
108 | x2 = boxes[:, 2]
109 | y2 = boxes[:, 3]
110 |
111 | areas = (x2-x1+1) * (y2-y1+1)
112 | _, order = scores.sort(0, descending=True)
113 | keep = []
114 | while order.numel() > 0:
115 |         i = order[0].item()
116 | keep.append(i)
117 |
118 | if order.numel() == 1:
119 | break
120 |
121 | xx1 = x1[order[1:]].clamp(min=x1[i].item())
122 | yy1 = y1[order[1:]].clamp(min=y1[i].item())
123 | xx2 = x2[order[1:]].clamp(max=x2[i].item())
124 | yy2 = y2[order[1:]].clamp(max=y2[i].item())
125 |
126 | w = (xx2-xx1+1).clamp(min=0)
127 | h = (yy2-yy1+1).clamp(min=0)
128 | inter = w*h
129 |
130 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
131 |
132 |         ids = (ovr <= threshold).nonzero().squeeze(1)
133 | if ids.numel() == 0:
134 | break
135 | order = order[ids+1]
136 | return torch.LongTensor(keep)
137 |
138 |
139 | if __name__ == '__main__':
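140 |     # Smoke test (illustrative): two 10x10 boxes overlapping on a 5x5
141 |     # patch give IoU = 25 / 175 under the +1 pixel convention used above.
142 |     b1 = torch.Tensor([[0., 0., 9., 9.]])
143 |     b2 = torch.Tensor([[5., 5., 14., 14.]])
144 |     print(box_overlaps(b1, b2))  # tensor([[0.1429]])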
--------------------------------------------------------------------------------
/lib/nms/cpu_soft_nms.pyx:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------
2 | # Soft-NMS: Improving Object Detection With One Line of Code
3 | # Copyright (c) University of Maryland, College Park
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Navaneeth Bodla and Bharat Singh
6 | # ----------------------------------------------------------
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 |
12 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
13 | return a if a >= b else b
14 |
15 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
16 | return a if a <= b else b
17 |
18 | def cpu_soft_nms(
19 | np.ndarray[float, ndim=2] boxes_in,
20 | float sigma=0.5,
21 | float Nt=0.3,
22 | float threshold=0.001,
23 | unsigned int method=0
24 | ):
25 | boxes = boxes_in.copy()
26 | cdef unsigned int N = boxes.shape[0]
27 | cdef float iw, ih, box_area
28 | cdef float ua
29 | cdef int pos = 0
30 | cdef float maxscore = 0
31 | cdef int maxpos = 0
32 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov
33 | inds = np.arange(N)
34 |
35 | for i in range(N):
36 | maxscore = boxes[i, 4]
37 | maxpos = i
38 |
39 | tx1 = boxes[i,0]
40 | ty1 = boxes[i,1]
41 | tx2 = boxes[i,2]
42 | ty2 = boxes[i,3]
43 | ts = boxes[i,4]
44 | ti = inds[i]
45 |
46 | pos = i + 1
47 | # get max box
48 | while pos < N:
49 | if maxscore < boxes[pos, 4]:
50 | maxscore = boxes[pos, 4]
51 | maxpos = pos
52 | pos = pos + 1
53 |
54 | # add max box as a detection
55 | boxes[i,0] = boxes[maxpos,0]
56 | boxes[i,1] = boxes[maxpos,1]
57 | boxes[i,2] = boxes[maxpos,2]
58 | boxes[i,3] = boxes[maxpos,3]
59 | boxes[i,4] = boxes[maxpos,4]
60 | inds[i] = inds[maxpos]
61 |
62 | # swap ith box with position of max box
63 | boxes[maxpos,0] = tx1
64 | boxes[maxpos,1] = ty1
65 | boxes[maxpos,2] = tx2
66 | boxes[maxpos,3] = ty2
67 | boxes[maxpos,4] = ts
68 | inds[maxpos] = ti
69 |
70 | tx1 = boxes[i,0]
71 | ty1 = boxes[i,1]
72 | tx2 = boxes[i,2]
73 | ty2 = boxes[i,3]
74 | ts = boxes[i,4]
75 |
76 | pos = i + 1
77 | # NMS iterations, note that N changes if detection boxes fall below
78 | # threshold
79 | while pos < N:
80 | x1 = boxes[pos, 0]
81 | y1 = boxes[pos, 1]
82 | x2 = boxes[pos, 2]
83 | y2 = boxes[pos, 3]
84 | s = boxes[pos, 4]
85 |
86 | area = (x2 - x1 + 1) * (y2 - y1 + 1)
87 | iw = (min(tx2, x2) - max(tx1, x1) + 1)
88 | if iw > 0:
89 | ih = (min(ty2, y2) - max(ty1, y1) + 1)
90 | if ih > 0:
91 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
92 | ov = iw * ih / ua #iou between max box and detection box
93 |
94 | if method == 1: # linear
95 | if ov > Nt:
96 | weight = 1 - ov
97 | else:
98 | weight = 1
99 | elif method == 2: # gaussian
100 | weight = np.exp(-(ov * ov)/sigma)
101 | else: # original NMS
102 | if ov > Nt:
103 | weight = 0
104 | else:
105 | weight = 1
106 |
107 | boxes[pos, 4] = weight*boxes[pos, 4]
108 |
109 | # if box score falls below threshold, discard the box by
110 | # swapping with last box update N
111 | if boxes[pos, 4] < threshold:
112 | boxes[pos,0] = boxes[N-1, 0]
113 | boxes[pos,1] = boxes[N-1, 1]
114 | boxes[pos,2] = boxes[N-1, 2]
115 | boxes[pos,3] = boxes[N-1, 3]
116 | boxes[pos,4] = boxes[N-1, 4]
117 | inds[pos] = inds[N-1]
118 | N = N - 1
119 | pos = pos - 1
120 |
121 | pos = pos + 1
122 |
123 | return boxes[:N], inds[:N]
--------------------------------------------------------------------------------
/models/resnet.py:
--------------------------------------------------------------------------------
1 | """
2 | Basemodel: ResNet
3 |
4 | """
5 |
6 | import torch
7 | from torchvision.models import resnet
8 | import torch.nn as nn
9 |
10 | Bottleneck = resnet.Bottleneck
11 |
12 |
13 | class ResNet50Stages(nn.Module):
14 |
15 | def __init__(self, pretrained_path):
16 | super(ResNet50Stages, self).__init__()
17 | self.inplanes = 64
18 | self.stages = [3, 4, 6, 3]
19 | self.mid_outputs = [64, 128, 256, 512]
20 |
21 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
22 | bias=False)
23 | self.bn1 = nn.BatchNorm2d(64)
24 | self.relu = nn.ReLU(inplace=True)
25 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
26 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0])
27 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2)
28 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2)
29 | self.layer4 = self._make_layer(Bottleneck, self.mid_outputs[3], self.stages[3], stride=2)
30 |
31 | # self.load_state_dict(torch.load(pretrained_path))
32 | self.load_pretrained(pretrained_path)
33 |
34 | def _make_layer(self, block, planes, blocks, stride=1):
35 | downsample = None
36 | if stride != 1 or self.inplanes != planes * block.expansion:
37 | downsample = nn.Sequential(
38 | nn.Conv2d(self.inplanes, planes * block.expansion,
39 | kernel_size=1, stride=stride, bias=False),
40 | nn.BatchNorm2d(planes * block.expansion),
41 | )
42 |
43 | layers = []
44 | layers.append(block(self.inplanes, planes, stride, downsample))
45 | self.inplanes = planes * block.expansion
46 | for i in range(1, blocks):
47 | layers.append(block(self.inplanes, planes))
48 |
49 | return nn.Sequential(*layers)
50 |
51 | def freeze_bn(self):
52 | pass
53 |
54 | def load_pretrained(self, mpath):
55 |
56 | pretrained_dict = torch.load(mpath)
57 | model_dict = self.state_dict()
58 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
59 | self.load_state_dict(pretrained_dict)
60 |
61 | def forward(self, x):
62 | x = self.conv1(x)
63 | x = self.bn1(x)
64 | x = self.relu(x)
65 | x = self.maxpool(x)
66 | x1 = self.layer1(x)
67 | x2 = self.layer2(x1)
68 | x3 = self.layer3(x2)
69 | x4 = self.layer4(x3)
70 | return [x2, x3, x4]
71 |
72 |
73 | class ResNet50(nn.Module):
74 |
75 | def __init__(self, pretrained_path):
76 | super(ResNet50, self).__init__()
77 | self.layers = ResNet50Stages(pretrained_path)
78 |
79 | def forward(self, x):
80 | return self.layers(x)[-1]
81 |
82 |
83 | class ResNet50C4(nn.Module):
84 |
85 | def __init__(self, pretrained_path):
86 | super(ResNet50C4, self).__init__()
87 | self.inplanes = 64
88 | self.stages = [3, 4, 6]
89 | self.mid_outputs = [64, 128, 256, 512]
90 |
91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
92 | bias=False)
93 | self.bn1 = nn.BatchNorm2d(64)
94 | self.relu = nn.ReLU(inplace=True)
95 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
96 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0])
97 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2)
98 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2)
99 |
100 | # self.load_state_dict(torch.load(pretrained_path))
101 | self.load_pretrained(pretrained_path)
102 |
103 | def _make_layer(self, block, planes, blocks, stride=1):
104 | downsample = None
105 | if stride != 1 or self.inplanes != planes * block.expansion:
106 | downsample = nn.Sequential(
107 | nn.Conv2d(self.inplanes, planes * block.expansion,
108 | kernel_size=1, stride=stride, bias=False),
109 | nn.BatchNorm2d(planes * block.expansion),
110 | )
111 |
112 | layers = []
113 | layers.append(block(self.inplanes, planes, stride, downsample))
114 | self.inplanes = planes * block.expansion
115 | for i in range(1, blocks):
116 | layers.append(block(self.inplanes, planes))
117 |
118 | return nn.Sequential(*layers)
119 |
120 | def freeze_bn(self):
121 | pass
122 |
123 | def load_pretrained(self, mpath):
124 |
125 | pretrained_dict = torch.load(mpath)
126 | model_dict = self.state_dict()
127 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
128 | self.load_state_dict(pretrained_dict)
129 |
130 | def forward(self, x):
131 | x = self.conv1(x)
132 | x = self.bn1(x)
133 | x = self.relu(x)
134 | x = self.maxpool(x)
135 | x1 = self.layer1(x)
136 | x2 = self.layer2(x1)
137 | x3 = self.layer3(x2)
138 | return x3
139 |
140 |
--------------------------------------------------------------------------------
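
A quick sanity check on the backbone above (a sketch; `resnet50.pth` stands in for a torchvision-compatible checkpoint on disk): `ResNet50Stages.forward` returns the stage-2/3/4 feature maps at strides 8, 16 and 32 with 512, 1024 and 2048 channels, i.e. the C3-C5 features a feature pyramid is typically built on.

import torch
from models.resnet import ResNet50Stages

model = ResNet50Stages('resnet50.pth').eval()  # placeholder checkpoint path
with torch.no_grad():
    c3, c4, c5 = model(torch.randn(1, 3, 224, 224))
print(tuple(c3.shape))  # (1, 512, 28, 28)   stride 8
print(tuple(c4.shape))  # (1, 1024, 14, 14)  stride 16
print(tuple(c5.shape))  # (1, 2048, 7, 7)    stride 32
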
/train.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Training RetinaNet
4 |
5 |
6 | """
7 | import os
8 | import tqdm
9 | import argparse
10 | import numpy as np
11 | import torch.nn as nn
12 | import torch.optim as optim
13 | from models import retina
14 | from datasets import synthtext, icdar15, minibatch
15 | from torch.utils.data import DataLoader
16 | from lib.det_ops.loss import SigmoidFocalLoss, SmoothL1Loss
17 | from IPython import embed
18 | import tensorboardX
19 | from utils import logger
20 | from cfgs import config as cfg
21 |
22 |
23 | def initialize(config, args):
24 |
25 | logdir = config['logdir']
26 | if not os.path.exists(logdir):
27 | os.mkdir(logdir)
28 | if not os.path.exists(os.path.join(logdir, args.experiment)):
29 | os.mkdir(os.path.join(logdir, args.experiment))
30 |
31 | model_dump_dir = os.path.join(logdir, args.experiment, 'model_dump')
32 | tb_dump = os.path.join(logdir, args.experiment, 'tb_dump')
33 |
34 | if not os.path.exists(model_dump_dir):
35 | os.mkdir(model_dump_dir)
36 |
37 | if not os.path.exists(tb_dump):
38 | os.mkdir(tb_dump)
39 |
40 | config['tb_dump_dir'] = tb_dump
41 | config['model_dump_dir'] = model_dump_dir
42 |
43 |
44 | def learning_rate_decay(optimizer, step, config):
45 | base_lr = config['base_lr']
46 | lr = base_lr
47 | if step >= config['lr_decay'][0]:
48 | lr = base_lr * 0.1
49 |     if step >= config['lr_decay'][1]:
50 | lr = base_lr * 0.01
51 |
52 | for param_group in optimizer.param_groups:
53 | param_group['lr'] = lr
54 |
55 |
56 | def train(args, config):
57 | anchor_scales = config['anchor_sizes']
58 |     anchor_aspect_ratios = config['anchor_aspect_ratios']
59 |     num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)
60 |
61 | model = retina.RetinaNet(config['num_classes'], num_anchors, config['basemodel_path']).cuda()
62 | model = nn.DataParallel(model, device_ids=list(range(args.device)))
63 |
64 | if args.dataset == 'SynthText':
65 | train_dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
66 | elif args.dataset == 'ICDAR':
67 | train_dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
68 | else:
69 |         raise NotImplementedError('unknown dataset: {}'.format(args.dataset))
70 |
71 | collate_minibatch = minibatch.create_minibatch_func(config)
72 |
73 | train_loader = DataLoader(
74 | dataset=train_dataset,
75 | batch_size=args.batch_size*args.device,
76 | shuffle=True,
77 | num_workers=config['workers'],
78 | collate_fn=collate_minibatch
79 | )
80 |
81 | writer = tensorboardX.SummaryWriter(config['tb_dump_dir'])
82 | # torch model
83 |
84 | optimizer = optim.SGD(lr=config['base_lr'], params=model.parameters(),
85 | weight_decay=config['weight_decay'], momentum=0.9)
86 |
87 | cls_criterion = SigmoidFocalLoss().cuda()
88 | box_criterion = SmoothL1Loss().cuda()
89 |
90 | start_epoch = 0
91 | global_step = 0
92 |
93 | # Load state dict from saved model
94 | if len(args.continue_path) > 0:
95 | model_state, optimizer_state, epoch, step = logger.load_checkpoints(args.continue_path)
96 | model.module.load_state_dict(model_state)
97 | optimizer.load_state_dict(optimizer_state)
98 | global_step = step+1
99 | start_epoch = epoch + 1
100 |
101 | for epoch in range(start_epoch, config['epochs']):
102 | losses = []
103 | data_iter = iter(train_loader)
104 | pbar = tqdm.tqdm(range(len(train_loader)))
105 | for i in pbar:
106 | img, labels, boxes = next(data_iter)
107 | img = img.cuda()
108 | labels = labels.long().cuda()
109 | boxes = boxes.cuda()
110 | cls_outputs, bbox_outputs = model(img)
111 | cls_loss = cls_criterion(cls_outputs, labels)
112 | box_loss = box_criterion(bbox_outputs, boxes, labels)
113 | loss = cls_loss + box_loss
114 |
115 | optimizer.zero_grad()
116 | loss.backward()
117 | optimizer.step()
118 | writer.add_scalar('train/box_loss', box_loss.item(), global_step)
119 | writer.add_scalar('train/cls_loss', cls_loss.item(), global_step)
120 | global_step += 1
121 | pbar.set_description('e:{} i:{} loss:{:.3f} cls_loss:{:.3f} box_loss:{:.3f}'.format(
122 | epoch, i + 1, loss.item(), cls_loss.item(), box_loss.item()
123 | ))
124 | losses.append(loss.item())
125 |
126 | # learning rate decay
127 | learning_rate_decay(optimizer, global_step, config)
128 |
129 | print("e:{} loss: {}".format(epoch, np.mean(losses)))
130 | logger.save_checkpoints(model.module, optimizer, epoch, global_step,
131 | path=os.path.join(config['model_dump_dir'],
132 | 'epoch-{}-iter-{}.pth'.format(epoch, global_step)))
133 |
134 |
135 | if __name__ == '__main__':
136 |
137 | parser = argparse.ArgumentParser()
138 |     parser.add_argument('-d', '--device', type=int, default=1, help='number of GPUs to train with')
139 | parser.add_argument('-b', '--batch_size', type=int, default=4, help='training batch size per GPU')
140 | parser.add_argument('-c', '--continue_path', type=str, default='', help='continue model parameters')
141 | parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
142 | help='experiment name, correspond to `config.py`')
143 |     parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help="dataset: 'SynthText' or 'ICDAR'")
144 |
145 | _args = parser.parse_args()
146 | config = cfg.config[_args.experiment]
147 | train(_args, config)
148 |
--------------------------------------------------------------------------------
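
The schedule in `learning_rate_decay` above is a plain two-milestone step decay: a 10x drop after each entry of `config['lr_decay']`. Isolated as a pure function it behaves as follows (milestone and base-lr values are made up for illustration):

def decayed_lr(step, base_lr=0.01, milestones=(60000, 80000)):
    # Mirrors learning_rate_decay: multiply the base lr by 0.1 per milestone passed.
    lr = base_lr
    if step >= milestones[0]:
        lr = base_lr * 0.1
    if step >= milestones[1]:
        lr = base_lr * 0.01
    return lr

for s in (0, 59999, 60000, 80000):
    print(s, decayed_lr(s))  # 0.01, 0.01, 0.001, 0.0001
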
/test.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Test scripts
4 |
5 | """
6 | import argparse
7 | import json
8 | import tqdm
9 | import torch
10 | import numpy as np
11 | from lib.det_ops.anchors import compute_anchor_whs, generate_anchors
12 | from lib.bbox import bbox, box_transform
13 | from lib.nms import nms
14 | from utils.logger import load_checkpoints
15 | from models import retina
16 | from IPython import embed
17 | from datasets import synthtext, icdar15
18 | from cfgs import config as cfg
19 |
20 |
21 | def inference(model, dataset, anchor_wh, strides, result_file, config):
22 |
23 | model.eval()
24 | num_samples = len(dataset)
25 | pbar = tqdm.tqdm(range(num_samples))
26 | with torch.no_grad():
27 | for idx in pbar:
28 | img, im_name, scale, im_size = dataset[idx]
29 | h, w = img.shape[1], img.shape[2]
30 | img = img.cuda()
31 | cls_pred, bbox_pred = model(img.unsqueeze(0))
32 | scores = cls_pred.sigmoid()
33 | # bbox [N, 4]
34 | bbox_pred = bbox_pred[0]
35 | # cls [N, C]
36 | scores = scores[0]
37 |
38 | anchors = generate_anchors(anchor_wh, input_size=np.array([h, w]),
39 | strides=strides)
40 | anchors = anchors.cuda()
41 |
42 | # transform to bboxes
43 | boxes = box_transform.bbox_transform_inv(anchors, bbox_pred)
44 | boxes = boxes/scale
45 | boxes = bbox.clip_boxes(boxes, im_size[0], im_size[1])
46 |
47 |             filter_boxes_inds_x = boxes[:, 0] < boxes[:, 2]
48 |             filter_boxes_inds_y = boxes[:, 1] < boxes[:, 3]
49 |             filter_boxes_inds = filter_boxes_inds_x & filter_boxes_inds_y
50 | boxes = boxes[filter_boxes_inds]
51 | scores = scores[filter_boxes_inds]
52 |
53 | result_boxes = [] # []
54 | # every class
55 | # 1. max detection score
56 | # 2. score thresh
57 | # 3. do nms
58 | # 4. top k
59 | max_labels = torch.argmax(scores, dim=1)
60 |
61 | for cls in range(config['num_classes']-1):
62 |
63 | # filter predictions through 'classification threshold'
64 | score = scores[:, cls]
65 | cls_inds = score > config['cls_thresh']
66 | # current class has the max score over all classes
67 | max_inds = max_labels == cls
68 |                 cls_inds = max_inds & cls_inds
69 | if cls_inds.sum() < 1:
70 | continue
71 | # score [K]
72 | score = score[cls_inds]
73 |
74 | # _boxes [K, 4]
75 | _boxes = boxes[cls_inds]
76 |
77 | # NMS remove duplicate
78 | keep = nms(torch.cat([_boxes, score.unsqueeze(1)], 1), config['test_nms'])
79 |
80 | score = score[keep]
81 | _boxes = _boxes[keep]
82 |
83 | for i in range(_boxes.shape[0]):
84 | result_boxes.append((cls, score[i].item(), _boxes[i].cpu().data.numpy().tolist()))
85 |
86 | # Keep Max Num Boxes
87 | if len(result_boxes) > config['test_max_boxes']:
88 | result_boxes = sorted(result_boxes, key=lambda x: x[1], reverse=True)
89 | result_boxes = result_boxes[:config['test_max_boxes']]
90 | pbar.set_description('im_det:{}/{}'.format(idx, num_samples))
91 |
92 | if len(result_boxes) == 0:
93 | continue
94 |
95 | result = dict()
96 | result['image_id'] = im_name
97 | det = []
98 | for i in range(len(result_boxes)):
99 |                 cls, s, b = result_boxes[i]
100 | current_det = dict()
101 | current_det['prob'] = s
102 | current_det['class'] = cls+1
103 | current_det['bbox'] = b
104 |
105 | det.append(current_det)
106 | result['result'] = det
107 |
108 | with open(result_file, 'a+') as f:
109 | s = json.dumps(result)
110 | f.write('{}\n'.format(s))
111 |
112 | print("Det Finished!")
113 |
114 |
115 | def validate(args, config):
116 |
117 | anchor_scales = config['anchor_sizes']
118 |     anchor_aspect_ratios = config['aspect_ratios']
119 |     num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)
120 |
121 | model = retina.RetinaNet(config['num_classes']-1, num_anchors, config['basemodel_path']).cuda()
122 |
123 | model_path = args.model_path
124 | output_file = args.output
125 | if args.dataset == 'SynthText':
126 | dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
127 | elif args.dataset == 'ICDAR':
128 | dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
129 | else:
130 |         raise NotImplementedError('unknown dataset: {}'.format(args.dataset))
131 | state_dict, _, _, _ = load_checkpoints(model_path)
132 | model.load_state_dict(state_dict)
133 |
134 | anchor_whs = compute_anchor_whs(len(config['strides']), areas=config['anchor_areas'],
135 |                                     aspect_ratios=anchor_aspect_ratios,
136 | sizes=anchor_scales)
137 |
138 | inference(model, dataset, anchor_whs, config['strides'], result_file=output_file, config=config)
139 |
140 |
141 | if __name__ == '__main__':
142 |
143 | parser = argparse.ArgumentParser()
144 | parser.add_argument('-o', '--output', type=str, default='result.det', help='output file path')
145 | parser.add_argument('-m', '--model_path', type=str, help='saved model path')
146 |     parser.add_argument('-i', '--imageset', type=str, default='val', help='image set to evaluate (e.g. val)')
147 | parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
148 | help='experiment name, correspond to `config.py`')
149 |     parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help="dataset: 'SynthText' or 'ICDAR'")
150 |     _args = parser.parse_args()
151 |     config = cfg.config[_args.experiment]
152 |     validate(_args, config)
153 |
--------------------------------------------------------------------------------
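
The degenerate-box filter in `inference` above keeps only predictions with positive width and height; with boolean tensors the two conditions compose directly (a self-contained sketch with made-up boxes):

import torch

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],   # valid
                      [5.0, 5.0,  5.0, 20.0],   # zero width  -> dropped
                      [2.0, 8.0,  9.0,  3.0]])  # inverted y  -> dropped
scores = torch.tensor([0.9, 0.8, 0.7])

valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3])
boxes, scores = boxes[valid], scores[valid]
print(boxes.tolist(), scores.tolist())  # only the first box survives
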
/lib/nms/nms_kernel.cu:
--------------------------------------------------------------------------------
1 | // ------------------------------------------------------------------
2 | // Faster R-CNN
3 | // Copyright (c) 2015 Microsoft
4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
5 | // Written by Shaoqing Ren
6 | // ------------------------------------------------------------------
7 |
8 | #include <iostream>
9 | #include <vector>
10 | #include <cstring>
11 | #include "gpu_nms.hpp"
12 |
13 | #define CUDA_CHECK(condition) \
14 | /* Code block avoids redefinition of cudaError_t error */ \
15 | do { \
16 | cudaError_t error = condition; \
17 | if (error != cudaSuccess) { \
18 | std::cout << cudaGetErrorString(error) << std::endl; \
19 | } \
20 | } while (0)
21 |
22 | #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
23 | #define MULTIPLIER 16
24 | #define LONGLONG_SIZE 64
25 |
26 | int const threadsPerBlock =
27 |     sizeof(unsigned long long) * 8 *
28 |     MULTIPLIER;  // bits per unsigned long long (64) * MULTIPLIER = 1024 threads
29 |
30 | __device__ inline float devIoU(float const* const a, float const* const b) {
31 | float left = max(a[0], b[0]), right = min(a[2], b[2]);
32 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
33 | float width = max(right - left + 1, 0.f),
34 | height = max(bottom - top + 1, 0.f);
35 | float interS = width * height;
36 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
37 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
38 | return interS / (Sa + Sb - interS);
39 | }
40 |
41 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
42 | const float* dev_boxes,
43 | unsigned long long* dev_mask) {
44 | const int row_start = blockIdx.y;
45 | const int col_start = blockIdx.x;
46 |
47 | // if (row_start > col_start) return;
48 |
49 | const int row_size =
50 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
51 | const int col_size =
52 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
53 |
54 | __shared__ float block_boxes[threadsPerBlock * 5];
55 | if (threadIdx.x < col_size) {
56 | block_boxes[threadIdx.x * 5 + 0] =
57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
58 | block_boxes[threadIdx.x * 5 + 1] =
59 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
60 | block_boxes[threadIdx.x * 5 + 2] =
61 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
62 | block_boxes[threadIdx.x * 5 + 3] =
63 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
64 | block_boxes[threadIdx.x * 5 + 4] =
65 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
66 | }
67 | __syncthreads();
68 |
69 | unsigned long long ts[MULTIPLIER];
70 |
71 | if (threadIdx.x < row_size) {
72 | #pragma unroll
73 | for (int i = 0; i < MULTIPLIER; ++i) {
74 | ts[i] = 0;
75 | }
76 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
77 | const float* cur_box = dev_boxes + cur_box_idx * 5;
78 | int i = 0;
79 | int start = 0;
80 | if (row_start == col_start) {
81 | start = threadIdx.x + 1;
82 | }
83 | for (i = start; i < col_size; i++) {
84 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
85 | ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
86 | }
87 | }
88 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
89 |
90 | #pragma unroll
91 | for (int i = 0; i < MULTIPLIER; ++i) {
92 | dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
93 | ts[i];
94 | }
95 | }
96 | }
97 |
98 | void _set_device(int device_id) {
99 | int current_device;
100 |     CUDA_CHECK(cudaGetDevice(&current_device));
101 | if (current_device == device_id) {
102 | return;
103 | }
104 | // The call to cudaSetDevice must come before any calls to Get, which
105 | // may perform initialization using the GPU.
106 | CUDA_CHECK(cudaSetDevice(device_id));
107 | }
108 |
109 | const size_t MEMORY_SIZE = 500000000;
110 | size_t nms_Malloc() {
111 | float* boxes_dev = NULL;
112 | CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
113 | return size_t(boxes_dev);
114 | }
115 |
116 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
117 | int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
118 | _set_device(device_id);
119 |
120 | float* boxes_dev = NULL;
121 | unsigned long long* mask_dev = NULL;
122 |
123 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
124 |
125 | if (base > 0) {
126 | size_t require_mem =
127 | boxes_num * boxes_dim * sizeof(float) +
128 | boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
129 | if (require_mem >= MEMORY_SIZE) {
130 | std::cout << "require_mem: " << require_mem << std::endl;
131 | }
132 | boxes_dev = (float*)(base);
133 | mask_dev =
134 | (unsigned long long*)(base +
135 | 512 * ((unsigned long long)(boxes_num *
136 | boxes_dim *
137 | sizeof(float) /
138 | 512) +
139 | 1));
140 | } else {
141 | CUDA_CHECK(
142 | cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
143 | CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
144 | sizeof(unsigned long long)));
145 | }
146 | CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
147 | boxes_num * boxes_dim * sizeof(float),
148 | cudaMemcpyHostToDevice));
149 |
150 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
151 | DIVUP(boxes_num, threadsPerBlock));
152 | dim3 threads(threadsPerBlock);
153 |     nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
154 |                                     mask_dev);
155 |
156 |     std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
157 |                                               MULTIPLIER);
158 | CUDA_CHECK(cudaMemcpy(
159 | &mask_host[0], mask_dev,
160 | sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
161 | cudaMemcpyDeviceToHost));
162 |
163 |     std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
164 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
165 |
166 | int num_to_keep = 0;
167 | for (int i = 0; i < boxes_num; i++) {
168 | int nblock = i / threadsPerBlock;
169 | int inblock = i % threadsPerBlock;
170 | int offset = inblock / LONGLONG_SIZE;
171 | int bit_pos = inblock % LONGLONG_SIZE;
172 |
173 | if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
174 | keep_out[num_to_keep++] = i;
175 | unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
176 | for (int j = nblock * MULTIPLIER + offset;
177 | j < col_blocks * MULTIPLIER; j++) {
178 | remv[j] |= p[j];
179 | }
180 | }
181 | }
182 | *num_out = num_to_keep;
183 |
184 | if (!base) {
185 | CUDA_CHECK(cudaFree(boxes_dev));
186 | CUDA_CHECK(cudaFree(mask_dev));
187 | }
188 | }
189 |
--------------------------------------------------------------------------------
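
The host-side reduction at the end of `_nms` is a greedy sweep: boxes arrive sorted by descending score, each kept box ORs its overlap bitmask into `remv`, and any box whose bit is already set gets skipped. The same logic over an explicit IoU matrix, without the packed 64-bit masks (an illustrative Python sketch):

import numpy as np

def greedy_nms(iou, thresh):
    # iou: [N, N] pairwise IoU matrix of boxes sorted by descending score.
    n = iou.shape[0]
    suppressed = np.zeros(n, dtype=bool)
    keep = []
    for i in range(n):
        if suppressed[i]:
            continue  # a higher-scoring kept box already overlapped this one
        keep.append(i)
        # mirrors `remv[j] |= p[j]`: mark every later box this one suppresses
        suppressed[i + 1:] |= iou[i, i + 1:] > thresh
    return keep

iou = np.array([[1.0, 0.6, 0.1],
                [0.6, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
print(greedy_nms(iou, 0.5))  # [0, 2]
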