├── cfgs
│   ├── voc.json
│   ├── __init__.py
│   └── config.py
├── lib
│   ├── __init__.py
│   ├── bbox
│   │   ├── __init__.py
│   │   ├── box_transform.py
│   │   └── bbox.py
│   ├── det_ops
│   │   ├── __init__.py
│   │   ├── anchors.py
│   │   ├── anchor_target.py
│   │   └── loss.py
│   └── nms
│       ├── .gitignore
│       ├── __init__.py
│       ├── Makefile
│       ├── gpu_nms.hpp
│       ├── nms_wrapper.py
│       ├── gpu_nms.pyx
│       ├── cpu_nms.pyx
│       ├── setup.py
│       ├── cpu_soft_nms.pyx
│       └── nms_kernel.cu
├── models
│   ├── __init__.py
│   ├── fpn.py
│   ├── retina.py
│   └── resnet.py
├── utils
│   ├── __init__.py
│   ├── logger.py
│   └── visualization.py
├── datasets
│   ├── __init__.py
│   ├── utils.py
│   ├── icdar15.py
│   ├── synthtext.py
│   └── minibatch.py
├── evaluation
│   ├── __init__.py
│   └── evaluation.py
├── .DS_Store
├── images
│   ├── .DS_Store
│   ├── icdar.png
│   └── synth.png
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── RetinaNet-Text-Detection.iml
│   └── workspace.xml
├── README.md
├── LICENSE
├── train.py
└── test.py

/cfgs/voc.json:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/cfgs/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/bbox/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/det_ops/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/nms/.gitignore:
--------------------------------------------------------------------------------
1 | *.cpp
2 | 
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/.DS_Store
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | from .nms_wrapper import nms, soft_nms
2 | 
3 | __all__ = ['nms', 'soft_nms']
4 | 
--------------------------------------------------------------------------------
/images/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/.DS_Store
-------------------------------------------------------------------------------- /images/icdar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/icdar.png -------------------------------------------------------------------------------- /images/synth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/synth.png -------------------------------------------------------------------------------- /lib/nms/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 3 | echo "Compiling nms kernels..." 4 | python setup.py build_ext --inplace 5 | 6 | clean: 7 | rm -f *.so 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id, size_t base); 3 | size_t nms_Malloc(); 4 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RetinaNet-Text-Detection 2 | 3 | **Work in progress (Fork of [wondervictor/RetinaNet](https://github.com/wondervictor/RetinaNet))** 4 | 5 | RetinaNet for `Text Detection` implemented with Pure PyTorch 6 | 7 | 8 | ### Results 9 | 10 | * ICDAR 11 | 12 | ![](images/icdar.png) 13 | 14 | * SynthText 15 | 16 | ![](images/synth.png) 17 | 18 | 19 | ### Licence 20 | 21 | This project is under the **MIT Licence** 22 | -------------------------------------------------------------------------------- /.idea/RetinaNet-Text-Detection.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Training Logger 4 | 5 | """ 6 | import torch 7 | 8 | 9 | class Logger: 10 | 11 | def __init__(self): 12 | pass 13 | 14 | 15 | def save_checkpoints(model, optimizer, epoch, iteration, path): 16 | 17 | state_dict = { 18 | "model": model.state_dict(), 19 | "optimizer": optimizer.state_dict(), 20 | "epoch": epoch, 21 | "iteration": iteration 22 | } 23 | 24 | torch.save(state_dict, path) 25 | 26 | 27 | def load_checkpoints(path): 28 | state_dict = torch.load(path) 29 | 30 | return state_dict['model'], state_dict['optimizer'], state_dict['epoch'], state_dict['iteration'] 31 | 32 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Vic Chan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | 5 | import cv2 6 | import json 7 | import random 8 | import argparse 9 | 10 | 11 | def show_img(im_path, boxes): 12 | 13 | img = cv2.imread(im_path) 14 | for bb in boxes: 15 | if bb[4] < 0.3: 16 | continue 17 | img = cv2.rectangle(img, (int(bb[0]), int(bb[1])), (int(bb[2]), int(bb[3])), (0, 255, 0), 1) 18 | img = cv2.putText(img, '{}:{:.2f}'.format(bb[5], bb[4]), (int(bb[0]), int(bb[1])+10), cv2.FONT_HERSHEY_SIMPLEX, 19 | 0.5, (0, 255, 0), 1) 20 | cv2.imshow('img', img) 21 | 22 | cv2.waitKey(0) 23 | cv2.destroyAllWindows() 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-d', '--dt', default='', type=str) 29 | args = parser.parse_args() 30 | with open(args.dt, 'r') as f: 31 | lines = f.readlines() 32 | lines = [json.loads(x.rstrip('\n')) for x in lines] 33 | random.shuffle(lines) 34 | dt = dict() 35 | for l in lines: 36 | name = l['image_id'] 37 | res = l['result'] 38 | _boxes = [] 39 | for bb in res: 40 | _boxes.append(bb['bbox']+[bb['prob'], bb['class']]) 41 | dt[name] = _boxes 42 | 43 | for k in dt.keys(): 44 | show_img('/public_datasets/SynthText/'+k, dt[k]) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | main() 50 | -------------------------------------------------------------------------------- /lib/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .gpu_nms import gpu_nms 5 | from .cpu_nms import cpu_nms 6 | from .cpu_soft_nms import cpu_soft_nms 7 | 8 | 9 | def nms(dets, thresh, device_id=None): 10 | """Dispatch to either CPU or GPU NMS implementations.""" 11 | 12 | if isinstance(dets, torch.Tensor): 13 | if dets.is_cuda: 14 | device_id = dets.get_device() 15 | dets = dets.detach().cpu().numpy() 16 | assert isinstance(dets, np.ndarray) 17 | 18 | if dets.shape[0] == 0: 19 | inds = [] 20 | else: 21 | inds = (gpu_nms(dets, thresh, device_id=device_id) 22 | if device_id is not None else cpu_nms(dets, thresh)) 23 | 24 | if isinstance(dets, torch.Tensor): 25 | return dets.new_tensor(inds, dtype=torch.long) 26 | else: 
27 |         return np.array(inds, dtype=np.int64)
28 | 
29 | 
30 | def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
31 |     if isinstance(dets, torch.Tensor):
32 |         _dets = dets.detach().cpu().numpy()
33 |     else:
34 |         _dets = dets.copy()
35 |     assert isinstance(_dets, np.ndarray)
36 | 
37 |     new_dets, inds = cpu_soft_nms(
38 |         _dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
39 | 
40 |     if isinstance(dets, torch.Tensor):
41 |         return dets.new_tensor(
42 |             inds, dtype=torch.long), dets.new_tensor(new_dets)
43 |     else:
44 |         return np.array(
45 |             inds, dtype=np.int64), np.array(
46 |             new_dets, dtype=np.float32)
47 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | cimport numpy as np
10 | 
11 | assert sizeof(int) == sizeof(np.int32_t)
12 | 
13 | cdef extern from "gpu_nms.hpp":
14 |     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
15 |     size_t nms_Malloc() nogil
16 | 
17 | memory_pool = {}
18 | 
19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
20 |             np.int32_t device_id=0):
21 |     cdef int boxes_num = dets.shape[0]
22 |     cdef int boxes_dim = dets.shape[1]
23 |     cdef int num_out
24 |     cdef size_t base
25 |     cdef np.ndarray[np.int32_t, ndim=1] \
26 |         keep = np.zeros(boxes_num, dtype=np.int32)
27 |     cdef np.ndarray[np.float32_t, ndim=1] \
28 |         scores = dets[:, 4]
29 |     cdef np.ndarray[np.int_t, ndim=1] \
30 |         order = scores.argsort()[::-1]
31 |     cdef np.ndarray[np.float32_t, ndim=2] \
32 |         sorted_dets = dets[order, :]
33 |     cdef float cthresh = thresh
34 |     if device_id not in memory_pool:
35 |         with nogil:
36 |             base = nms_Malloc()
37 |         memory_pool[device_id] = base
38 |         # print "malloc", base
39 |     base = memory_pool[device_id]
40 |     with nogil:
41 |         _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
42 |     keep = keep[:num_out]
43 |     return list(order[keep])
44 | 
--------------------------------------------------------------------------------
/datasets/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | Dataset utils
4 | 
5 | """
6 | import cv2
7 | import numpy as np
8 | from PIL import Image
9 | 
10 | 
11 | def flip_pil_img_and_boxes(img, boxes=None):
12 |     """ Flip PIL Images and Boxes
13 |     Args:
14 |         img: PIL Image
15 |         boxes: [N, 4]
16 |     """
17 |     assert isinstance(img, Image.Image), "img should be PIL.Image"
18 |     w, h = img.size
19 |     flip_img = img.transpose(Image.FLIP_LEFT_RIGHT)
20 |     if boxes is not None:
21 |         flip_boxes = boxes.copy()
22 |         flip_boxes[:, 0] = w - boxes[:, 2] - 1
23 |         flip_boxes[:, 2] = w - boxes[:, 0] - 1
24 |         return flip_img, flip_boxes
25 |     else:
26 |         return flip_img
27 | 
28 | 
29 | def flip_img_boxes(img, boxes=None):
30 | 
31 |     h, w, c = img.shape
32 |     flip_img = cv2.flip(img, 1)
33 |     if boxes is not None:
34 |         flip_boxes = boxes.copy()
35 |         for i in range(flip_boxes.shape[0]):
36 |             flip_boxes[i, 0] = w - boxes[i, 2] - 1
37 |             flip_boxes[i, 2] = w - boxes[i, 0] - 1
38 |         return flip_img, flip_boxes
39 |     else:
40 |         return flip_img
41 | 
42 | 
43 | def normalize_image(img):
44 |     img = img / 255.0
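    # ImageNet channel statistics (RGB order); the image is scaled to [0, 1] first
45 |     mean = 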
np.array([.485, .456, .406]) 46 | std = np.array([.229, .224, .225]) 47 | img = (img - mean) / std 48 | return img 49 | 50 | 51 | def get_im_scale(h, w, target_size, max_size): 52 | img_min_size = min(h, w) 53 | img_max_size = max(h, w) 54 | scale = target_size / img_min_size 55 | if scale * img_max_size > max_size: 56 | scale = max_size / img_max_size 57 | 58 | return int(round(h*scale)), int(round(w*scale)), scale 59 | -------------------------------------------------------------------------------- /lib/det_ops/anchors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate Anchors 3 | """ 4 | 5 | import math 6 | import torch 7 | 8 | 9 | def mesh_grid(x, y): 10 | """ mesh grid 11 | 12 | """ 13 | a = torch.arange(0, x) 14 | b = torch.arange(0, y) 15 | xx = a.repeat(y).view(-1, 1) 16 | yy = b.view(-1, 1).repeat(1, x).view(-1, 1) 17 | 18 | return torch.cat([xx, yy], dim=1).float() 19 | 20 | 21 | def compute_anchor_whs(num_layers, areas, aspect_ratios, sizes): 22 | anchors = [] 23 | for i in range(len(areas)): 24 | area = areas[i] 25 | for ar in aspect_ratios: 26 | h = math.sqrt(area / ar) 27 | w = h * ar 28 | for s in sizes: 29 | anchor_h = h * s 30 | anchor_w = w * s 31 | anchors.append([anchor_w, anchor_h]) 32 | # M * K * 2 33 | # Faster R-CNN: 1*K*2 (1x9x2) 34 | # FPN: 5*K*2 (5x3x2) 35 | # RetinaNet: 5*K*2 (5*9*2) 36 | return torch.Tensor(anchors).view(num_layers, -1, 2) 37 | 38 | 39 | def generate_anchors(anchor_whs, input_size, strides): 40 | """ generate anchors 41 | """ 42 | boxes = [] 43 | num_strides = len(strides) 44 | num_anchors = anchor_whs.shape[1] 45 | 46 | for i in range(num_strides): 47 | stride = strides[i] 48 | feature_size = input_size / stride 49 | fmw, fmh = int(math.ceil(feature_size[0])), int(math.ceil(feature_size[1])) 50 | xy = mesh_grid(fmh, fmw) + 0.5 # shift to center 51 | xy = (xy * stride).view(fmh, fmw, 1, 2).expand(fmh, fmw, num_anchors, 2) 52 | wh = anchor_whs[i].view(1, 1, num_anchors, 2).expand(fmh, fmw, num_anchors, 2) 53 | box = torch.cat([xy, wh], dim=3) 54 | boxes.append(box.view(-1, 4)) 55 | boxes = torch.cat(boxes, 0) 56 | # box: H * W * self._num_anchors * 2 57 | return boxes 58 | -------------------------------------------------------------------------------- /models/fpn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Feature Pyramid Network for Object Detection 3 | 4 | """ 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torchvision.models import resnet 9 | import torch.nn as nn 10 | from .resnet import ResNet50Stages 11 | 12 | 13 | class FPN50(nn.Module): 14 | 15 | def __init__(self, pretrained_path): 16 | super(FPN50, self).__init__() 17 | self.backbone = ResNet50Stages(pretrained_path) 18 | 19 | self.lateral_layer1 = nn.Conv2d(2048, 256, 1) 20 | self.lateral_layer2 = nn.Conv2d(1024, 256, 1) 21 | self.lateral_layer3 = nn.Conv2d(512, 256, 1) 22 | 23 | self.conv6 = nn.Conv2d(2048, 256, 3, padding=1, stride=2) 24 | self.conv7 = nn.Conv2d(256, 256, 3, padding=1, stride=2) 25 | self.relu = nn.ReLU(inplace=True) 26 | 27 | self._weight_initialize() 28 | 29 | def _weight_initialize(self): 30 | 31 | self.lateral_layer1.weight.data.normal_(std=0.01) 32 | self.lateral_layer1.bias.data.fill_(0.0) 33 | 34 | self.lateral_layer2.weight.data.normal_(std=0.01) 35 | self.lateral_layer2.bias.data.fill_(0.0) 36 | 37 | self.lateral_layer3.weight.data.normal_(std=0.01) 38 | self.lateral_layer3.bias.data.fill_(0.0) 39 | 40 | 
self.conv6.weight.data.normal_(std=0.01)
41 |         self.conv6.bias.data.fill_(0.0)
42 | 
43 |         self.conv7.weight.data.normal_(std=0.01)
44 |         self.conv7.bias.data.fill_(0.0)
45 | 
46 |     def upsample_add(self, x, y):
47 |         _, _, h, w = y.size()
48 |         return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + y  # upsample top-down x, then add lateral y
49 | 
50 |     def forward(self, x):
51 |         c3, c4, c5 = self.backbone(x)
52 |         # c2: 64*4=256 c3: 128*4=512 c4: 256*4=1024 c5: 512*4=2048
53 | 
54 |         p5 = self.lateral_layer1(c5)
55 | 
56 |         p4 = self.lateral_layer2(c4)
57 |         p4 = self.upsample_add(p5, p4)
58 | 
59 |         p3 = self.lateral_layer3(c3)
60 |         p3 = self.upsample_add(p4, p3)
61 | 
62 |         p6 = self.conv6(c5)
63 |         p7 = self.conv7(self.relu(p6))
64 | 
65 |         return p3, p4, p5, p6, p7
66 | 
--------------------------------------------------------------------------------
/lib/bbox/box_transform.py:
--------------------------------------------------------------------------------
1 | """
2 | BBox transform
3 | """
4 | 
5 | import torch
6 | 
7 | 
8 | def bbox_transform(boxes, gtboxes):
9 |     """ Bounding Box Transform
10 |     from groundtruth boxes and proposal boxes to deltas
11 | 
12 |     Args:
13 |         boxes: [N, 4] torch.Tensor (xywh, anchor center format)
14 |         gtboxes: [N, 4] torch.Tensor (xyxy)
15 |     Return:
16 |         delta: [N, 4] torch.Tensor
17 |     """
18 |     gt_w = gtboxes[:, 2] - gtboxes[:, 0] + 1
19 |     gt_h = gtboxes[:, 3] - gtboxes[:, 1] + 1
20 |     # center
21 |     gt_x = gtboxes[:, 0] + 0.5 * gt_w
22 |     gt_y = gtboxes[:, 1] + 0.5 * gt_h
23 | 
24 |     # Anchors [x,y,w,h]
25 |     anchor_x = boxes[:, 0]
26 |     anchor_y = boxes[:, 1]
27 |     anchor_w = boxes[:, 2]
28 |     anchor_h = boxes[:, 3]
29 |     # anchor_w = boxes[:, 2] - boxes[:, 0] + 1
30 |     # anchor_h = boxes[:, 3] - boxes[:, 1] + 1
31 |     # # center
32 |     # anchor_x = boxes[:, 0] + 0.5 * anchor_w
33 |     # anchor_y = boxes[:, 1] + 0.5 * anchor_h
34 | 
35 |     delta_x = (gt_x - anchor_x) / anchor_w
36 |     delta_y = (gt_y - anchor_y) / anchor_h
37 |     delta_w = torch.log(gt_w / anchor_w)
38 |     delta_h = torch.log(gt_h / anchor_h)
39 | 
40 |     # [N, 4]
41 |     return torch.stack([delta_x, delta_y, delta_w, delta_h]).transpose(0, 1)
42 | 
43 | 
44 | def bbox_transform_inv(boxes, delta):
45 |     """ Inverse Bounding Box Transform
46 |     from deltas and proposal boxes to predicted boxes
47 |     Args:
48 |         boxes: [N, 4] torch.Tensor (xywh)
49 |         delta: [N, 4] torch.Tensor (xywh)
50 |     Return:
51 |         pred: [N, 4] torch.Tensor (xyxy)
52 |     """
53 |     pred_boxes = torch.zeros_like(boxes)
54 |     pred_x = boxes[:, 0] + boxes[:, 2] * delta[:, 0]
55 |     pred_y = boxes[:, 1] + boxes[:, 3] * delta[:, 1]
56 |     pred_w = boxes[:, 2] * torch.exp(delta[:, 2])
57 |     pred_h = boxes[:, 3] * torch.exp(delta[:, 3])
58 | 
59 |     pred_boxes[:, 0] = pred_x - 0.5 * pred_w
60 |     pred_boxes[:, 1] = pred_y - 0.5 * pred_h
61 |     pred_boxes[:, 2] = pred_x + 0.5 * pred_w
62 |     pred_boxes[:, 3] = pred_y + 0.5 * pred_h
63 | 
64 |     return pred_boxes
65 | 
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     pass
70 | 
--------------------------------------------------------------------------------
/cfgs/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Config File
3 | """
4 | 
5 | 
6 | config = {
7 | 
8 |     "synth_baseline": {
9 |         # lr and general config
10 |         'base_lr': 1e-2,
11 |         "lr_decay": [60000, 80000],
12 |         "workers": 8,
13 |         "num_classes": 21,
14 |         "weight_decay": 1e-4,
15 |         "epochs": 200,
16 | 
17 |         "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth',
18 |         "data_dir": "/public_datasets/SynthText",
19 | 
20 |         # anchor config
21 |         "positive_anchor_threshold": 0.5,
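        # IoU >= 0.5 marks an anchor positive, IoU < 0.4 negative; the band in
        # between is ignored by lib/det_ops/anchor_target.py. With 3 sizes x 3
        # aspect ratios there are 9 anchors per feature-map location.
22 | 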
"negative_anchor_threshold": 0.4, 23 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)], 24 | "aspect_ratios": [1, 3, 5], 25 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2], 26 | "strides": [8, 16, 32, 64, 128], 27 | "base_size": 8, 28 | 29 | # dataset 30 | "image_scales": [600], 31 | "max_image_size": 1000, 32 | 33 | # test config 34 | "pre_nms_boxes": 1000, 35 | "test_nms": 0.5, 36 | "test_max_boxes": 300, 37 | "cls_thresh": 0.05, 38 | 39 | # log 40 | "logdir": "log", 41 | "tb_dump_dir": "", 42 | "model_dump_dir": "", 43 | }, 44 | 45 | "icdar_baseline": { 46 | # lr and general config 47 | 'base_lr': 1e-2, 48 | "lr_decay": [60000, 80000], 49 | "workers": 8, 50 | "num_classes": 21, 51 | "weight_decay": 1e-4, 52 | "epochs": 200, 53 | 54 | "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth', 55 | "data_dir": "/public_datasets/Text/icdar2015/", 56 | 57 | # anchor config 58 | "positive_anchor_threshold": 0.5, 59 | "negative_anchor_threshold": 0.4, 60 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)], 61 | "aspect_ratios": [1, 3, 5], 62 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2], 63 | "strides": [8, 16, 32, 64, 128], 64 | "base_size": 8, 65 | 66 | # dataset 67 | "image_scales": [600], 68 | "max_image_size": 1000, 69 | 70 | # test config 71 | "pre_nms_boxes": 1000, 72 | "test_nms": 0.5, 73 | "test_max_boxes": 300, 74 | "cls_thresh": 0.05, 75 | 76 | # log 77 | "logdir": "log", 78 | "tb_dump_dir": "", 79 | "model_dump_dir": "", 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i 
+ 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/det_ops/anchor_target.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Anchor Layer 4 | 5 | 6 | """ 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from lib.det_ops.anchors import compute_anchor_whs, generate_anchors 11 | import sys 12 | sys.path.append('../') 13 | from lib.bbox import bbox, box_transform 14 | from IPython import embed 15 | 16 | 17 | class AnchorLayer: 18 | """ Anchor Layer 19 | """ 20 | def __init__(self, strides, areas, aspect_ratios, sizes): 21 | self.aspect_ratios = aspect_ratios 22 | self.areas = areas 23 | self.strides = strides 24 | self.sizes = sizes 25 | 26 | self._anchor_sizes = self._compute_anchor_size() 27 | 28 | # self._num_anchors = len(self.scales) * len(self.aspect_ratios) 29 | 30 | def _compute_anchor_size(self): 31 | return compute_anchor_whs(len(self.strides), self.areas, self.aspect_ratios, self.sizes) 32 | 33 | def _generate_anchors(self, input_size): 34 | boxes = generate_anchors(self._anchor_sizes, input_size, self.strides) 35 | return boxes 36 | 37 | def assign(self, gt_boxes, labels, input_size, neg_thresh=0.4, pos_thresh=0.5): 38 | """ assign groundtruth box to anchor box 39 | 40 | """ 41 | anchor_boxes = self._generate_anchors(input_size) 42 | if labels.shape[0] == 0: 43 | return torch.LongTensor([0]*anchor_boxes.shape[0]), torch.zeros_like(anchor_boxes) 44 | # M * N 45 | xyxy_anchors = bbox.xywh2xyxy(anchor_boxes) 46 | ious = bbox.box_overlaps(xyxy_anchors, gt_boxes) 47 | max_ious, max_inds = ious.max(1) 48 | # M * 4 49 | matched_boxes = gt_boxes[max_inds] 50 | box_targets = box_transform.bbox_transform(anchor_boxes, matched_boxes) 51 | 52 | cls_targets = labels[max_inds] 53 | # negative 54 | cls_targets[max_ious < neg_thresh] = 0 55 | # ignore 56 | cls_targets[(max_ious > neg_thresh) & (max_ious < pos_thresh)] = -1 57 | return cls_targets, box_targets 58 | 59 | 60 | if __name__ == '__main__': 61 | # RetinaNet settings 62 | strides = [8, 16, 32, 64, 128] 63 | aspect_ratios = [0.5, 1, 2] 64 | sizes = [1, 2**(1/3), 2**(2/3)] 65 | areas = [32**2, 64**2, 128**2, 256**2, 512**2] 66 | anchor_layer = AnchorLayer(strides=strides, areas=areas, aspect_ratios=aspect_ratios, sizes=sizes) 67 | 68 | boxes = torch.Tensor([[10, 20, 44, 56], [50, 34, 260, 340], 69 | [70, 80, 190, 410], [360, 270, 500, 600]]) 70 | labels = torch.LongTensor([3, 1, 1, 4]) 71 | cls_target, box_target = anchor_layer.assign(boxes, labels, torch.FloatTensor([600, 600]), 0.4, 0.5) 72 | 73 | embed() 74 | -------------------------------------------------------------------------------- /lib/nms/setup.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from distutils.core import setup, Extension 3 | 4 | import numpy as np 5 | from Cython.Build import cythonize 6 | from Cython.Distutils import build_ext 7 | 8 | # extensions 9 | ext_args = dict( 10 | include_dirs=[np.get_include()], 11 | language='c++', 12 | extra_compile_args={ 13 | 'cc': ['-Wno-unused-function', '-Wno-write-strings'], 14 | 'nvcc': 
['-c', '--compiler-options', '-fPIC'], 15 | }, 16 | ) 17 | 18 | extensions = [ 19 | Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args), 20 | Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args), 21 | Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args), 22 | ] 23 | 24 | 25 | def customize_compiler_for_nvcc(self): 26 | """inject deep into distutils to customize how the dispatch 27 | to cc/nvcc works. 28 | If you subclass UnixCCompiler, it's not trivial to get your subclass 29 | injected in, and still have the right customizations (i.e. 30 | distutils.sysconfig.customize_compiler) run on it. So instead of going 31 | the OO route, I have this. Note, it's kindof like a wierd functional 32 | subclassing going on.""" 33 | 34 | # tell the compiler it can processes .cu 35 | self.src_extensions.append('.cu') 36 | 37 | # save references to the default compiler_so and _comple methods 38 | default_compiler_so = self.compiler_so 39 | super = self._compile 40 | 41 | # now redefine the _compile method. This gets executed for each 42 | # object but distutils doesn't have the ability to change compilers 43 | # based on source extension: we add it. 44 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 45 | if osp.splitext(src)[1] == '.cu': 46 | # use the cuda for .cu files 47 | self.set_executable('compiler_so', 'nvcc') 48 | # use only a subset of the extra_postargs, which are 1-1 translated 49 | # from the extra_compile_args in the Extension class 50 | postargs = extra_postargs['nvcc'] 51 | else: 52 | postargs = extra_postargs['cc'] 53 | 54 | super(obj, src, ext, cc_args, postargs, pp_opts) 55 | # reset the default compiler_so, which we might have changed for cuda 56 | self.compiler_so = default_compiler_so 57 | 58 | # inject our redefined _compile method into the class 59 | self._compile = _compile 60 | 61 | 62 | # run the customize_compiler 63 | class custom_build_ext(build_ext): 64 | 65 | def build_extensions(self): 66 | customize_compiler_for_nvcc(self.compiler) 67 | build_ext.build_extensions(self) 68 | 69 | 70 | setup( 71 | name='nms', 72 | cmdclass={'build_ext': custom_build_ext}, 73 | ext_modules=cythonize(extensions), 74 | ) 75 | -------------------------------------------------------------------------------- /datasets/icdar15.py: -------------------------------------------------------------------------------- 1 | """ 2 | ICDAR2015 for Text Detection 3 | """ 4 | 5 | import os 6 | import cv2 7 | import json 8 | import torch 9 | import random 10 | import numpy as np 11 | from PIL import Image 12 | from scipy.io import loadmat 13 | from torch.utils.data import Dataset 14 | from torchvision.transforms import transforms 15 | from datasets.utils import normalize_image, get_im_scale 16 | 17 | 18 | CLASSES = ('text',) 19 | NUM_CLASSES = 2 20 | 21 | 22 | class ICDAR15(Dataset): 23 | 24 | def __init__(self, dataroot, config, imageset='train'): 25 | assert imageset == 'train' or imageset == 'val' or imageset == 'all' 26 | self._imageset = imageset 27 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset)) 28 | self._base_dir = os.path.join(dataroot, '{}_images'.format(imageset)) 29 | self.name = 'ICDAR15' 30 | self.config = config 31 | self.annotations = self._read_annotations() 32 | 33 | def _read_annotations(self): 34 | # im path -> annotations 35 | with open(self._annotation_file, 'r') as f: 36 | lines = f.readlines() 37 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines)) 38 | return lines 39 | 40 | def __len__(self): 41 | return 
len(self.annotations) 42 | 43 | def __getitem__(self, idx): 44 | annotation = self.annotations[idx] 45 | im_name = annotation['im_name'] 46 | gt_boxes = annotation['gtboxes'] 47 | try: 48 | img = Image.open(os.path.join(self._base_dir, im_name)) 49 | except OSError as e: 50 | idx = random.randint(0, len(self)) 51 | annotation = self.annotations[idx] 52 | im_name = annotation['im_name'] 53 | gt_boxes = annotation['gtboxes'] 54 | img = Image.open(os.path.join(self._base_dir, im_name)) 55 | 56 | if self._imageset == 'val': 57 | # testing or validation mode, original scale 58 | img = np.array(img).astype('float32') 59 | h, w = img.shape[:2] 60 | resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['test_image_size'][0], 61 | max_size=self.config['test_max_image_size']) 62 | img = cv2.resize(img, (resize_w, resize_h)) 63 | img = normalize_image(img) 64 | img = img.transpose(2, 0, 1) 65 | img = torch.Tensor(img) 66 | return img, im_name, scale, (h, w) 67 | 68 | img = np.array(img).astype('float32') 69 | labels = np.ones(len(gt_boxes), dtype=np.int32) 70 | labels = torch.LongTensor(labels) 71 | boxes = np.array(gt_boxes, dtype=np.float32) 72 | # C, H, W 73 | 74 | return img, labels, boxes 75 | 76 | 77 | -------------------------------------------------------------------------------- /datasets/synthtext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synth800K for Text Detection 3 | """ 4 | 5 | import os 6 | import cv2 7 | import json 8 | import torch 9 | import random 10 | import numpy as np 11 | from PIL import Image 12 | from scipy.io import loadmat 13 | from torch.utils.data import Dataset 14 | from torchvision.transforms import transforms 15 | from datasets.utils import normalize_image, get_im_scale 16 | 17 | 18 | CLASSES = ('text',) 19 | NUM_CLASSES = 2 20 | SYNTHTEXT_ROOT = '/public_datasets/SynthText' 21 | 22 | 23 | class SynthText(Dataset): 24 | 25 | def __init__(self, dataroot,config, imageset='train'): 26 | assert imageset == 'train' or imageset == 'val' or imageset == 'all' 27 | self._imageset = imageset 28 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset)) 29 | self._base_dir = dataroot 30 | self.config = config 31 | self.name = 'SynthText80K' 32 | self.annotations = self._read_annotations() 33 | 34 | def _read_annotations(self): 35 | # im path -> annotations 36 | with open(self._annotation_file, 'r') as f: 37 | lines = f.readlines() 38 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines)) 39 | return lines 40 | 41 | def __len__(self): 42 | return len(self.annotations) 43 | 44 | def __getitem__(self, idx): 45 | annotation = self.annotations[idx] 46 | im_name = annotation['im_name'] 47 | gt_boxes = annotation['gtboxes'] 48 | try: 49 | img = Image.open(os.path.join(self._base_dir, im_name)) 50 | except OSError as e: 51 | idx = random.randint(0, len(self)) 52 | annotation = self.annotations[idx] 53 | im_name = annotation['im_name'] 54 | gt_boxes = annotation['gtboxes'] 55 | img = Image.open(os.path.join(self._base_dir, im_name)) 56 | 57 | if self._imageset == 'val': 58 | # testing or validation mode, original scale 59 | img = np.array(img).astype('float32') 60 | h, w = img.shape[:2] 61 | resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['test_image_size'][0], 62 | max_size=self.config['test_max_image_size']) 63 | img = cv2.resize(img, (resize_w, resize_h)) 64 | img = normalize_image(img) 65 | img = img.transpose(2, 0, 1) 66 | img = torch.Tensor(img) 67 | return 
img, im_name, scale, (h, w) 68 | 69 | img = np.array(img).astype('float32') 70 | labels = np.ones(len(gt_boxes), dtype=np.int32) 71 | labels = torch.LongTensor(labels) 72 | boxes = np.array(gt_boxes, dtype=np.float32) 73 | # C, H, W 74 | 75 | return img, labels, boxes 76 | 77 | -------------------------------------------------------------------------------- /lib/det_ops/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Loss functions for Detection 4 | 5 | """ 6 | import math 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from IPython import embed 12 | __all__ = ['SmoothL1Loss', 'SoftmaxCrossEntropy', 'SigmoidCrossEntropy', 'SigmoidFocalLoss'] 13 | 14 | 15 | SoftmaxCrossEntropy = nn.CrossEntropyLoss 16 | SigmoidCrossEntropy = nn.BCEWithLogitsLoss 17 | 18 | 19 | class SigmoidFocalLoss(nn.Module): 20 | """ Focal Loss 21 | 22 | Args: 23 | 24 | Input: 25 | pred: [] 26 | target: [] 27 | Output: 28 | 29 | """ 30 | def __init__(self, background=0, gamma=2, alpha=0.25): 31 | super(SigmoidFocalLoss, self).__init__() 32 | self.gamma = gamma 33 | self.alpha = alpha 34 | self.background = background 35 | 36 | def forward(self, pred, target): 37 | # pred.shape = [B, K, C] 38 | # target.shape = [B, N] 39 | B, N, C = pred.size() 40 | pred_sigmoid = pred.sigmoid() 41 | # # ignore: [B, N] 42 | # keep_mask = target > -1 43 | # # [B, N], ignore and background shared 0 index 44 | # keep_mask = keep_mask.long() 45 | mask = (target > -1) 46 | new_target = target * mask.long() 47 | new_target = new_target.reshape((new_target.size()[0], new_target.size()[1], 1)) 48 | onehot_target = torch.zeros((B, N, C+1)).cuda() 49 | onehot_target.scatter_(2, new_target, 1.0) 50 | onehot = onehot_target[:, :, 1:].float() 51 | mask = mask.unsqueeze(2).float() 52 | 53 | # pred_sigmoid = pred_sigmoid.clamp(min=1e-6, max=1-1e-6) 54 | # bce = -(onehot * torch.log(pred_sigmoid) + (1-onehot)*torch.log(1-pred_sigmoid)) 55 | 56 | weight = self.alpha*((1-pred_sigmoid).pow(self.gamma))*onehot \ 57 | + (1-self.alpha)*(pred_sigmoid.pow(self.gamma))*(1-onehot) 58 | weight = weight * mask 59 | avg_factor = torch.sum(target > 0, dim=1).float() 60 | # embed() 61 | loss = F.binary_cross_entropy_with_logits(pred, onehot, weight, reduction='none').sum(dim=1).sum(dim=1) 62 | loss = loss.div_(avg_factor.clamp(min=1.0)).mean() 63 | 64 | return loss 65 | 66 | 67 | class SmoothL1Loss(nn.Module): 68 | 69 | def __init__(self): 70 | super(SmoothL1Loss, self).__init__() 71 | self.smooth_l1 = nn.SmoothL1Loss(reduction='none') 72 | 73 | def forward(self, offset, target, cls_target): 74 | # ignore background and ignore label 75 | # offset B*N*4 76 | # target B*N*4 77 | # cls: B*N*C 78 | # bg_mask = cls_target == 0 79 | # ig_mask = cls_target == -1 80 | 81 | mask = cls_target > 0 # ig_mask * bg_mask 82 | mask = mask.float() 83 | loss_raw = (self.smooth_l1(offset, target).sum(2)) * mask 84 | loss = loss_raw.sum(dim=1).div_(mask.sum(dim=1).clamp(min=1.0)).mean() 85 | return loss -------------------------------------------------------------------------------- /models/retina.py: -------------------------------------------------------------------------------- 1 | """ 2 | RetinaNet Model 3 | backbone: resnet50 + FPN 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from models import fpn 10 | 11 | 12 | class RetinaNetHead(nn.Module): 13 | 14 | def __init__(self, num_classes, num_anchors): 15 | super(RetinaNetHead, 
self).__init__() 16 | self.num_classes = num_classes 17 | 18 | self.cls_branch = nn.Sequential( 19 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 20 | nn.ReLU(inplace=True), 21 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 24 | nn.ReLU(inplace=True), 25 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.cls_score = nn.Conv2d(256, out_channels=num_classes*num_anchors, kernel_size=3, stride=1, padding=1) 29 | 30 | self.bbox_branch = nn.Sequential( 31 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 32 | nn.ReLU(inplace=True), 33 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 34 | nn.ReLU(inplace=True), 35 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 36 | nn.ReLU(inplace=True), 37 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(256, out_channels=num_anchors*4, kernel_size=3, stride=1, padding=1) 40 | ) 41 | self._initialize_weights() 42 | 43 | def _initialize_weights(self): 44 | for m in self.cls_branch.modules(): 45 | if isinstance(m, nn.Conv2d): 46 | m.weight.data.normal_(0, 0.01) 47 | m.bias.data.fill_(0) 48 | 49 | for m in self.bbox_branch.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | m.weight.data.normal_(0, 0.01) 52 | m.bias.data.fill_(0) 53 | 54 | self.cls_score.weight.data.normal_(0, 0.01) 55 | pi = 0.01 56 | self.cls_score.bias.data.fill_(-np.log((1 - pi) / pi)) 57 | 58 | def forward(self, x): 59 | bbox_output = self.bbox_branch(x) 60 | bbox_output = bbox_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, 4) 61 | cls_output = self.cls_score(self.cls_branch(x)) 62 | cls_output = cls_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, self.num_classes) 63 | return cls_output, bbox_output 64 | 65 | 66 | class RetinaNet(nn.Module): 67 | 68 | def __init__(self, num_classes, num_anchors, pretrained_path): 69 | super(RetinaNet, self).__init__() 70 | self.fpn = fpn.FPN50(pretrained_path) 71 | self.head = RetinaNetHead(num_classes, num_anchors) 72 | 73 | def forward(self, x): 74 | # [P3, P4, P5, P6, P7] 75 | # stride: [8, 16, 32, 64, 128] 76 | feature_pyramids = self.fpn(x) 77 | cls_outputs = [] 78 | bbox_outputs = [] 79 | for fp in feature_pyramids: 80 | cls_output, bbox_output = self.head(fp) 81 | cls_outputs.append(cls_output) 82 | bbox_outputs.append(bbox_output) 83 | 84 | cls_outputs = torch.cat(cls_outputs, dim=1) 85 | bbox_outputs = torch.cat(bbox_outputs, dim=1) 86 | 87 | return cls_outputs, bbox_outputs 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /datasets/minibatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create Mini Batch 3 | """ 4 | import cv2 5 | import torch 6 | import random 7 | import numpy as np 8 | from datasets.utils import flip_img_boxes 9 | from lib.det_ops import anchor_target 10 | from datasets.utils import normalize_image, get_im_scale 11 | 12 | 13 | def create_minibatch_func(config): 14 | aspect_ratios = config['aspect_ratios'] 15 | anchor_sizes = config['anchor_sizes'] 16 | anchor_areas = config['anchor_areas'] 17 | strides = config['strides'] 18 | 19 | anchor_layer = anchor_target.AnchorLayer(aspect_ratios=aspect_ratios, 20 | sizes=anchor_sizes, 21 | areas=anchor_areas, 22 | 
strides=strides) 23 | 24 | def collate_minibatch(batch): 25 | # (img, labels, boxes) 26 | # img: [H, W, C] 27 | # labels: [N] 28 | # boxes: [N, 4] 29 | batch_size = len(batch) 30 | max_size = config['max_image_size'] 31 | # [N, 1] 32 | target_size_inds = np.random.randint( 33 | 0, high=len(config['image_scales']), size=batch_size 34 | ) 35 | 36 | image_shapes = np.zeros((batch_size, 2), dtype=np.int) 37 | image_scales = np.zeros(batch_size, dtype=np.float) 38 | batch_height = 0 39 | batch_width = 0 40 | for i in range(batch_size): 41 | h, w = batch[i][0].shape[:2] 42 | target_size = config['image_scales'][target_size_inds[i]] 43 | h_, w_, s_ = get_im_scale(h, w, target_size, max_size) 44 | image_shapes[i, 0] = h_ 45 | image_shapes[i, 1] = w_ 46 | image_scales[i] = s_ 47 | batch_height = max(h_, batch_height) 48 | batch_width = max(w_, batch_width) 49 | 50 | # pad images to support the last stride 51 | max_stride = strides[-1] 52 | batch_width = int(np.ceil(batch_width/max_stride)*max_stride) 53 | batch_height = int(np.ceil(batch_height/max_stride)*max_stride) 54 | 55 | labels = [] 56 | gtboxes = [] 57 | batch_images = torch.zeros((batch_size, 3, batch_height, batch_width)) 58 | input_size = np.array([batch_height, batch_width]) 59 | for i in range(batch_size): 60 | img, label, boxes = batch[i] 61 | boxes = boxes.astype('float32') 62 | h, w = image_shapes[i] 63 | scale = image_scales[i] 64 | img = cv2.resize(img, (w, h)) 65 | 66 | # OpenCV resize (W, H) 67 | boxes = boxes * scale 68 | if random.random() < 0.5: 69 | img, boxes = flip_img_boxes(img, boxes) 70 | 71 | # transform or data augmentation 72 | img = normalize_image(img) 73 | img = img.transpose(2, 0, 1) 74 | img = torch.Tensor(img) 75 | # assign anchors 76 | boxes = torch.Tensor(boxes) 77 | label, boxes = anchor_layer.assign(boxes, label, input_size=input_size, 78 | neg_thresh=config['negative_anchor_threshold'], 79 | pos_thresh=config['positive_anchor_threshold']) 80 | 81 | labels.append(label.unsqueeze(0)) 82 | gtboxes.append(boxes.unsqueeze(0)) 83 | # print(img.shape, batch_images.shape) 84 | batch_images[i, :, :h, :w] = img 85 | 86 | labels = torch.cat(labels, dim=0) 87 | gtboxes = torch.cat(gtboxes, dim=0) 88 | return batch_images, labels, gtboxes 89 | 90 | return collate_minibatch 91 | -------------------------------------------------------------------------------- /evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | import json 5 | import argparse 6 | import numpy as np 7 | from IPython import embed 8 | 9 | 10 | def calculate_ap(recall, precision): 11 | mrec = np.concatenate(([0.], recall, [1.])) 12 | mpre = np.concatenate(([0.], precision, [0.])) 13 | 14 | # compute the precision envelope 15 | for i in range(mpre.size - 1, 0, -1): 16 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 17 | 18 | # to calculate area under PR curve, look for points 19 | # where X axis (recall) changes value 20 | i = np.where(mrec[1:] != mrec[:-1])[0] 21 | 22 | # and sum (\Delta recall) * prec 23 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 24 | 25 | return ap 26 | 27 | 28 | def eval_ap(predict_path, gt_path, iou_thresh): 29 | 30 | with open(predict_path, 'r') as f: 31 | lines = f.readlines() 32 | predictions = [json.loads(x.rstrip('\n')) for x in lines] 33 | 34 | with open(gt_path, 'r') as f: 35 | lines = f.readlines() 36 | gt = [json.loads(x.rstrip('\n')) for x in lines] 37 | 38 | predict_boxes = [] 39 | for p in predictions: 40 | im_name = p['image_id'] 41 | 
boxes = p['result'] 42 | for bb in boxes: 43 | bb['im_name'] = im_name 44 | predict_boxes.append(bb) 45 | 46 | gt_boxes = dict() 47 | npos = 0 48 | for g in gt: 49 | gt_boxes[g['im_name']] = {'box': np.array(g['gtboxes']), 50 | 'flag': np.zeros(len(g['gtboxes']), dtype=int)} 51 | npos += len(g['gtboxes']) 52 | 53 | # sort 54 | predict_boxes = sorted(predict_boxes, key=lambda x: x['prob'], reverse=True) 55 | tp = np.zeros(len(predict_boxes)) 56 | fp = np.zeros(len(predict_boxes)) 57 | for i in range(len(predict_boxes)): 58 | box = predict_boxes[i] 59 | im_name = box['im_name'] 60 | _gt_boxes = gt_boxes[im_name]['box'] 61 | bb = box['bbox'] 62 | bb = np.array(bb) 63 | 64 | if len(_gt_boxes) > 0: 65 | 66 | ixmin = np.maximum(_gt_boxes[:, 0], bb[0]) 67 | iymin = np.maximum(_gt_boxes[:, 1], bb[1]) 68 | ixmax = np.minimum(_gt_boxes[:, 2], bb[2]) 69 | iymax = np.minimum(_gt_boxes[:, 3], bb[3]) 70 | iw = np.maximum(ixmax - ixmin + 1., 0.) 71 | ih = np.maximum(iymax - iymin + 1., 0.) 72 | inters = iw * ih 73 | 74 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 75 | (_gt_boxes[:, 2] - _gt_boxes[:, 0] + 1.) * 76 | (_gt_boxes[:, 3] - _gt_boxes[:, 1] + 1.) - inters) 77 | 78 | overlaps = inters / uni 79 | ovmax = np.max(overlaps) 80 | jmax = np.argmax(overlaps) 81 | 82 | if ovmax > iou_thresh: 83 | if gt_boxes[im_name]['flag'][jmax] > 0: 84 | fp[i] = 1 85 | else: 86 | tp[i] = 1 87 | gt_boxes[im_name]['flag'][jmax] = 1 88 | else: 89 | fp[i] = 1 90 | 91 | fp = np.cumsum(fp) 92 | tp = np.cumsum(tp) 93 | recall = tp / float(npos) 94 | # avoid divide by zero in case the first detection matches a difficult 95 | # ground truth 96 | precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 97 | 98 | ap = calculate_ap(recall, precision) 99 | 100 | return ap 101 | 102 | 103 | def main(): 104 | 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('-p', '--predict', type=str, default='', required=True) 107 | parser.add_argument('-g', '--gt', type=str, default='/public_datasets/SynthText/val.odgt') 108 | parser.add_argument('-t', '--thresh', type=float, default=0.5) 109 | 110 | args = parser.parse_args() 111 | 112 | ap = eval_ap(args.predict, args.gt, args.thresh) 113 | 114 | print("eval finished, ap={:.3f}".format(ap)) 115 | 116 | 117 | if __name__ == '__main__': 118 | 119 | main() 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /lib/bbox/bbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Bounding Box 4 | 5 | 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from IPython import embed 11 | 12 | def np_xywh2xyxy(boxes): 13 | # [x1,y1,w,h] 14 | boxes = np.hstack( 15 | (boxes[:, 0:2], boxes[:, 0:2] + np.maximum(0, boxes[:, 2:4] - 1)) 16 | ) 17 | 18 | return boxes 19 | 20 | 21 | def clip_boxes(boxes, image_height, image_width): 22 | boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=image_width-1) 23 | boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=image_width-1) 24 | boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=image_height-1) 25 | boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=image_height-1) 26 | return boxes 27 | 28 | 29 | def filter_boxes(boxes): 30 | 31 | keep = [] 32 | for i in range(boxes.shape[0]): 33 | if boxes[i, 0] < boxes[i, 2] and boxes[i, 1] < boxes[i, 3]: 34 | keep.append(i) 35 | boxes = boxes[keep] 36 | return boxes 37 | 38 | 39 | def 
xywh2xyxy(boxes): 40 | """ xywh -> xyxy 41 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height) 42 | Args: 43 | boxes: torch.FloatTensor[N,4] 44 | """ 45 | 46 | x1 = boxes[:, 0] - 0.5 * boxes[:, 2] 47 | y1 = boxes[:, 1] - 0.5 * boxes[:, 3] 48 | x2 = boxes[:, 0] + 0.5 * boxes[:, 2] 49 | y2 = boxes[:, 1] + 0.5 * boxes[:, 3] 50 | 51 | return torch.stack([x1, y1, x2, y2]).transpose(0, 1) 52 | 53 | 54 | def xyxy2xywh(boxes): 55 | """ xyxy -> xywh 56 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height) 57 | Args: 58 | boxes: torch.FloatTensor[N,4] 59 | """ 60 | w = boxes[:, 2] - boxes[:, 0] + 1 61 | h = boxes[:, 3] - boxes[:, 1] + 1 62 | # center 63 | x = boxes[:, 0] + 0.5 * w 64 | y = boxes[:, 1] + 0.5 * h 65 | 66 | return torch.stack([x, y, w, h]).transpose(0, 1) 67 | 68 | 69 | def box_overlaps(box1, box2): 70 | """ Box IoU(Insertion over Union)(xmin,ymin,xmax,ymax) 71 | Args: 72 | box1: torch.FloatTensor[N, 4], 73 | box2: torch.FloatTensor[M, 4] 74 | mode: box representation format 75 | """ 76 | # N = box1.size()[0] 77 | # M = box2.size()[0] 78 | 79 | # NxMx2 80 | lo = torch.max(box1[:, None, :2], box2[:, :2]) 81 | hi = torch.min(box1[:, None, 2:], box2[:, 2:]) 82 | 83 | inner_rect = (hi - lo + 1).clamp(0) 84 | # NxMx1 85 | inner = inner_rect[:, :, 0] * inner_rect[:, :, 1] 86 | 87 | area1 = (box1[:, 2]-box1[:, 0]+1)*(box1[:, 3]-box1[:, 1]+1) 88 | area2 = (box2[:, 2]-box2[:, 0]+1)*(box2[:, 3]-box2[:, 1]+1) 89 | 90 | iou = inner / (area1[:, None] + area2 - inner) 91 | 92 | return iou 93 | 94 | 95 | def box_nms(boxes, scores, threshold): 96 | """Non maximum suppression. 97 | Args: 98 | boxes: (tensor) bounding boxes, sized [N,4]. 99 | scores: (tensor) bbox scores, sized [N,]. 100 | threshold: (float) overlap threshold. 101 | Returns: 102 | keep: (tensor) selected indices. 103 | Reference: 104 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py 105 | """ 106 | x1 = boxes[:, 0] 107 | y1 = boxes[:, 1] 108 | x2 = boxes[:, 2] 109 | y2 = boxes[:, 3] 110 | 111 | areas = (x2-x1+1) * (y2-y1+1) 112 | _, order = scores.sort(0, descending=True) 113 | keep = [] 114 | while order.numel() > 0: 115 | i = order[0] 116 | keep.append(i) 117 | 118 | if order.numel() == 1: 119 | break 120 | 121 | xx1 = x1[order[1:]].clamp(min=x1[i].item()) 122 | yy1 = y1[order[1:]].clamp(min=y1[i].item()) 123 | xx2 = x2[order[1:]].clamp(max=x2[i].item()) 124 | yy2 = y2[order[1:]].clamp(max=y2[i].item()) 125 | 126 | w = (xx2-xx1+1).clamp(min=0) 127 | h = (yy2-yy1+1).clamp(min=0) 128 | inter = w*h 129 | 130 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 131 | 132 | ids = (ovr <= threshold).nonzero().squeeze() 133 | if ids.numel() == 0: 134 | break 135 | order = order[ids+1] 136 | return torch.LongTensor(keep) 137 | 138 | 139 | if __name__ == '__main__': 140 | # TODO: Test it! 
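    # minimal smoke test (a hedged sketch exercising the helpers above):
    # two heavily overlapping boxes and one disjoint box
    b = torch.Tensor([[0, 0, 9, 9], [1, 1, 10, 10], [50, 50, 60, 60]])
    scores = torch.Tensor([0.9, 0.8, 0.7])
    # IoU(b0, b1) = 81 / (100 + 100 - 81) ~ 0.68, so NMS at 0.5 keeps boxes 0 and 2
    print(box_overlaps(b, b))                  # diagonal entries are 1.0
    print(box_nms(b, scores, threshold=0.5))   # expected kept indices: 0 and 2
    # round trip: xyxy2xywh uses the +1 width convention while xywh2xyxy does not,
    # so x2/y2 come back larger by one pixel
    print(xywh2xyxy(xyxy2xywh(b)))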
141 | pass -------------------------------------------------------------------------------- /lib/nms/cpu_soft_nms.pyx: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # ---------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | 12 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 13 | return a if a >= b else b 14 | 15 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 16 | return a if a <= b else b 17 | 18 | def cpu_soft_nms( 19 | np.ndarray[float, ndim=2] boxes_in, 20 | float sigma=0.5, 21 | float Nt=0.3, 22 | float threshold=0.001, 23 | unsigned int method=0 24 | ): 25 | boxes = boxes_in.copy() 26 | cdef unsigned int N = boxes.shape[0] 27 | cdef float iw, ih, box_area 28 | cdef float ua 29 | cdef int pos = 0 30 | cdef float maxscore = 0 31 | cdef int maxpos = 0 32 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 33 | inds = np.arange(N) 34 | 35 | for i in range(N): 36 | maxscore = boxes[i, 4] 37 | maxpos = i 38 | 39 | tx1 = boxes[i,0] 40 | ty1 = boxes[i,1] 41 | tx2 = boxes[i,2] 42 | ty2 = boxes[i,3] 43 | ts = boxes[i,4] 44 | ti = inds[i] 45 | 46 | pos = i + 1 47 | # get max box 48 | while pos < N: 49 | if maxscore < boxes[pos, 4]: 50 | maxscore = boxes[pos, 4] 51 | maxpos = pos 52 | pos = pos + 1 53 | 54 | # add max box as a detection 55 | boxes[i,0] = boxes[maxpos,0] 56 | boxes[i,1] = boxes[maxpos,1] 57 | boxes[i,2] = boxes[maxpos,2] 58 | boxes[i,3] = boxes[maxpos,3] 59 | boxes[i,4] = boxes[maxpos,4] 60 | inds[i] = inds[maxpos] 61 | 62 | # swap ith box with position of max box 63 | boxes[maxpos,0] = tx1 64 | boxes[maxpos,1] = ty1 65 | boxes[maxpos,2] = tx2 66 | boxes[maxpos,3] = ty2 67 | boxes[maxpos,4] = ts 68 | inds[maxpos] = ti 69 | 70 | tx1 = boxes[i,0] 71 | ty1 = boxes[i,1] 72 | tx2 = boxes[i,2] 73 | ty2 = boxes[i,3] 74 | ts = boxes[i,4] 75 | 76 | pos = i + 1 77 | # NMS iterations, note that N changes if detection boxes fall below 78 | # threshold 79 | while pos < N: 80 | x1 = boxes[pos, 0] 81 | y1 = boxes[pos, 1] 82 | x2 = boxes[pos, 2] 83 | y2 = boxes[pos, 3] 84 | s = boxes[pos, 4] 85 | 86 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 87 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 88 | if iw > 0: 89 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 90 | if ih > 0: 91 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 92 | ov = iw * ih / ua #iou between max box and detection box 93 | 94 | if method == 1: # linear 95 | if ov > Nt: 96 | weight = 1 - ov 97 | else: 98 | weight = 1 99 | elif method == 2: # gaussian 100 | weight = np.exp(-(ov * ov)/sigma) 101 | else: # original NMS 102 | if ov > Nt: 103 | weight = 0 104 | else: 105 | weight = 1 106 | 107 | boxes[pos, 4] = weight*boxes[pos, 4] 108 | 109 | # if box score falls below threshold, discard the box by 110 | # swapping with last box update N 111 | if boxes[pos, 4] < threshold: 112 | boxes[pos,0] = boxes[N-1, 0] 113 | boxes[pos,1] = boxes[N-1, 1] 114 | boxes[pos,2] = boxes[N-1, 2] 115 | boxes[pos,3] = boxes[N-1, 3] 116 | boxes[pos,4] = boxes[N-1, 4] 117 | inds[pos] = inds[N-1] 118 | N = N - 1 119 | pos = pos - 1 120 | 121 | pos = pos + 1 122 | 123 | return boxes[:N], inds[:N] 
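# Minimal usage sketch through the wrapper in lib/nms/nms_wrapper.py (assumes the
# extensions were built, e.g. via `make` in lib/nms):
#
#     import numpy as np
#     from lib.nms import soft_nms
#     dets = np.array([[0, 0, 10, 10, 0.9],
#                      [1, 1, 11, 11, 0.8]], dtype=np.float32)
#     # method: 0 = hard NMS, 1 = linear decay, 2 = Gaussian score decay
#     keep, new_dets = soft_nms(dets, Nt=0.3, method=2, sigma=0.5)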
-------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basemodel: ResNet 3 | 4 | """ 5 | 6 | import torch 7 | from torchvision.models import resnet 8 | import torch.nn as nn 9 | 10 | Bottleneck = resnet.Bottleneck 11 | 12 | 13 | class ResNet50Stages(nn.Module): 14 | 15 | def __init__(self, pretrained_path): 16 | super(ResNet50Stages, self).__init__() 17 | self.inplanes = 64 18 | self.stages = [3, 4, 6, 3] 19 | self.mid_outputs = [64, 128, 256, 512] 20 | 21 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 22 | bias=False) 23 | self.bn1 = nn.BatchNorm2d(64) 24 | self.relu = nn.ReLU(inplace=True) 25 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 26 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0]) 27 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2) 28 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2) 29 | self.layer4 = self._make_layer(Bottleneck, self.mid_outputs[3], self.stages[3], stride=2) 30 | 31 | # self.load_state_dict(torch.load(pretrained_path)) 32 | self.load_pretrained(pretrained_path) 33 | 34 | def _make_layer(self, block, planes, blocks, stride=1): 35 | downsample = None 36 | if stride != 1 or self.inplanes != planes * block.expansion: 37 | downsample = nn.Sequential( 38 | nn.Conv2d(self.inplanes, planes * block.expansion, 39 | kernel_size=1, stride=stride, bias=False), 40 | nn.BatchNorm2d(planes * block.expansion), 41 | ) 42 | 43 | layers = [] 44 | layers.append(block(self.inplanes, planes, stride, downsample)) 45 | self.inplanes = planes * block.expansion 46 | for i in range(1, blocks): 47 | layers.append(block(self.inplanes, planes)) 48 | 49 | return nn.Sequential(*layers) 50 | 51 | def freeze_bn(self): 52 | pass 53 | 54 | def load_pretrained(self, mpath): 55 | 56 | pretrained_dict = torch.load(mpath) 57 | model_dict = self.state_dict() 58 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 59 | self.load_state_dict(pretrained_dict) 60 | 61 | def forward(self, x): 62 | x = self.conv1(x) 63 | x = self.bn1(x) 64 | x = self.relu(x) 65 | x = self.maxpool(x) 66 | x1 = self.layer1(x) 67 | x2 = self.layer2(x1) 68 | x3 = self.layer3(x2) 69 | x4 = self.layer4(x3) 70 | return [x2, x3, x4] 71 | 72 | 73 | class ResNet50(nn.Module): 74 | 75 | def __init__(self, pretrained_path): 76 | super(ResNet50, self).__init__() 77 | self.layers = ResNet50Stages(pretrained_path) 78 | 79 | def forward(self, x): 80 | return self.layers(x)[-1] 81 | 82 | 83 | class ResNet50C4(nn.Module): 84 | 85 | def __init__(self, pretrained_path): 86 | super(ResNet50C4, self).__init__() 87 | self.inplanes = 64 88 | self.stages = [3, 4, 6] 89 | self.mid_outputs = [64, 128, 256, 512] 90 | 91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 92 | bias=False) 93 | self.bn1 = nn.BatchNorm2d(64) 94 | self.relu = nn.ReLU(inplace=True) 95 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 96 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0]) 97 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2) 98 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2) 99 | 100 | # self.load_state_dict(torch.load(pretrained_path)) 101 | self.load_pretrained(pretrained_path) 102 | 103 | def 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
"""

Training RetinaNet


"""
import os
import tqdm
import argparse
import numpy as np
import torch.nn as nn
import torch.optim as optim
import tensorboardX
from torch.utils.data import DataLoader
from models import retina
from datasets import synthtext, icdar15, minibatch
from lib.det_ops.loss import SigmoidFocalLoss, SmoothL1Loss
from utils import logger
from cfgs import config as cfg


def initialize(config, args):

    logdir = config['logdir']
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    if not os.path.exists(os.path.join(logdir, args.experiment)):
        os.mkdir(os.path.join(logdir, args.experiment))

    model_dump_dir = os.path.join(logdir, args.experiment, 'model_dump')
    tb_dump = os.path.join(logdir, args.experiment, 'tb_dump')

    if not os.path.exists(model_dump_dir):
        os.mkdir(model_dump_dir)

    if not os.path.exists(tb_dump):
        os.mkdir(tb_dump)

    config['tb_dump_dir'] = tb_dump
    config['model_dump_dir'] = model_dump_dir


def learning_rate_decay(optimizer, step, config):
    # Step schedule: 0.1x after the first decay step, 0.01x after the second.
    base_lr = config['base_lr']
    lr = base_lr
    if step >= config['lr_decay'][0]:
        lr = base_lr * 0.1
    if step >= config['lr_decay'][1]:
        lr = base_lr * 0.01

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def train(args, config):
    anchor_scales = config['anchor_sizes']
    anchor_aspect_ratios = config['anchor_aspect_ratios']
    num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)

    model = retina.RetinaNet(config['num_classes'], num_anchors, config['basemodel_path']).cuda()
    model = nn.DataParallel(model, device_ids=list(range(args.device)))

    if args.dataset == 'SynthText':
        train_dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    elif args.dataset == 'ICDAR':
        train_dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    else:
        raise NotImplementedError()

    collate_minibatch = minibatch.create_minibatch_func(config)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size * args.device,
        shuffle=True,
        num_workers=config['workers'],
        collate_fn=collate_minibatch
    )

    writer = tensorboardX.SummaryWriter(config['tb_dump_dir'])

    optimizer = optim.SGD(lr=config['base_lr'], params=model.parameters(),
                          weight_decay=config['weight_decay'], momentum=0.9)

    cls_criterion = SigmoidFocalLoss().cuda()
    box_criterion = SmoothL1Loss().cuda()

    start_epoch = 0
    global_step = 0

    # Resume from a saved checkpoint
    if len(args.continue_path) > 0:
        model_state, optimizer_state, epoch, step = logger.load_checkpoints(args.continue_path)
        model.module.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        global_step = step + 1
        start_epoch = epoch + 1

    for epoch in range(start_epoch, config['epochs']):
        losses = []
        data_iter = iter(train_loader)
        pbar = tqdm.tqdm(range(len(train_loader)))
        for i in pbar:
            img, labels, boxes = next(data_iter)
            img = img.cuda()
            labels = labels.long().cuda()
            boxes = boxes.cuda()
            cls_outputs, bbox_outputs = model(img)
            cls_loss = cls_criterion(cls_outputs, labels)
            box_loss = box_criterion(bbox_outputs, boxes, labels)
            loss = cls_loss + box_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar('train/box_loss', box_loss.item(), global_step)
            writer.add_scalar('train/cls_loss', cls_loss.item(), global_step)
            global_step += 1
            pbar.set_description('e:{} i:{} loss:{:.3f} cls_loss:{:.3f} box_loss:{:.3f}'.format(
                epoch, i + 1, loss.item(), cls_loss.item(), box_loss.item()
            ))
            losses.append(loss.item())

            # learning rate decay
            learning_rate_decay(optimizer, global_step, config)

        print("e:{} loss: {}".format(epoch, np.mean(losses)))
        logger.save_checkpoints(model.module, optimizer, epoch, global_step,
                                path=os.path.join(config['model_dump_dir'],
                                                  'epoch-{}-iter-{}.pth'.format(epoch, global_step)))


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--device', type=int, default=1, help='number of GPUs to train with')
    parser.add_argument('-b', '--batch_size', type=int, default=4, help='training batch size per GPU')
    parser.add_argument('-c', '--continue_path', type=str, default='', help='checkpoint path to continue training from')
    parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
                        help='experiment name, corresponding to an entry in `config.py`')
    parser.add_argument('-i', '--imageset', type=str, default='train', help='image set to train on')
    parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help='dataset')

    _args = parser.parse_args()
    config = cfg.config[_args.experiment]
    initialize(config, _args)
    train(_args, config)
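`cfgs/voc.json` and `cfgs/config.py` are empty in this snapshot, so the exact experiment settings are unknown. Reconstructed from the keys that train.py and test.py actually read, a minimal `synth_baseline` entry might look like the sketch below; every value is illustrative, not the author's.

# cfgs/config.py -- hypothetical minimal entry; keys taken from the code,
# values are placeholders only.
config = {
    'synth_baseline': {
        'logdir': 'logs',
        'data_dir': 'data/SynthText',
        'basemodel_path': 'data/resnet50.pth',
        'workers': 4,
        'num_classes': 2,                     # text vs. background
        'anchor_sizes': [1.0, 1.26, 1.59],    # per-level scale multipliers
        'anchor_aspect_ratios': [0.5, 1.0, 2.0],
        'anchor_areas': [32 * 32, 64 * 64, 128 * 128, 256 * 256, 512 * 512],
        'strides': [8, 16, 32, 64, 128],
        'base_lr': 0.01,
        'lr_decay': [60000, 80000],           # steps for the 0.1x / 0.01x drops
        'weight_decay': 1e-4,
        'epochs': 20,
        'cls_thresh': 0.3,
        'test_nms': 0.3,
        'test_max_boxes': 300,
    }
}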
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
"""

Test scripts

"""
import argparse
import json
import tqdm
import torch
import numpy as np
from lib.det_ops.anchors import compute_anchor_whs, generate_anchors
from lib.bbox import bbox, box_transform
from lib.nms import nms
from utils.logger import load_checkpoints
from models import retina
from datasets import synthtext, icdar15
from cfgs import config as cfg


def inference(model, dataset, anchor_wh, strides, result_file, config):

    model.eval()
    num_samples = len(dataset)
    pbar = tqdm.tqdm(range(num_samples))
    with torch.no_grad():
        for idx in pbar:
            img, im_name, scale, im_size = dataset[idx]
            h, w = img.shape[1], img.shape[2]
            img = img.cuda()
            cls_pred, bbox_pred = model(img.unsqueeze(0))
            scores = cls_pred.sigmoid()
            # bbox [N, 4]
            bbox_pred = bbox_pred[0]
            # cls [N, C]
            scores = scores[0]

            anchors = generate_anchors(anchor_wh, input_size=np.array([h, w]),
                                       strides=strides)
            anchors = anchors.cuda()

            # transform regression outputs back to boxes in the original image
            boxes = box_transform.bbox_transform_inv(anchors, bbox_pred)
            boxes = boxes / scale
            boxes = bbox.clip_boxes(boxes, im_size[0], im_size[1])

            # keep only well-formed boxes (x1 < x2 and y1 < y2)
            valid_inds = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3])
            boxes = boxes[valid_inds]
            scores = scores[valid_inds]

            result_boxes = []
            # for every class:
            # 1. keep anchors whose max detection score falls on this class
            # 2. apply the score threshold
            # 3. run NMS
            # 4. keep the top-k boxes overall
            max_labels = torch.argmax(scores, dim=1)

            for cls in range(config['num_classes'] - 1):

                # filter predictions through the classification threshold
                score = scores[:, cls]
                cls_inds = score > config['cls_thresh']
                # current class has the max score over all classes
                max_inds = max_labels == cls
                cls_inds = max_inds & cls_inds
                if cls_inds.sum() < 1:
                    continue
                # score [K]
                score = score[cls_inds]

                # _boxes [K, 4]
                _boxes = boxes[cls_inds]

                # NMS removes duplicates
                keep = nms(torch.cat([_boxes, score.unsqueeze(1)], 1), config['test_nms'])

                score = score[keep]
                _boxes = _boxes[keep]

                for i in range(_boxes.shape[0]):
                    result_boxes.append((cls, score[i].item(), _boxes[i].cpu().data.numpy().tolist()))

            # keep at most `test_max_boxes` detections per image
            if len(result_boxes) > config['test_max_boxes']:
                result_boxes = sorted(result_boxes, key=lambda x: x[1], reverse=True)
                result_boxes = result_boxes[:config['test_max_boxes']]
            pbar.set_description('im_det:{}/{}'.format(idx, num_samples))

            if len(result_boxes) == 0:
                continue

            result = dict()
            result['image_id'] = im_name
            det = []
            for i in range(len(result_boxes)):
                cls, s, b = result_boxes[i]
                current_det = dict()
                current_det['prob'] = s
                current_det['class'] = cls + 1
                current_det['bbox'] = b
                det.append(current_det)
            result['result'] = det

            with open(result_file, 'a+') as f:
                f.write('{}\n'.format(json.dumps(result)))

    print("Det Finished!")


def validate(args, config):

    anchor_scales = config['anchor_sizes']
    anchor_aspect_ratios = config['anchor_aspect_ratios']
    num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)

    model = retina.RetinaNet(config['num_classes'] - 1, num_anchors, config['basemodel_path']).cuda()

    model_path = args.model_path
    output_file = args.output
    if args.dataset == 'SynthText':
        dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    elif args.dataset == 'ICDAR':
        dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    else:
        raise NotImplementedError()
    state_dict, _, _, _ = load_checkpoints(model_path)
    model.load_state_dict(state_dict)

    anchor_whs = compute_anchor_whs(len(config['strides']), areas=config['anchor_areas'],
                                    aspect_ratios=anchor_aspect_ratios,
                                    sizes=anchor_scales)

    inference(model, dataset, anchor_whs, config['strides'], result_file=output_file, config=config)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', type=str, default='result.det', help='output file path')
    parser.add_argument('-m', '--model_path', type=str, help='saved model path')
    parser.add_argument('-i', '--imageset', type=str, default='val', help='image set to evaluate')
    parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
                        help='experiment name, corresponding to an entry in `config.py`')
    parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help='dataset')
    _args = parser.parse_args()
    config = cfg.config[_args.experiment]
    validate(_args, config)
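`inference()` appends one JSON object per line to the result file, so downstream evaluation only needs a line-by-line reader. A hypothetical sketch (the `read_results` helper is not part of the repository):

# Reads the detection file written above: one JSON object per line,
# each with an 'image_id' and a list of {prob, class, bbox} records.
import json

def read_results(path):
    results = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            results[record['image_id']] = record['result']
    return results

# e.g. dets = read_results('result.det')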
--------------------------------------------------------------------------------
/lib/nms/nms_kernel.cu:
--------------------------------------------------------------------------------
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------

#include <iostream>
#include <vector>
#include <cstring>
#include "gpu_nms.hpp"

#define CUDA_CHECK(condition)                                    \
  /* Code block avoids redefinition of cudaError_t error */      \
  do {                                                           \
    cudaError_t error = condition;                               \
    if (error != cudaSuccess) {                                  \
      std::cout << cudaGetErrorString(error) << std::endl;       \
    }                                                            \
  } while (0)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64

int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // number of bits of a long long variable, times MULTIPLIER

__device__ inline float devIoU(float const* const a, float const* const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f),
        height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float* dev_boxes,
                           unsigned long long* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  unsigned long long ts[MULTIPLIER];

  if (threadIdx.x < row_size) {
#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      ts[i] = 0;
    }
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);

#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
          ts[i];
    }
  }
}

void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
  float* boxes_dev = NULL;
  CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
  return size_t(boxes_dev);
}

void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  if (base > 0) {
    size_t require_mem =
        boxes_num * boxes_dim * sizeof(float) +
        boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
    if (require_mem >= MEMORY_SIZE) {
      std::cout << "require_mem: " << require_mem << std::endl;
    }
    boxes_dev = (float*)(base);
    mask_dev =
        (unsigned long long*)(base +
                              512 * ((unsigned long long)(boxes_num *
                                                          boxes_dim *
                                                          sizeof(float) /
                                                          512) +
                                     1));
  } else {
    CUDA_CHECK(
        cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
                                         sizeof(unsigned long long)));
  }
  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
                                            MULTIPLIER);
  CUDA_CHECK(cudaMemcpy(
      &mask_host[0], mask_dev,
      sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
      cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;
    int offset = inblock / LONGLONG_SIZE;
    int bit_pos = inblock % LONGLONG_SIZE;

    if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
      for (int j = nblock * MULTIPLIER + offset;
           j < col_blocks * MULTIPLIER; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  if (!base) {
    CUDA_CHECK(cudaFree(boxes_dev));
    CUDA_CHECK(cudaFree(mask_dev));
  }
}
--------------------------------------------------------------------------------
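The kernel computes, for each box, a bitmask of the later-indexed boxes it suppresses; the host loop then walks boxes in order and ORs those masks together, so suppression matches greedy NMS when the rows are sorted by descending score. From Python the compiled extension is reached through the `nms` wrapper, with the same [N, 5] (x1, y1, x2, y2, score) layout test.py passes in. A hypothetical smoke test follows (build the extension with `make` in lib/nms first; the tensors are moved to the GPU to mirror test.py's usage):

import torch
from lib.nms import nms

# rows are (x1, y1, x2, y2, score), already sorted by descending score
dets = torch.tensor([
    [10.0, 10.0, 50.0, 50.0, 0.9],
    [12.0, 12.0, 52.0, 52.0, 0.8],       # IoU ~0.83 with the first row
    [100.0, 100.0, 150.0, 150.0, 0.7],   # no overlap with the others
]).cuda()

keep = nms(dets, 0.5)
print(keep)  # expected to keep rows 0 and 2 and suppress row 1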